refr/run-model_8_c_source.html

 // Copyright 2012, Google Inc.

 // All rights reserved.

 //

 // Redistribution and use in source and binary forms, with or without

 // modification, are permitted provided that the following conditions are

 // met:

 //

 //   * Redistributions of source code must retain the above copyright

 //     notice, this list of conditions and the following disclaimer.

 //   * Redistributions in binary form must reproduce the above

 //     copyright notice, this list of conditions and the following disclaimer

 //     in the documentation and/or other materials provided with the

 //     distribution.

 //   * Neither the name of Google Inc. nor the names of its

 //     contributors may be used to endorse or promote products derived from

 //     this software without specific prior written permission.

 //

 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT

 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 // -----------------------------------------------------------------------------

 //

 //


 #include <iostream>

 #include <fstream>

 #include <string>

 #include <cstdlib>

 #include <memory>

 #include <vector>


 #include "candidate.H"

 #include "candidate-set.H"

 #include "candidate-set-reader.H"

 #include "candidate-set-writer.H"

 #include "executive-feature-extractor.H"

 #include "interpreter.H"

 #include "model.H"

 #include "model-merge-reducer.H"

 #include "model-reader.H"

 #include "model-proto-writer.H"

 #include "perceptron-model.H"

 #include "symbol-table.H"


 // Compile-time configuration for this executable.  The DEFAULT_* values are
 // kept as macros (rather than typed constants) because several of them are
 // stringified with XSTR and spliced into the usage message at compile time.

 // Debug switch; set to 0 here.
 #define DEBUG 0


 // Program name used as the prefix of error messages and in the usage text.
 #define PROG_NAME "run-model"


 // Initial value of max_examples, which is handed to the candidate set
 // readers.  NOTE(review): -1 presumably means "no limit" -- confirm against
 // CandidateSetReader.
 #define DEFAULT_MAX_EXAMPLES -1

 // Initial value of max_candidates, handed to the candidate set readers.
 // NOTE(review): -1 presumably means "no limit" -- confirm.
 #define DEFAULT_MAX_CANDIDATES -1

 // Model specification used when no --model-config option is given.
 #define DEFAULT_MODEL_CONFIG "PerceptronModel(name(\"MyPerceptronModel\"))"

 // Initial value of reporting_interval (see the -r option).
 #define DEFAULT_REPORTING_INTERVAL 1000

 // Initial value of compactify_interval (see --compactify-interval).
 #define DEFAULT_COMPACTIFY_INTERVAL 10000

 // Initial value of use_weighted_loss (see --use-weighted-loss).
 #define DEFAULT_USE_WEIGHTED_LOSS true


 // We use two levels of macros to get the string version of an int constant.
 // XSTR expands its argument first and then hands the expansion to STR, so
 // XSTR(DEFAULT_MAX_EXAMPLES) yields "-1" rather than "DEFAULT_MAX_EXAMPLES".

 #define XSTR(arg) STR(arg)

 // Stringifies its argument exactly as written (no macro expansion).
 #define STR(arg) #arg


 using namespace std;

 using namespace reranker;


 // Fragments of the usage message, printed in order by usage().
 //
 // Each array element is one output line.  A single line may be assembled
 // from several adjacent literals (and stringified DEFAULT_* macros) when a
 // default value has to be spliced in; every element ends with a comma so
 // that no two lines are merged by accidental literal concatenation.
 //
 // The option spellings shown here are the ones main() actually accepts.
 const char *usage_msg[] = {
   "Usage:\n",
   PROG_NAME " --config <master config file>\n",
   "\t-m|--model <model file> [--model-config <model config>]\n",
   "\t[-t|--train <training input file>+ [-i <input model file>] [--mapper] ]\n",
   "\t-d|--devtest <devtest input file>+\n",
   "\t[-o|--output <candidate set output file>]\n",
   "\t[-h <hyp output file>] [--scores <score output file>]\n",
   "\t[--train-config <training feature extractor config file>]\n",
   "\t[--dev-config <devtest feature extractor config file>]\n",
   "\t[--compactify-feature-uids]\n",
   "\t[-s|--streaming [--compactify-interval <interval>] ] [-u]\n",
   "\t[--no-base64]\n",
   "\t[--min-epochs <min epochs>] [--max-epochs <max epochs>]\n",
   "\t[--max-examples <max num examples>]\n",
   "\t[--max-candidates <max num candidates>]\n",
   "\t[-r <reporting interval>] [ --use-weighted-loss[=][true|false] ]\n",
   "where\n",
   "\t<master config file> is a file in the interpreted factory language\n",
   "\t\tcapable of specifying all options to this executable (see\n",
   "\t\tconfig/default.infact for an example of the default options)\n",
   "\t<model file> is the name of the file to which to write out a\n",
   "\t\tnewly-trained model when training (one or more\n",
   "\t\t<training input file>'s specified), or the name of a file\n",
   "\t\tfrom which to load a serialized model when decoding\n",
   "\t<input model file> is an optional input model file as a starting\n",
   "\t\tmodel when training\n",
   "\t<model config> is the optional configuration string for constructing\n",
   "\t\ta new Model instance\n",
   "\t\t(defaults to \"" DEFAULT_MODEL_CONFIG "\")\n",
   "\t<training input file> is the name of a stream of serialized\n",
   "\t\tCandidateSet instances, or \"-\" for input from standard input\n",
   "\t--mapper specifies to train a single epoch and output features to\n",
   "\t\tstandard output\n",
   "\t<devtest input file> is the name of a stream of serialized\n",
   "\t\tCandidateSet instances, or \"-\" for input from standard input\n",
   "\t\t(required unless training in mapper mode)\n",
   "\t<candidate set output file> is the name of the file to which to output\n",
   "\t\tcandidate sets that have been scored by the model (in\n",
   "\t\tdecoding mode)\n",
   "\t<training feature extractor config file> is the name of a configuration\n",
   "\t\tfile to be read by the ExecutiveFeatureExtractor instance\n",
   "\t\textracting features on training examples\n",
   "\t<devtest feature extractor config file> is the name of a configuration\n",
   "\t\tfile to be read by the ExecutiveFeatureExtractor instance\n",
   "\t\textracting features on devtest examples\n",
   "\t--compactify-feature-uids specifies to re-map all feature uids to the\n",
   "\t\t[0,n-1] interval, where n is the number of non-zero features\n",
   "\t--streaming specifies to train in streaming mode (i.e., do not\n",
   "\t\tread in all training instances into memory)\n",
   "\t--compactify-interval specifies the interval after which to compactify\n",
   "\t\tfeature uid's and remove unused symbols (only available when\n",
   "\t\ttraining in streaming mode; defaults to "
       XSTR(DEFAULT_COMPACTIFY_INTERVAL) ")\n",
   "\t-u specifies that the input files are uncompressed\n",
   "\t--no-base64 specifies not to use base64 encoding/decoding\n",
   "\t--max-examples specifies the maximum number of examples to read from\n",
   "\t\tany input file (defaults to " XSTR(DEFAULT_MAX_EXAMPLES) ")\n",
   "\t--max-candidates specifies the maximum number of candidates to read\n",
   "\t\tfor any candidate set (defaults to " XSTR(DEFAULT_MAX_CANDIDATES) ")\n",
   "\t-r specifies the interval at which the CandidateSetReader reports how\n",
   "\t\tmany candidate sets it has read (defaults to "
       XSTR(DEFAULT_REPORTING_INTERVAL) ")\n",
   "\t--use-weighted-loss specifies whether to weight losses on devtest\n",
   "\t\texamples by the number of tokens in the reference, where, e.g.,\n",
   "\t\tweighted loss is appropriate for computing WER, but not BLEU\n",
   "\t\t(defaults to " XSTR(DEFAULT_USE_WEIGHTED_LOSS) ")\n"
 };


 void usage() {

   int usage_msg_len = sizeof(usage_msg)/sizeof(const char *);

   for (int i = 0; i < usage_msg_len; ++i) {

     cout << usage_msg[i];

   }

   cout.flush();

 }


 // Checks that the option at index i is followed by at least one more
 // command-line argument.
 //
 // argc     the number of command-line arguments
 // i        the index of the option that requires a value
 // err_msg  the message to report when the value is missing
 //
 // Returns true if argument i + 1 exists; otherwise prints the error message
 // to standard error, prints the usage message and returns false.
 bool check_for_required_arg(int argc, int i, string err_msg) {
   const bool has_value = i + 1 < argc;
   if (!has_value) {
     cerr << PROG_NAME << ": error: " << err_msg << endl;
     usage();
   }
   return has_value;
 }


 void read_and_extract_features(const vector<string> &files,

                                CandidateSetReader &csr,

                                bool compressed,

                                bool use_base64,

                                shared_ptr<ExecutiveFeatureExtractor> efe,

                                vector<shared_ptr<CandidateSet> > &examples) {

   bool reset_counters = true;

   for (vector<string>::const_iterator file_it = files.begin();

        file_it != files.end();

        ++file_it) {

     csr.Read(*file_it, compressed, use_base64, reset_counters, examples);

   }

   if (efe.get() != NULL) {

     // Extract features for CandidateSet instances in situ.

     for (vector<shared_ptr<CandidateSet> >::iterator it = examples.begin();

          it != examples.end();

          ++it) {

       efe->Extract(*(*it));

     }

   }

 }


 // Entry point of the run-model executable.
 //
 // Options are gathered in two passes: first the --config option is located
 // and, if present, the master configuration file is evaluated to seed the
 // option variables; then the remaining command-line options are processed
 // and override anything set by that file.
 //
 // With one or more training files the model is trained (or, in mapper mode,
 // trained for a single epoch) and serialized.  Without training files a
 // serialized model is loaded from the model file, evaluated on the devtest
 // examples, and the requested candidate set, hypothesis and score outputs
 // are written.
 //
 // Returns -1 after reporting a usage error or when the model or examples
 // cannot be obtained, and 0 otherwise.
 int
 main(int argc, char **argv) {
   // Master configuration file.
   string master_config_file;
   // Required parameters.
   string model_file;
   string input_model_file;
   string model_config = DEFAULT_MODEL_CONFIG;
   vector<string> training_files;
   vector<string> devtest_files;
   bool mapper_mode = false;
   string output_file;
   string hyp_output_file;
   string score_output_file;
   string training_feature_extractor_config_file;
   string devtest_feature_extractor_config_file;
   bool compressed = true;
   bool use_base64 = true;
   bool streaming = false;
   bool use_weighted_loss = DEFAULT_USE_WEIGHTED_LOSS;
   string use_weighted_loss_arg_prefix = "--use-weighted-loss";
   size_t use_weighted_loss_arg_prefix_len =
       use_weighted_loss_arg_prefix.length();
   bool compactify_feature_uids = false;
   int compactify_interval = DEFAULT_COMPACTIFY_INTERVAL;
   int min_epochs = -1;
   int max_epochs = -1;
   int max_examples = DEFAULT_MAX_EXAMPLES;
   int max_candidates = DEFAULT_MAX_CANDIDATES;
   int reporting_interval = DEFAULT_REPORTING_INTERVAL;

   shared_ptr<Model> model;
   shared_ptr<ExecutiveFeatureExtractor> training_efe;
   shared_ptr<ExecutiveFeatureExtractor> devtest_efe;

   // Preprocess options, looking for the --config option which specifies
   // a master configuration file.  This file should be used before
   // any other command line options, which may be used to override anything
   // set in the master configuration file.
   int master_config_arg_idx = -1;
   for (int i = 1; i < argc; ++i) {
     string arg = argv[i];
     if (arg == "--config") {
       string err_msg =
           string("no master configuration file specified with ") + arg;
       if (!check_for_required_arg(argc, i, err_msg)) {
         return -1;
       }
       master_config_file = argv[++i];
       // Remember the index of the "--config" token itself so that the main
       // option loop below can skip it together with its value.
       master_config_arg_idx = i - 1;
     }
   }
   if (master_config_file != "") {
     Interpreter i;
     cerr << "Reading options from \"" << master_config_file << "\"." << endl;
     i.Eval(master_config_file);
     // Now, grab all variables that could be set and assign them to local
     // variables.  Note that the Interpreter::Get method only assigns a value
     // to its second argument if it exists in the interpreter's environment.
     i.Get("model_file", &model_file);
     i.Get("model", &model);
     i.Get("mapper_mode", &mapper_mode);
     i.Get("training_files", &training_files);
     i.Get("devtest_files", &devtest_files);
     i.Get("output_file", &output_file);
     i.Get("hyp_output_file", &hyp_output_file);
     i.Get("training_efe", &training_efe);
     i.Get("devtest_efe", &devtest_efe);
     i.Get("compactify_feature_uids", &compactify_feature_uids);
     i.Get("compactify_interval", &compactify_interval);
     i.Get("streaming", &streaming);
     i.Get("compressed", &compressed);
     i.Get("use_base64", &use_base64);
     i.Get("min_epochs", &min_epochs);
     i.Get("max_epochs", &max_epochs);
     i.Get("max_examples", &max_examples);
     i.Get("max_candidates", &max_candidates);
     i.Get("reporting_interval", &reporting_interval);
     i.Get("use_weighted_loss", &use_weighted_loss);
   }

   // Process options.  The majority of code in this file is devoted to this.
   for (int i = 1; i < argc; ++i) {
     if (i == master_config_arg_idx) {
       // Skip "--config" here; the loop increment skips its value.
       ++i;
       continue;
     }
     string arg = argv[i];
     if (arg == "-m" || arg == "-model" || arg == "--model" ||
         arg == "--model-file") {
       // "--model-file" is accepted as well because earlier versions of the
       // usage message advertised that spelling.
       string err_msg = string("no model file specified with ") + arg;
       if (!check_for_required_arg(argc, i, err_msg)) {
         return -1;
       }
       model_file = argv[++i];
     } else if (arg == "-i" || arg == "--i") {
       string err_msg = string("no input model file specified with ") + arg;
       if (!check_for_required_arg(argc, i, err_msg)) {
         return -1;
       }
       input_model_file = argv[++i];
     } else if (arg == "-model-config" || arg == "--model-config") {
       string err_msg =
           string("no model configuration string specified with ") + arg;
       if (!check_for_required_arg(argc, i, err_msg)) {
         return -1;
       }
       model_config = argv[++i];
     } else if (arg == "-t" || arg == "-train" || arg == "--train") {
       string err_msg = string("no input files specified with ") + arg;
       if (!check_for_required_arg(argc, i, err_msg)) {
         return -1;
       }
       // Keep reading args until next option or until no more args.  A lone
       // "-" is a file name (standard input), not an option.
       ++i;
       for ( ; i < argc; ++i) {
         if (argv[i][0] == '-' && argv[i][1] != '\0') {
           --i;
           break;
         }
         training_files.push_back(argv[i]);
       }
     } else if (arg == "-mapper" || arg == "--mapper") {
       mapper_mode = true;
     } else if (arg == "-d" || arg == "-devtest" || arg == "--devtest") {
       string err_msg = string("no input files specified with ") + arg;
       if (!check_for_required_arg(argc, i, err_msg)) {
         return -1;
       }
       // Keep reading args until next option or until no more args.  As with
       // training files, a lone "-" names standard input and is collected
       // as a file rather than treated as the start of the next option.
       ++i;
       for ( ; i < argc; ++i) {
         if (argv[i][0] == '-' && argv[i][1] != '\0') {
           --i;
           break;
         }
         devtest_files.push_back(argv[i]);
       }
     } else if (arg == "-o" || arg == "-output" || arg == "--output") {
       string err_msg = string("no output file specified with ") + arg;
       if (!check_for_required_arg(argc, i, err_msg)) {
         return -1;
       }
       output_file = argv[++i];
     } else if (arg == "-h") {
       string err_msg =
           string("no hypothesis output file specified with ") + arg;
       if (!check_for_required_arg(argc, i, err_msg)) {
         return -1;
       }
       hyp_output_file = argv[++i];
     } else if (arg == "-scores" || arg == "--scores") {
       string err_msg =
           string("no score output file specified with ") + arg;
       if (!check_for_required_arg(argc, i, err_msg)) {
         return -1;
       }
       score_output_file = argv[++i];
     } else if (arg == "-train-config" || arg == "--train-config") {
       string err_msg =
           string("no feature extractor config file specified with ") + arg;
       if (!check_for_required_arg(argc, i, err_msg)) {
         return -1;
       }
       training_feature_extractor_config_file = argv[++i];
     } else if (arg == "-dev-config" || arg == "--dev-config") {
       string err_msg =
           string("no feature extractor config file specified with ") + arg;
       if (!check_for_required_arg(argc, i, err_msg)) {
         return -1;
       }
       devtest_feature_extractor_config_file = argv[++i];
     } else if (arg == "-compactify-feature-uids" ||
                arg == "--compactify-feature-uids") {
       compactify_feature_uids = true;
     } else if (arg == "-s" || arg == "-streaming" || arg == "--streaming") {
       streaming = true;
     } else if (arg == "--compactify-interval") {
       string err_msg = string("no interval specified with ") + arg;
       if (!check_for_required_arg(argc, i, err_msg)) {
         return -1;
       }
       compactify_interval = atoi(argv[++i]);
     } else if (arg == "-u") {
       compressed = false;
     } else if (arg == "--no-base64") {
       use_base64 = false;
     } else if (arg == "-min-epochs" || arg == "--min-epochs") {
       string err_msg = string("no arg specified with ") + arg;
       if (!check_for_required_arg(argc, i, err_msg)) {
         return -1;
       }
       min_epochs = atoi(argv[++i]);
     } else if (arg == "-max-epochs" || arg == "--max-epochs") {
       string err_msg = string("no arg specified with ") + arg;
       if (!check_for_required_arg(argc, i, err_msg)) {
         return -1;
       }
       max_epochs = atoi(argv[++i]);
     } else if (arg == "-max-examples" || arg == "--max-examples") {
       string err_msg = string("no arg specified with ") + arg;
       if (!check_for_required_arg(argc, i, err_msg)) {
         return -1;
       }
       max_examples = atoi(argv[++i]);
     } else if (arg == "-max-candidates" || arg == "--max-candidates") {
       string err_msg = string("no arg specified with ") + arg;
       if (!check_for_required_arg(argc, i, err_msg)) {
         return -1;
       }
       max_candidates = atoi(argv[++i]);
     } else if (arg == "-r") {
       string err_msg = string("no arg specified with ") + arg;
       if (!check_for_required_arg(argc, i, err_msg)) {
         return -1;
       }
       reporting_interval = atoi(argv[++i]);
     } else if (arg.substr(0, use_weighted_loss_arg_prefix_len) ==
                use_weighted_loss_arg_prefix) {
       // The value is either attached with '=' or given as the next argument.
       string use_weighted_loss_str;
       if (arg.length() > use_weighted_loss_arg_prefix_len &&
           arg[use_weighted_loss_arg_prefix_len] == '=') {
         use_weighted_loss_str =
             arg.substr(use_weighted_loss_arg_prefix_len + 1);
       } else {
         string err_msg =
             string("no \"true\" or \"false\" arg specified with ") + arg;
         if (!check_for_required_arg(argc, i, err_msg)) {
           return -1;
         }
         use_weighted_loss_str = argv[++i];
       }
       if (use_weighted_loss_str != "true" &&
           use_weighted_loss_str != "false") {
         cerr << PROG_NAME << ": error: must specify \"true\" or \"false\""
              << " with --use-weighted-loss" << endl;
         usage();
         return -1;
       }
       // Assign in both directions so that "true" on the command line can
       // override a "false" that came from the master configuration file.
       use_weighted_loss = (use_weighted_loss_str == "true");
     } else if (arg.size() > 0 && arg[0] == '-') {
       cerr << PROG_NAME << ": error: unrecognized option: " << arg << endl;
       usage();
       return -1;
     }
   }

   bool training = training_files.size() > 0;

   // Check that user specified required args.
   if (model_file == "") {
     cerr << PROG_NAME << ": error: must specify model file" << endl;
     usage();
     return -1;
   }
   if (!mapper_mode && devtest_files.size() == 0) {
     cerr << PROG_NAME << ": error: must specify devtest input files when "
          << "not in mapper mode" << endl;
     usage();
     return -1;
   }
   if (output_file != "" && training) {
     cerr << PROG_NAME << ": error: cannot specify output file when training"
          << endl;
     usage();
     return -1;
   }
   if (hyp_output_file != "" && training) {
     cerr << PROG_NAME
          << ": error: cannot specify hypothesis output file when training"
          << endl;
     usage();
     return -1;
   }
   bool reading_from_stdin = false;
   for (vector<string>::const_iterator training_file_it = training_files.begin();
        training_file_it != training_files.end();
        ++training_file_it) {
     if (*training_file_it == "-") {
       reading_from_stdin = true;
       break;
     }
   }
   if (training_files.size() > 1 && reading_from_stdin) {
     cerr << PROG_NAME << ": error: cannot read from standard input and "
          << "specify other training files" << endl;
     usage();
     return -1;
   }
   if (!training && input_model_file != "") {
     cerr << PROG_NAME << ": error: can only specify <input model file> "
          << "when in training mode" << endl;
     usage();
     return -1;
   }

   // Now, we finally get to the meat of the code for this executable.
   if (training_feature_extractor_config_file != "") {
     training_efe = ExecutiveFeatureExtractor::InitFromSpec(
         training_feature_extractor_config_file);
   }
   if (devtest_feature_extractor_config_file != "") {
     devtest_efe = ExecutiveFeatureExtractor::InitFromSpec(
         devtest_feature_extractor_config_file);
   }

   CandidateSetReader csr(max_examples, max_candidates, reporting_interval);
   csr.set_verbosity(1);

   Factory<Model> model_factory;

   // NOTE(review): both branches below assign to model, so a "model" value
   // obtained from the master configuration file is always replaced here --
   // confirm that this is intended.
   if (!training || input_model_file != "") {
     // We're here because we're not training, or else we are training and
     // the user specified an input model file.
     string model_file_to_load = training ? input_model_file : model_file;

     ModelReader model_reader(1);
     model = model_reader.Read(model_file_to_load, compressed, use_base64);
   } else {
     // First, see if model_config is the name of a file.
     ifstream model_config_is(model_config.c_str());
     if (model_config_is) {
       cerr << "Reading model config from file \"" << model_config << "\"."
            << endl;
     }

     // Tokenize the file if it could be opened; otherwise treat model_config
     // itself as the specification string.
     StreamTokenizer *st = model_config_is.good() ?
         new StreamTokenizer(model_config_is) :
         new StreamTokenizer(model_config);
     model = model_factory.CreateOrDie(*st);
     delete st;
   }
   if (model.get() == NULL) {
     return -1;
   }

   Factory<ModelProtoWriter> proto_writer_factory;
   shared_ptr<ModelProtoWriter> model_writer =
       proto_writer_factory.CreateOrDie(model->proto_writer_spec(),
                                        "model proto writer");
   if (model_writer.get() == NULL) {
     return -1;
   }

   if (!mapper_mode) {
     model->set_end_of_epoch_hook(new EndOfEpochModelWriter(model_file,
                                                            model_writer,
                                                            compressed,
                                                            use_base64));
   }
   model->set_use_weighted_loss(use_weighted_loss);
   model->set_min_epochs(min_epochs);
   model->set_max_epochs(max_epochs);

   vector<shared_ptr<CandidateSet> > training_examples;
   vector<shared_ptr<CandidateSet> > devtest_examples;
   if (!streaming && !mapper_mode) {
     cerr << "Loading devtest examples." << endl;
     read_and_extract_features(devtest_files, csr, compressed, use_base64,
                               devtest_efe, devtest_examples);
     if (devtest_examples.size() == 0) {
       cerr << "Could not read any devtest examples.  Exiting." << endl;
       return -1;
     }
   }

   typedef CollectionCandidateSetIterator<vector<shared_ptr<CandidateSet> > >
       CandidateSetVectorIt;

   if (training) {
     CandidateSetIterator *training_it = NULL;
     CandidateSetIterator *devtest_it = NULL;

     cerr << "Training." << endl;
     if (streaming) {
       training_it = new MultiFileCandidateSetIterator(training_files,
                                                       training_efe,
                                                       max_examples,
                                                       max_candidates,
                                                       reporting_interval,
                                                       1,
                                                       compressed, use_base64);
       devtest_it = new MultiFileCandidateSetIterator(devtest_files,
                                                      devtest_efe,
                                                      max_examples,
                                                      max_candidates,
                                                      reporting_interval,
                                                      1,
                                                      compressed, use_base64);
       // TODO(dbikel): Make sure to add setter method to Model and
       //               PerceptronModel to tell model to invoke its
       //               CompactifyFeatureUids method after a specified
       //               interval.  This new setter method should only
       //               be invoked here, when in streaming mode.
     } else {
       // Regular, in-memory, non-streaming training.
       read_and_extract_features(training_files, csr, compressed, use_base64,
                                 training_efe, training_examples);
       if (training_examples.size() == 0) {
         cerr << "Could not read any training examples from training files."
              << "  Exiting." << endl;
         return -1;
       }
       csr.ClearStrings();

       training_it = new CandidateSetVectorIt(training_examples);
       devtest_it = new CandidateSetVectorIt(devtest_examples);
     }

     if (mapper_mode) {
       // In mapper mode, train a single epoch, then write out features
       // to stdout, and serialize model.
       model->NewEpoch();
       model->TrainOneEpoch(*training_it);
     } else {
       model->Train(*training_it, *devtest_it);
     }
     // The iterators are no longer needed in either mode; release them here
     // so that mapper mode does not leak them.
     delete training_it;
     delete devtest_it;

     if (compactify_feature_uids) {
       cerr << "Compactifying feature uid's...";
       cerr.flush();
       model->CompactifyFeatureUids();
       cerr << "done." << endl;
     }

     // Serialize model.
     cerr << "Writing out model to file \"" << model_file << "\"...";
     cerr.flush();
     confusion_learning::ModelMessage model_message;
     model_writer->Write(model.get(), &model_message, false);

     ConfusionProtoIO* proto_writer;
     if (mapper_mode) {
       cerr << "Writing ModelMessage (without features) and FeatureMessage "
            << "instances to standard output." << endl;
       proto_writer = new ConfusionProtoIO(model_file, ConfusionProtoIO::WRITESTD,
                                           false, use_base64);
       cout << ModelInfoReducer::kModelMessageFeatureName << "\t";
     } else {
       proto_writer = new ConfusionProtoIO(model_file, ConfusionProtoIO::WRITE,
                                           compressed, use_base64);
     }
     proto_writer->Write(model_message);
     // Write out features.
     bool output_best_epoch = !mapper_mode;
     bool output_key = mapper_mode;
     model_writer->WriteFeatures(model.get(),
                                 *(proto_writer->outputstream()),
                                 output_best_epoch,
                                 model->num_training_errors(),
                                 output_key);
     delete proto_writer;
     cerr << "done." << endl;
   } else {
     CandidateSetVectorIt devtest_examples_it(devtest_examples);
     model->NewEpoch(); // sets epoch to 0
     model->Evaluate(devtest_examples_it);

     if (output_file != "") {
       CandidateSetWriter csw;
       csw.set_verbosity(1);
       csw.Write(devtest_examples, output_file, compressed, use_base64);
     }
     bool output_hyps = hyp_output_file != "";
     bool output_scores = score_output_file != "";
     if (output_hyps || output_scores) {
       // Open only the streams that were requested, and report a file that
       // cannot be opened instead of silently discarding its output.
       ofstream hyp_os;
       ofstream score_os;
       if (output_hyps) {
         hyp_os.open(hyp_output_file.c_str());
         if (!hyp_os) {
           cerr << PROG_NAME << ": error: could not open hypothesis output "
                << "file \"" << hyp_output_file << "\"" << endl;
           output_hyps = false;
         }
       }
       if (output_scores) {
         score_os.open(score_output_file.c_str());
         if (!score_os) {
           cerr << PROG_NAME << ": error: could not open score output "
                << "file \"" << score_output_file << "\"" << endl;
           output_scores = false;
         }
       }
       devtest_examples_it.Reset();
       while (devtest_examples_it.HasNext()) {
         CandidateSet &candidate_set = devtest_examples_it.Next();
         if (output_hyps) {
           // One line per candidate set: the best-scoring candidate.
           hyp_os << candidate_set.GetBestScoring().raw_data() << "\n";
         }
         if (output_scores) {
           // One line per candidate: its score.
           for (CandidateSet::const_iterator cand_it = candidate_set.begin();
                cand_it != candidate_set.end();
                ++cand_it) {
             score_os << (*cand_it)->score() << "\n";
           }
         }
       }
       if (output_hyps) {
         hyp_os.flush();
       }
       if (output_scores) {
         score_os.flush();
       }
     }
   }
   TearDown();
   google::protobuf::ShutdownProtobufLibrary();
   return 0;
 }


perceptron-model.H
Provides the reranker::PerceptronModel reranker class.

reranker::CandidateSetReader::ClearStrings
void ClearStrings()
Invokes CandidateSetProtoReader::ClearStrings on the internal CandidateSetProtoReader instance...
Definition: candidate-set-reader.H:195

usage_msg
const char * usage_msg[]
Definition: run-model.C:82

candidate.H
Provides the reranker::Candidate class for representing a candidate hypothesis from an initial model...

reranker::CandidateSetIterator
An interface specifying iteration over CandidateSet instances, using Java-style semantics (sorry...
Definition: candidate-set-iterator.H:58

model-reader.H
Provides the ModelReader class, which can create Model instances from a file.

reranker::MultiFileCandidateSetIterator
An implementation of the CandidateSetIterator interface that iterates over CandidateSet instances tha...
Definition: candidate-set-iterator.H:112

reranker::StreamTokenizer
A simple class for tokenizing a stream of tokens for the formally specified language used to construc...
Definition: stream-tokenizer.H:87

reranker::ModelReader::Read
shared_ptr< Model > Read(const string &filename, bool compressed, bool use_base64)
Definition: model-reader.H:59

reranker::CandidateSet::GetBestScoring
const Candidate & GetBestScoring() const
Definition: candidate-set.H:108

symbol-table.H
Provides the reranker::Symbols interface as well as the reranker::StaticSymbolTable implementation...

DEFAULT_MODEL_CONFIG
#define DEFAULT_MODEL_CONFIG
Definition: run-model.C:67

reranker::Candidate::raw_data
const string & raw_data() const
Returns the raw data (typically the sentence) for this candidate.
Definition: candidate.H:143

reranker::CandidateSetReader::set_verbosity
void set_verbosity(int verbosity)
Sets the verbosity of this reader (mostly for debugging purposes).
Definition: candidate-set-reader.H:221

reranker::Interpreter::Eval
void Eval(const string &filename)
Evaluates the statements in the specified text file.
Definition: interpreter.H:180

reranker::CandidateSetWriter::Write
void Write(vector< shared_ptr< CandidateSet > > &examples, const string &filename, bool compressed, bool use_base64)
Writes a stream of CandidateSet instances to the specified file or to standard output.
Definition: candidate-set-writer.H:98

read_and_extract_features
void read_and_extract_features(const vector< string > &files, CandidateSetReader &csr, bool compressed, bool use_base64, shared_ptr< ExecutiveFeatureExtractor > efe, vector< shared_ptr< CandidateSet > > &examples)
Definition: run-model.C:171

XSTR
#define XSTR(arg)
Expands the string value of the specified argument using the STR macro.
Definition: run-model.C:75

DEFAULT_REPORTING_INTERVAL
#define DEFAULT_REPORTING_INTERVAL
Definition: run-model.C:68

reranker::EndOfEpochModelWriter
An end-of-epoch hook for writing out the best model so far to file after each epoch (if the best mode...
Definition: model-proto-writer.H:126

reranker::Factory
Factory for dynamically created instance of the specified type.
Definition: factory.H:396

reranker::CandidateSetWriter::set_verbosity
void set_verbosity(int verbosity)
Sets the verbosity of this writer (mostly for debugging purposes).
Definition: candidate-set-writer.H:172

reranker::CandidateSetWriter
A class for writing streams of training or test instances, where each training or test instance is a ...
Definition: candidate-set-writer.H:62

executive-feature-extractor.H
Provides the reranker::ExecutiveFeatureExtractor class.

candidate-set-reader.H
Class for reading streams of training or test instances, where each training or test instance is a re...

model-merge-reducer.H
Reducer classes for trainer.

reranker::TearDown
void TearDown()
A free-floating function (within the reranker namespace) that frees statically allocated objects...
Definition: feature-extractor.C:42

interpreter.H
Provides an interpreter for assigning primitives and Factory-constructible objects to named variables...

reranker::CandidateSet
A class to hold a set of candidates, either for training or test.
Definition: candidate-set.H:62

check_for_required_arg
bool check_for_required_arg(int argc, int i, string err_msg)
Definition: run-model.C:161

reranker::CandidateSetReader::Read
void Read(const string &filename, bool compressed, bool use_base64, bool reset_counters, vector< shared_ptr< CandidateSet > > &examples)
Reads a stream of CandidateSet instances from the specified file or from standard input...
Definition: candidate-set-reader.H:118

reranker::Factory::CreateOrDie
shared_ptr< T > CreateOrDie(StreamTokenizer &st, Environment *env=NULL)
Dynamically creates an object, whose type and initialization are contained in a specification string...
Definition: factory.H:562

DEFAULT_COMPACTIFY_INTERVAL
#define DEFAULT_COMPACTIFY_INTERVAL
Definition: run-model.C:69

reranker::Interpreter
Provides an interpreter for assigning primitives and Factory-constructible objects to named variables...
Definition: interpreter.H:165

DEFAULT_MAX_EXAMPLES
#define DEFAULT_MAX_EXAMPLES
Definition: run-model.C:65

reranker::CandidateSet::const_iterator
vector< shared_ptr< Candidate > >::const_iterator const_iterator
Definition: candidate-set.H:74

usage
void usage()
Definition: run-model.C:153

PROG_NAME
#define PROG_NAME
Definition: run-model.C:63

DEFAULT_USE_WEIGHTED_LOSS
#define DEFAULT_USE_WEIGHTED_LOSS
Definition: run-model.C:70

reranker::CollectionCandidateSetIterator
An implementation of the CandidateSetIterator interface that is backed by an arbitrary C++ collection...
Definition: candidate-set-iterator.H:88

reranker::CandidateSet::begin
const_iterator begin() const
Definition: candidate-set.H:86

main
int main(int argc, char **argv)
Definition: run-model.C:194

reranker::CandidateSetReader
A class for reading streams of training or test instances, where each training or test instance is a ...
Definition: candidate-set-reader.H:62

model-proto-writer.H
Interface for serializer for reranker::Model instances to ModelMessage instances. ...

candidate-set-writer.H
Class for writing streams of training or test instances, where each training or test instance is a re...

reranker::CandidateSet::end
const_iterator end() const
Definition: candidate-set.H:88

candidate-set.H
Class to hold a single training instance for a reranker, which is a set of examples, typically the n-best output of some input process, possibly including a gold-standard feature vector.

DEFAULT_MAX_CANDIDATES
#define DEFAULT_MAX_CANDIDATES
Definition: run-model.C:66

reranker::ModelReader
Knows how to create Model instances that have been serialized to a file.
Definition: model-reader.H:55

model.H
Reranker model interface.

reranker::Interpreter::Get
bool Get(const string &varname, T *value) const
Retrieves the value of the specified variable.
Definition: interpreter.H:217