Reranker Framework (ReFr)
Reranking framework for structure prediction and discriminative language modeling
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
model-combine-shards.C
Go to the documentation of this file.
1 // Copyright 2012, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 // * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 // -----------------------------------------------------------------------------
30 //
31 //
35 
36 #include <cstdio>
37 #include <iostream>
38 #include <string>
39 #include <memory>
40 #include <math.h>
41 #include <unistd.h>
42 #include "../proto/data.pb.h"
43 #include "../proto/dataio.h"
44 #include "../proto/model.pb.h"
45 #include "../utils/kdebug.h"
46 #include "candidate.H"
47 #include "candidate-set.H"
48 #include "candidate-set-iterator.H"
49 #include "candidate-set-reader.H"
50 #include "factory.H"
51 #include "model-merge-reducer.H"
52 #include "model-reader.H"
53 
54 #define DEFAULT_MODEL_PROTO_READER_SPEC "PerceptronModelProtoReader()"
55 
56 using namespace std;
57 using namespace reranker;
58 using confusion_learning::FeatureMessage;
59 using confusion_learning::ModelMessage;
60 
61 int main(int argc, char* argv[]) {
62  int option_char;
63  bool use_integer_feats = false;
64  string output_name;
65  string devtest_filename;
66  int max_examples_to_read = -1;
67 
68  // Invokes member function `int operator ()(void);'
69  while ((option_char = getopt(argc, argv, "Io:d:n:")) != EOF) {
70  switch (option_char) {
71  case 'I':
72  use_integer_feats = true;
73  break;
74  case 'o':
75  output_name = optarg;
76  break;
77  case 'd':
78  devtest_filename = optarg;
79  break;
80  case 'n':
81  max_examples_to_read = atoi(optarg);
82  break;
83  case '?':
84  cerr << "usage: " << argv[0]
85  << " [-E] [-I] [-d <devtest examples>] [-o <output file>]"
86  << endl;
87  cerr << "-E - normalize with the total number of errors" << endl;
88  cerr << "-I - use integer feature id's from proto" << endl;
89  return -1;
90  break;
91  }
92  }
93 
94  ModelMessage model_with_feats;
95  // Process each of the input records. This reducer assumes that the input is
96  // <FeatureId, encoded FeatureMessage> pair per line in the following format:
97  // FeatureString | EncodedMsg \n
98  ConfusionProtoIO reader;
99  ConfusionProtoIO* writer;
100  if (output_name.empty()) {
101  writer = new ConfusionProtoIO("", ConfusionProtoIO::WRITESTD, false, true);
102  } else {
103  writer = new ConfusionProtoIO(output_name, ConfusionProtoIO::WRITE, true,
104  true);
105  }
106  while (cin) {
107  // Process input.
108  string input_data;
109  getline(cin, input_data);
110  if (input_data.empty()) {
111  break;
112  }
113  int delim_pos = input_data.find('\t');
114  string feat_id = input_data.substr(0, delim_pos);
115  string value = input_data.substr(delim_pos + 1);
116 
117  if (feat_id.compare(ModelInfoReducer::kModelMessageFeatureName) == 0) {
118  if (model_with_feats.num_iterations() > 0) {
119  cerr << "Merging in more than one model message." << endl;
120  return -1;
121  }
122  ModelMessage new_model;
123  if (!reader.DecodeBase64(value, &new_model)) {
124  cerr << "Error decoding message: " << value.c_str() << endl;
125  }
126  // Output model message
127  model_with_feats.MergeFrom(new_model);
128  writer->Write(new_model);
129  } else {
130  FeatureMessage* feat =
131  model_with_feats.mutable_raw_parameters()->add_feature();
132  if (!reader.DecodeBase64(value, feat)) {
133  cerr << "Error decoding message: " << value.c_str() << endl;
134  }
135  }
136  }
137  if (model_with_feats.raw_parameters().feature_size() == 0) {
138  cerr << "Empty model, nothing to output." << endl;
139  return -1;
140  }
141  // Normalize the feature values.
142  for (int fix = 0; fix < model_with_feats.raw_parameters().feature_size();
143  ++fix) {
144  FeatureMessage* feat =
145  model_with_feats.mutable_raw_parameters()->mutable_feature(fix);
146  if (!isfinite(feat->value()) || !isfinite(feat->avg_value())) {
147  cerr << "WARNING: feature " << feat->name() << " (ID:"
148  << feat->id() << ") has non-finite value." << endl;
149  } else {
150  if (model_with_feats.training_errors() > 0) {
151  feat->set_value(feat->value() / model_with_feats.training_errors());
152  feat->set_avg_value(feat->avg_value() / model_with_feats.training_errors());
153  if (!isfinite(feat->value()) || !isfinite(feat->avg_value())) {
154  cerr << "WARNING: after error normalization, feature "
155  << feat->name() << " (ID:" << feat->id()
156  << ") has non-finite value." << endl;
157  }
158  }
159  }
160  writer->Write(*feat);
161  }
162  delete writer;
163 
164  double loss = 0.0;
165  if (! devtest_filename.empty()) {
166  // Evaluate model.
167  ModelReader model_reader(1);
168  shared_ptr<Model> model = model_reader.Read(model_with_feats);
169 
170  vector<shared_ptr<CandidateSet> > devtest_examples;
171  CandidateSetReader csr(max_examples_to_read, -1, 1000);
172  csr.set_verbosity(1);
173  csr.Read(devtest_filename, true, true, true, devtest_examples);
175  CandidateSetVectorIt;
176  CandidateSetIterator *devtest_it =
177  new CandidateSetVectorIt(devtest_examples);
178  model->NewEpoch(); // sets epoch to 0
179  model->Evaluate(*devtest_it);
180  loss = model->loss_per_epoch().back();
181  delete devtest_it;
182  }
183  cout << loss << endl;
184 
185  return 0;
186 }
int main(int argc, char *argv[])
Provides the reranker::Candidate class for representing a candidate hypothesis from an initial model...
An interface specifying iteration over CandidateSet instances, using Java-style semantics (sorry...
Provides an interface and some implementations for iterating over CandidateSet instances.
Provides the ModelReader class, which can create Model instances from a file.
shared_ptr< Model > Read(const string &filename, bool compressed, bool use_base64)
Definition: model-reader.H:59
void set_verbosity(int verbosity)
Sets the verbosity of this reader (mostly for debugging purposes).
Class for reading streams of training or test instances, where each training or test instance is a re...
Reducer classes for trainer.
float loss
Definition: hadoop-run.py:389
void Read(const string &filename, bool compressed, bool use_base64, bool reset_counters, vector< shared_ptr< CandidateSet > > &examples)
Reads a stream of CandidateSet instances from the specified file or from standard input...
An implementation of the CandidateSetIterator interface that is backed by an arbitrary C++ collection...
A class for reading streams of training or test instances, where each training or test instance is a ...
Provides a generic dynamic object factory.
Class to hold a single training instance for a reranker, which is a set of examples, typically the n-best output of some input process, possibly including a gold-standard feature vector.
Knows how to create Model instances that have been serialized to a file.
Definition: model-reader.H:55