Reranker Framework (ReFr)
Reranking framework for structure prediction and discriminative language modeling
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
compile-features.C
Go to the documentation of this file.
1 // Copyright 2012, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 // * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 // -----------------------------------------------------------------------------
30 //
31 //
39 
40 #include <string>
41 #include <cstdlib>
42 #include <memory>
43 #include <vector>
44 
45 #include "../proto/dataio.h"
46 #include "candidate-set.H"
47 #include "candidate-set-iterator.H"
48 #include "candidate-set-writer.H"
50 #include "symbol-table.H"
51 
52 #define PROG_NAME "compile-features"
53 
54 #define DEFAULT_MAX_EXAMPLES -1
55 #define DEFAULT_MAX_CANDIDATES -1
56 #define DEFAULT_REPORTING_INTERVAL 1000
57 
58 // We use two levels of macros to get the string version of an int constant.
59 #define XSTR(arg) STR(arg)
60 #define STR(arg) #arg
61 
62 using namespace std;
63 using namespace reranker;
64 using confusion_learning::SymbolMessage;
65 
66 const char *usage_msg[] = {
67  "Usage:\n",
68  PROG_NAME " -i|--input <candidate set input file>+\n",
69  "\t[-d|--decompile]\n",
70  "\t[--input-symbols <input symbol table>]\n",
71  "\t[--clear-raw]\n",
72  "\t[--max-examples <max num examples>]\n",
73  "\t[--max-candidates <max num candidates>]\n",
74  "\t[-r <reporting interval>]\n",
75  "where\n",
76  "\t<candidate set input file> is the name of a stream of serialized\n",
77  "\t\tCandidateSet instances, or \"-\" for input from standard input\n",
78  "\t<input symbol table> is an optional input file containing a Symbols\n",
79  "\t\tinstance serialized as a sequence of Symbol messages\n",
80  "\t-d|--decompile indicates to decompile features\n",
81  "\t--clear-raw specified to clear each Candidate of its raw data string\n",
82  "\t--max-examples specifies the maximum number of examples to read from\n",
83  "\t\tany input file (defaults to " XSTR(DEFAULT_MAX_EXAMPLES) ")\n",
84  "\t--max-candidates specifies the maximum number of candidates to read\n",
85  "\t\tfor any candidate set (defaults to " XSTR(DEFAULT_MAX_CANDIDATES) ")\n",
86  "\t-r specifies the interval at which the CandidateSetReader reports how\n",
87  "\t\tmany candidate sets it has read (defaults to "
89 };
90 
93 void usage() {
94  int usage_msg_len = sizeof(usage_msg)/sizeof(const char *);
95  for (int i = 0; i < usage_msg_len; ++i) {
96  cout << usage_msg[i];
97  }
98  cout.flush();
99 }
100 
101 bool check_for_required_arg(int argc, int i, string err_msg) {
102  if (i + 1 >= argc) {
103  cerr << PROG_NAME << ": error: " << err_msg << endl;
104  usage();
105  return false;
106  } else {
107  return true;
108  }
109 }
110 
111 int
112 main(int argc, char **argv) {
113  // Required parameters.
114  vector<string> input_files;
115  bool compile_or_decompile = false;
116  bool decompile = false;
117  bool clear_raw = false;
118  string symbol_table_input_file = "";
119  int max_examples = DEFAULT_MAX_EXAMPLES;
120  int max_candidates = DEFAULT_MAX_CANDIDATES;
121  int reporting_interval = DEFAULT_REPORTING_INTERVAL;
122 
123  // Process options. The majority of code in this file is devoted to this.
124  for (int i = 1; i < argc; ++i) {
125  string arg = argv[i];
126  if (arg == "-i" || arg == "-input" || arg == "--input") {
127  string err_msg = string("no input files specified with ") + arg;
128  if (!check_for_required_arg(argc, i, err_msg)) {
129  return -1;
130  }
131  // Keep reading args until next option or until no more args.
132  ++i;
133  for ( ; i < argc; ++i) {
134  if (argv[i][0] == '-' && strlen(argv[i]) > 1) {
135  --i;
136  break;
137  }
138  input_files.push_back(argv[i]);
139  }
140  } else if (arg == "-input-symbols" || arg == "--input-symbols") {
141  string err_msg =
142  string("no symbol table input file specified with ") + arg;
143  if (!check_for_required_arg(argc, i, err_msg)) {
144  return -1;
145  }
146  symbol_table_input_file = argv[++i];
147  compile_or_decompile = true;
148  } else if (arg == "-d" || arg == "-decompile" || arg == "--decompile") {
149  decompile = true;
150  } else if (arg == "-clear-raw" || arg == "--clear-raw") {
151  clear_raw = true;
152  } else if (arg == "-max-examples" || arg == "--max-examples") {
153  string err_msg = string("no arg specified with ") + arg;
154  if (!check_for_required_arg(argc, i, err_msg)) {
155  return -1;
156  }
157  max_examples = atoi(argv[++i]);
158  } else if (arg == "-max-candidates" || arg == "--max-candidates") {
159  string err_msg = string("no arg specified with ") + arg;
160  if (!check_for_required_arg(argc, i, err_msg)) {
161  return -1;
162  }
163  max_candidates = atoi(argv[++i]);
164  } else if (arg == "-r") {
165  string err_msg = string("no arg specified with ") + arg;
166  if (!check_for_required_arg(argc, i, err_msg)) {
167  return -1;
168  }
169  reporting_interval = atoi(argv[++i]);
170  } else if (arg.size() > 0 && arg[0] == '-') {
171  cerr << PROG_NAME << ": error: unrecognized option: " << arg << endl;
172  usage();
173  return -1;
174  }
175  }
176 
177  // Check that user specified required args.
178  if (input_files.size() == 0) {
179  cerr << PROG_NAME << ": error: no candidate set input files specified"
180  << endl;
181  usage();
182  return -1;
183  }
184 
185  if (decompile && !compile_or_decompile) {
186  cerr << PROG_NAME << ": error: cannot specify -d|--decompile without "
187  << "--input-symbols" << endl;
188  usage();
189  return -1;
190  }
191 
192  bool compressed = true;
193  bool uncompressed = false;
194  bool use_base64 = true;
195 
196  // Now, we finally get to the meat of the code for this executable.
197  shared_ptr<Symbols> symbols(new LocalSymbolTable());
198  if (symbol_table_input_file != "") {
199  ConfusionProtoIO proto_reader(symbol_table_input_file,
200  ConfusionProtoIO::READ,
201  compressed, use_base64);
202  SymbolMessage symbol_message;
203  while (proto_reader.Read(&symbol_message)) {
204  symbols->SetIndex(symbol_message.symbol(), symbol_message.index());
205  }
206  proto_reader.Close();
207  }
208 
209  CandidateSetWriter csw;
210 
211  if (compile_or_decompile) {
212  csw.Open("-", uncompressed, use_base64);
213  }
214 
215  int verbosity = 1;
216  shared_ptr<ExecutiveFeatureExtractor> null_efe;
217  MultiFileCandidateSetIterator csi(input_files,
218  null_efe,
219  max_examples,
220  max_candidates,
221  reporting_interval,
222  verbosity,
223  compressed,
224  use_base64);
225 
226  while (csi.HasNext()) {
227  CandidateSet &candidate_set = csi.Next();
228  if (decompile) {
229  candidate_set.DecompileFeatures(symbols.get());
230  } else {
231  // Whether we're in "collect symbols" or "compile features" mode, we
232  // invoke CandidateSet::CompileFeatures, because in it collects
233  // symbols in the symbol table by default as well as compiling features.
234  candidate_set.CompileFeatures(symbols.get());
235  }
236  if (clear_raw) {
237  candidate_set.ClearRawData();
238  }
239  if (compile_or_decompile) {
240  csw.WriteNext(candidate_set);
241  }
242  }
243  if (compile_or_decompile) {
244  csw.Close();
245  } else {
246  // If we're in "collect symbols" mode, write out symbols to cout,
247  // one symbol per line (in plain text).
248  for (Symbols::const_iterator it = symbols->begin();
249  it != symbols->end();
250  ++it) {
251  cout << it->first << "\n";
252  }
253  cout.flush();
254  }
255 
256  TearDown();
257  google::protobuf::ShutdownProtobufLibrary();
258 }
void Open(const string &filename, bool compressed, bool use_base64)
#define DEFAULT_MAX_CANDIDATES
Provides an interface and some implementations for iterating over CandidateSet instances.
An implementation of the CandidateSetIterator interface that iterates over CandidateSet instances tha...
int main(int argc, char **argv)
#define DEFAULT_REPORTING_INTERVAL
#define XSTR(arg)
Provides the reranker::Symbols interface as well as the reranker::StaticSymbolTable implementation...
virtual CandidateSet & Next()
Returns the next CandidateSet.
bool CompileFeatures(Symbols *symbols, bool clear_features=false, bool clear_symbolic_features=true, bool force=false)
Compiles any symbolic features in this candidate set.
void ClearRawData()
Clears the raw data for all candidates in this set by setting each to be the empty string...
A symbol table that stores the mapping from symbols to ints and vice versa in local (non-static) dat...
Definition: symbol-table.H:185
void DecompileFeatures(Symbols *symbols, bool clear_symbolic_features=false, bool clear_features=true, bool force=false)
Decompiles any non-symbolic features in the candidates in this candidate set.
A class for writing streams of training or test instances, where each training or test instance is a ...
Provides the reranker::ExecutiveFeatureExtractor class.
unordered_map< string, int >::const_iterator const_iterator
Definition: symbol-table.H:60
void TearDown()
A free-floating function (within the reranker namespace) that frees statically allocated objects...
A class to hold a set of candidates, either for training or test.
Definition: candidate-set.H:62
bool WriteNext(const CandidateSet &candidate_set)
#define PROG_NAME
#define DEFAULT_MAX_EXAMPLES
bool check_for_required_arg(int argc, int i, string err_msg)
void usage()
Emits usage message to standard output.
virtual bool HasNext() const
Returns whether this iterator contains another CandidateSet.
Class for writing streams of training or test instances, where each training or test instance is a re...
Class to hold a single training instance for a reranker, which is a set of examples, typically the n-best output of some input process, possibly including a gold-standard feature vector.
const char * usage_msg[]