Reranker Framework (ReFr)
Reranking framework for structure prediction and discriminative language modeling
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
extract-features.C
Go to the documentation of this file.
1 // Copyright 2012, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 // * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 // -----------------------------------------------------------------------------
30 //
31 //
39 
40 #include <string>
41 #include <cstdlib>
42 #include <memory>
43 #include <vector>
44 
45 #include "../proto/dataio.h"
46 #include "candidate-set.H"
47 #include "candidate-set-iterator.H"
48 #include "candidate-set-writer.H"
50 #include "symbol-table.H"
51 
52 #define PROG_NAME "extract-features"  // Program name shown in the usage text and in error messages.
53 
54 #define DEFAULT_MAX_EXAMPLES -1  // Default for --max-examples (presumably -1 means "no limit" -- confirm against the iterator).
55 #define DEFAULT_MAX_CANDIDATES -1  // Default for --max-candidates (presumably -1 means "no limit" -- confirm against the iterator).
56 #define DEFAULT_REPORTING_INTERVAL 1000  // Default for -r, the reporting interval passed to the reader and writer.
57 
58 // Two levels of macros are needed to stringify the *value* of an integer
58 // constant: XSTR expands its argument first, then STR applies the '#'
58 // operator to the expanded text (e.g. XSTR(DEFAULT_MAX_EXAMPLES) -> "-1").
59 #define XSTR(arg) STR(arg)
60 #define STR(arg) #arg
61 
62 using namespace std;
63 using namespace reranker;
64 using confusion_learning::SymbolMessage;
65 
66 const char *usage_msg[] = {
67  "Usage:\n",
68  PROG_NAME " [-c|--config <feature extractor config file>]\n",
69  "\t-i|--input <candidate set input file>+\n",
70  "\t-o|--output <output directory>\n",
71  "\t[--input-symbols <input symbol table>]\n",
72  "\t[--output-symbols <output symbol table>]\n",
73  "\t[-u] [--no-base64] [--compile] [--clear-raw]\n",
74  "\t[--max-examples <max num examples>]\n",
75  "\t[--max-candidates <max num candidates>]\n",
76  "\t[-r <reporting interval>]\n",
77  "where\n",
78  "\t<feature extractor config file> is the name of a configuration file\n",
79  "\t\tto be read by the ExecutiveFeatureExtractor class\n",
80  "\t<candidate set input file> is the name of a stream of serialized\n",
81  "\t\tCandidateSet instances, or \"-\" for input from standard input\n",
82  "\t<output dirctory> is the directory to output each input file after\n",
83  "\t\textracting features\n",
84  "\t<input symbol table> is an optional input file containing a Symbols\n",
85  "\t\tinstance serialized as a sequence of Symbol messages\n",
86  "\t<output symbol table> is an optional output file to which a Symbols\n",
87  "\t\tinstance will be serialized as a sequence of Symbol messages\n",
88  "\t-u specifies that the input files should be uncompressed (compression\n",
89  "\t\tis used by default)\n",
90  "\t--no-base64 specifies not to use base64 encoding/decoding\n",
91  "\t--compile specifies to compile features after each CandidateSet is read\n",
92  "\t--clear-raw specified to clear each Candidate of its raw data string\n",
93  "\t--max-examples specifies the maximum number of examples to read from\n",
94  "\t\tany input file (defaults to " XSTR(DEFAULT_MAX_EXAMPLES) ")\n",
95  "\t--max-candidates specifies the maximum number of candidates to read\n",
96  "\t\tfor any candidate set (defaults to " XSTR(DEFAULT_MAX_CANDIDATES) ")\n",
97  "\t-r specifies the interval at which the CandidateSetReader reports how\n",
98  "\t\tmany candidate sets it has read (defaults to "
100 };
101 
104 void usage() {
105  int usage_msg_len = sizeof(usage_msg)/sizeof(const char *);
106  for (int i = 0; i < usage_msg_len; ++i) {
107  cout << usage_msg[i];
108  }
109  cout.flush();
110 }
111 
112 bool check_for_required_arg(int argc, int i, string err_msg) {
113  if (i + 1 >= argc) {
114  cerr << PROG_NAME << ": error: " << err_msg << endl;
115  usage();
116  return false;
117  } else {
118  return true;
119  }
120 }
121 
122 int
123 main(int argc, char **argv) {
124  // Required parameters.
125  bool do_feature_extraction = false;
126  string feature_extractor_config_file;
127  vector<string> input_files;
128  bool compressed = true;
129  bool use_base64 = true;
130  bool compile = false;
131  bool clear_raw = false;
132  string output_dir;
133  string symbol_table_input_file = "";
134  string symbol_table_output_file = "";
135  int max_examples = DEFAULT_MAX_EXAMPLES;
136  int max_candidates = DEFAULT_MAX_CANDIDATES;
137  int reporting_interval = DEFAULT_REPORTING_INTERVAL;
138 
139  // Process options. The majority of code in this file is devoted to this.
140  for (int i = 1; i < argc; ++i) {
141  string arg = argv[i];
142  if (arg == "-c" || arg == "-config" || arg == "--config") {
143  string err_msg =
144  string("no feature extractor config file specified with ") + arg;
145  if (!check_for_required_arg(argc, i, err_msg)) {
146  return -1;
147  }
148  do_feature_extraction = true;
149  feature_extractor_config_file = argv[++i];
150  } else if (arg == "-i" || arg == "-input" || arg == "--input") {
151  string err_msg = string("no input files specified with ") + arg;
152  if (!check_for_required_arg(argc, i, err_msg)) {
153  return -1;
154  }
155  // Keep reading args until next option or until no more args.
156  ++i;
157  for ( ; i < argc; ++i) {
158  if (argv[i][0] == '-') {
159  --i;
160  break;
161  }
162  input_files.push_back(argv[i]);
163  }
164  } else if (arg == "-o" || arg == "-output" || arg == "--output") {
165  string err_msg = string("no output directory specified with ") + arg;
166  if (!check_for_required_arg(argc, i, err_msg)) {
167  return -1;
168  }
169  output_dir = argv[++i];
170  // Remove final slash, if present.
171  if (output_dir.size() > 0 && output_dir[output_dir.size() - 1] == '/') {
172  output_dir = output_dir.substr(0, output_dir.size() - 1);
173  }
174  } else if (arg == "-input-symbols" || arg == "--input-symbols") {
175  string err_msg =
176  string("no symbol table input file specified with ") + arg;
177  if (!check_for_required_arg(argc, i, err_msg)) {
178  return -1;
179  }
180  symbol_table_input_file = argv[++i];
181  } else if (arg == "-output-symbols" || arg == "--output-symbols") {
182  string err_msg =
183  string("no symbol table output file specified with ") + arg;
184  if (!check_for_required_arg(argc, i, err_msg)) {
185  return -1;
186  }
187  symbol_table_output_file = argv[++i];
188  } else if (arg == "-u") {
189  compressed = false;
190  } else if (arg == "--no-base64") {
191  use_base64 = false;
192  } else if (arg == "-compile" || arg == "--compile") {
193  compile = true;
194  } else if (arg == "-clear-raw" || arg == "--clear-raw") {
195  clear_raw = true;
196  } else if (arg == "-max-examples" || arg == "--max-examples") {
197  string err_msg = string("no arg specified with ") + arg;
198  if (!check_for_required_arg(argc, i, err_msg)) {
199  return -1;
200  }
201  max_examples = atoi(argv[++i]);
202  } else if (arg == "-max-candidates" || arg == "--max-candidates") {
203  string err_msg = string("no arg specified with ") + arg;
204  if (!check_for_required_arg(argc, i, err_msg)) {
205  return -1;
206  }
207  max_candidates = atoi(argv[++i]);
208  } else if (arg == "-r") {
209  string err_msg = string("no arg specified with ") + arg;
210  if (!check_for_required_arg(argc, i, err_msg)) {
211  return -1;
212  }
213  reporting_interval = atoi(argv[++i]);
214  } else if (arg.size() > 0 && arg[0] == '-') {
215  cerr << PROG_NAME << ": error: unrecognized option: " << arg << endl;
216  usage();
217  return -1;
218  }
219  }
220 
221  // Check that user specified required args.
222  if (input_files.size() == 0) {
223  cerr << PROG_NAME << ": error: no candidate set input files specified"
224  << endl;
225  usage();
226  return -1;
227  }
228 
229  if (output_dir == "") {
230  cerr << PROG_NAME << ": error: no output directory specified" << endl;
231  usage();
232  return -1;
233  }
234 
235  // Now, we finally get to the meat of the code for this executable.
236  shared_ptr<Symbols> symbols;
237  if (symbol_table_input_file != "") {
238  ConfusionProtoIO proto_reader(symbol_table_input_file,
239  ConfusionProtoIO::READ,
240  compressed, use_base64);
241  SymbolMessage symbol_message;
242  while (proto_reader.Read(&symbol_message)) {
243  symbols->SetIndex(symbol_message.symbol(), symbol_message.index());
244  }
245  proto_reader.Close();
246  } else {
247  symbols = shared_ptr<Symbols>(new LocalSymbolTable());
248  }
249 
250  shared_ptr<ExecutiveFeatureExtractor> efe;
251  if (do_feature_extraction) {
252  efe =
253  ExecutiveFeatureExtractor::InitFromSpec(feature_extractor_config_file);
254  }
255 
256  int verbosity = 1;
257  MultiFileCandidateSetIterator csi(input_files,
258  efe,
259  max_examples,
260  max_candidates,
261  reporting_interval,
262  verbosity,
263  compressed,
264  use_base64);
265 
266  // Set things up for streaming output.
267  CandidateSetWriter csw(reporting_interval);
268  csw.set_verbosity(1);
269  string input_file("");
270 
271  while (csi.HasNext()) {
272  if (csi.curr_file() != input_file) {
273  if (input_file != "") {
274  csw.Close();
275  }
276  input_file = csi.curr_file();
277  size_t slash_idx = input_file.find_last_of("/");
278  string tail =
279  input_file.substr(slash_idx != string::npos ? slash_idx + 1 : 0);
280  string output_file = output_dir + "/" + tail;
281  csw.Reset();
282  csw.Open(output_file, compressed, use_base64);
283  }
284  CandidateSet &candidate_set = csi.Next();
285  if (compile) {
286  candidate_set.CompileFeatures(symbols.get());
287  }
288  if (clear_raw) {
289  candidate_set.ClearRawData();
290  }
291  bool success = csw.WriteNext(candidate_set);
292  if (!success) {
293  cerr << "Uh-oh! Couldn't write " << candidate_set.reference_string() << endl;
294  }
295  }
296  csw.Close();
297 
298  // Finally, output a symbol table if user specified one.
299  if (symbol_table_output_file != "") {
300  cerr << "Writing out Symbol protocol buffer messages to file \""
301  << symbol_table_output_file << "\"." << endl;
302  ConfusionProtoIO proto_writer(symbol_table_output_file,
303  ConfusionProtoIO::WRITE,
304  compressed, use_base64);
305  for (Symbols::const_iterator it = symbols->begin();
306  it != symbols->end();
307  ++it) {
308  SymbolMessage symbol_message;
309  symbol_message.set_symbol(it->first);
310  symbol_message.set_index(it->second);
311  proto_writer.Write(symbol_message);
312  }
313  proto_writer.Close();
314  }
315 
316  TearDown();
317  google::protobuf::ShutdownProtobufLibrary();
318 }
void Open(const string &filename, bool compressed, bool use_base64)
const string & reference_string() const
#define DEFAULT_REPORTING_INTERVAL
Provides an interface and some implementations for iterating over CandidateSet instances.
An implementation of the CandidateSetIterator interface that iterates over CandidateSet instances tha...
Provides the reranker::Symbols interface as well as the reranker::StaticSymbolTable implementation...
bool check_for_required_arg(int argc, int i, string err_msg)
virtual CandidateSet & Next()
Returns the next CandidateSet.
bool CompileFeatures(Symbols *symbols, bool clear_features=false, bool clear_symbolic_features=true, bool force=false)
Compiles any symbolic features in this candidate set.
void ClearRawData()
Clears the raw data for all candidates in this set by setting each to be the empty string...
A symbol table that stores the mapping from symbols to int’s and vice versa in local (non-static) dat...
Definition: symbol-table.H:185
#define DEFAULT_MAX_CANDIDATES
#define XSTR(arg)
void set_verbosity(int verbosity)
Sets the verbosity of this writer (mostly for debugging purposes).
A class for writing streams of training or test instances, where each training or test instance is a ...
Provides the reranker::ExecutiveFeatureExtractor class.
unordered_map< string, int >::const_iterator const_iterator
Definition: symbol-table.H:60
void Reset()
Resets this writer so that its internal count of the number of CandidateSet’s written goes back to ze...
void TearDown()
A free-floating function (within the reranker namespace) that frees statically allocated objects...
A class to hold a set of candidates, either for training or test.
Definition: candidate-set.H:62
bool WriteNext(const CandidateSet &candidate_set)
virtual bool HasNext() const
Returns whether this iterator contains another CandidateSet.
void usage()
const char * usage_msg[]
Class for writing streams of training or test instances, where each training or test instance is a re...
Class to hold a single training instance for a reranker, which is a set of examples, typically the n-best output of some input process, possibly including a gold-standard feature vector.
#define DEFAULT_MAX_EXAMPLES
#define PROG_NAME
int main(int argc, char **argv)