45 #include "../proto/dataio.h"
52 #define PROG_NAME "extract-features"
54 #define DEFAULT_MAX_EXAMPLES -1
55 #define DEFAULT_MAX_CANDIDATES -1
56 #define DEFAULT_REPORTING_INTERVAL 1000
59 #define XSTR(arg) STR(arg)
63 using namespace reranker;
64 using confusion_learning::SymbolMessage;
68 PROG_NAME " [-c|--config <feature extractor config file>]\n",
69 "\t-i|--input <candidate set input file>+\n",
70 "\t-o|--output <output directory>\n",
71 "\t[--input-symbols <input symbol table>]\n",
72 "\t[--output-symbols <output symbol table>]\n",
73 "\t[-u] [--no-base64] [--compile] [--clear-raw]\n",
74 "\t[--max-examples <max num examples>]\n",
75 "\t[--max-candidates <max num candidates>]\n",
76 "\t[-r <reporting interval>]\n",
78 "\t<feature extractor config file> is the name of a configuration file\n",
79 "\t\tto be read by the ExecutiveFeatureExtractor class\n",
80 "\t<candidate set input file> is the name of a stream of serialized\n",
81 "\t\tCandidateSet instances, or \"-\" for input from standard input\n",
82 "\t<output dirctory> is the directory to output each input file after\n",
83 "\t\textracting features\n",
84 "\t<input symbol table> is an optional input file containing a Symbols\n",
85 "\t\tinstance serialized as a sequence of Symbol messages\n",
86 "\t<output symbol table> is an optional output file to which a Symbols\n",
87 "\t\tinstance will be serialized as a sequence of Symbol messages\n",
88 "\t-u specifies that the input files should be uncompressed (compression\n",
89 "\t\tis used by default)\n",
90 "\t--no-base64 specifies not to use base64 encoding/decoding\n",
91 "\t--compile specifies to compile features after each CandidateSet is read\n",
92 "\t--clear-raw specified to clear each Candidate of its raw data string\n",
93 "\t--max-examples specifies the maximum number of examples to read from\n",
95 "\t--max-candidates specifies the maximum number of candidates to read\n",
97 "\t-r specifies the interval at which the CandidateSetReader reports how\n",
98 "\t\tmany candidate sets it has read (defaults to "
105 int usage_msg_len =
sizeof(
usage_msg)/
sizeof(
const char *);
106 for (
int i = 0; i < usage_msg_len; ++i) {
114 cerr <<
PROG_NAME <<
": error: " << err_msg << endl;
125 bool do_feature_extraction =
false;
126 string feature_extractor_config_file;
127 vector<string> input_files;
128 bool compressed =
true;
129 bool use_base64 =
true;
130 bool compile =
false;
131 bool clear_raw =
false;
133 string symbol_table_input_file =
"";
134 string symbol_table_output_file =
"";
140 for (
int i = 1; i < argc; ++i) {
141 string arg = argv[i];
142 if (arg ==
"-c" || arg ==
"-config" || arg ==
"--config") {
144 string(
"no feature extractor config file specified with ") + arg;
148 do_feature_extraction =
true;
149 feature_extractor_config_file = argv[++i];
150 }
else if (arg ==
"-i" || arg ==
"-input" || arg ==
"--input") {
151 string err_msg = string(
"no input files specified with ") + arg;
157 for ( ; i < argc; ++i) {
158 if (argv[i][0] ==
'-') {
162 input_files.push_back(argv[i]);
164 }
else if (arg ==
"-o" || arg ==
"-output" || arg ==
"--output") {
165 string err_msg = string(
"no output directory specified with ") + arg;
169 output_dir = argv[++i];
171 if (output_dir.size() > 0 && output_dir[output_dir.size() - 1] ==
'/') {
172 output_dir = output_dir.substr(0, output_dir.size() - 1);
174 }
else if (arg ==
"-input-symbols" || arg ==
"--input-symbols") {
176 string(
"no symbol table input file specified with ") + arg;
180 symbol_table_input_file = argv[++i];
181 }
else if (arg ==
"-output-symbols" || arg ==
"--output-symbols") {
183 string(
"no symbol table output file specified with ") + arg;
187 symbol_table_output_file = argv[++i];
188 }
else if (arg ==
"-u") {
190 }
else if (arg ==
"--no-base64") {
192 }
else if (arg ==
"-compile" || arg ==
"--compile") {
194 }
else if (arg ==
"-clear-raw" || arg ==
"--clear-raw") {
196 }
else if (arg ==
"-max-examples" || arg ==
"--max-examples") {
197 string err_msg = string(
"no arg specified with ") + arg;
201 max_examples = atoi(argv[++i]);
202 }
else if (arg ==
"-max-candidates" || arg ==
"--max-candidates") {
203 string err_msg = string(
"no arg specified with ") + arg;
207 max_candidates = atoi(argv[++i]);
208 }
else if (arg ==
"-r") {
209 string err_msg = string(
"no arg specified with ") + arg;
213 reporting_interval = atoi(argv[++i]);
214 }
else if (arg.size() > 0 && arg[0] ==
'-') {
215 cerr <<
PROG_NAME <<
": error: unrecognized option: " << arg << endl;
222 if (input_files.size() == 0) {
223 cerr <<
PROG_NAME <<
": error: no candidate set input files specified"
229 if (output_dir ==
"") {
230 cerr <<
PROG_NAME <<
": error: no output directory specified" << endl;
236 shared_ptr<Symbols> symbols;
237 if (symbol_table_input_file !=
"") {
238 ConfusionProtoIO proto_reader(symbol_table_input_file,
239 ConfusionProtoIO::READ,
240 compressed, use_base64);
241 SymbolMessage symbol_message;
242 while (proto_reader.Read(&symbol_message)) {
243 symbols->SetIndex(symbol_message.symbol(), symbol_message.index());
245 proto_reader.Close();
250 shared_ptr<ExecutiveFeatureExtractor> efe;
251 if (do_feature_extraction) {
253 ExecutiveFeatureExtractor::InitFromSpec(feature_extractor_config_file);
269 string input_file(
"");
273 if (input_file !=
"") {
277 size_t slash_idx = input_file.find_last_of(
"/");
279 input_file.substr(slash_idx != string::npos ? slash_idx + 1 : 0);
280 string output_file = output_dir +
"/" + tail;
282 csw.
Open(output_file, compressed, use_base64);
291 bool success = csw.
WriteNext(candidate_set);
293 cerr <<
"Uh-oh! Couldn't write " << candidate_set.
reference_string() << endl;
299 if (symbol_table_output_file !=
"") {
300 cerr <<
"Writing out Symbol protocol buffer messages to file \""
301 << symbol_table_output_file <<
"\"." << endl;
302 ConfusionProtoIO proto_writer(symbol_table_output_file,
303 ConfusionProtoIO::WRITE,
304 compressed, use_base64);
306 it != symbols->end();
308 SymbolMessage symbol_message;
309 symbol_message.set_symbol(it->first);
310 symbol_message.set_index(it->second);
311 proto_writer.Write(symbol_message);
313 proto_writer.Close();
317 google::protobuf::ShutdownProtobufLibrary();
void Open(const string &filename, bool compressed, bool use_base64)
const string & reference_string() const
Provides an interface and some implementations for iterating over CandidateSet instances.
An implementation of the CandidateSetIterator interface that iterates over CandidateSet instances tha...
Provides the reranker::Symbols interface as well as the reranker::StaticSymbolTable implementation...
virtual CandidateSet & Next()
Returns the next CandidateSet.
bool CompileFeatures(Symbols *symbols, bool clear_features=false, bool clear_symbolic_features=true, bool force=false)
Compiles any symbolic features in this candidate set.
void ClearRawData()
Clears the raw data for all candidates in this set by setting each to be the empty string...
A symbol table that stores the mapping from symbols to int’s and vice versa in local (non-static) dat...
void set_verbosity(int verbosity)
Sets the verbosity of this writer (mostly for debugging purposes).
const string curr_file() const
A class for writing streams of training or test instances, where each training or test instance is a ...
unordered_map< string, int >::const_iterator const_iterator
void Reset()
Resets this writer so that its internal count of the number of CandidateSet’s written goes back to ze...
void TearDown()
A free-floating function (within the reranker namespace) that frees statically allocated objects...
A class to hold a set of candidates, either for training or test.
bool WriteNext(const CandidateSet &candidate_set)
virtual bool HasNext() const
Returns whether this iterator contains another CandidateSet.
Class for writing streams of training or test instances, where each training or test instance is a re...
Class to hold a single training instance for a reranker, which is a set of examples, typically the n-best output of some input process, possibly including a gold-standard feature vector.