45 #include "../proto/dataio.h"
52 #define PROG_NAME "compile-features"
54 #define DEFAULT_MAX_EXAMPLES -1
55 #define DEFAULT_MAX_CANDIDATES -1
56 #define DEFAULT_REPORTING_INTERVAL 1000
59 #define XSTR(arg) STR(arg)
63 using namespace reranker;
64 using confusion_learning::SymbolMessage;
68 PROG_NAME " -i|--input <candidate set input file>+\n",
69 "\t[-d|--decompile]\n",
70 "\t[--input-symbols <input symbol table>]\n",
72 "\t[--max-examples <max num examples>]\n",
73 "\t[--max-candidates <max num candidates>]\n",
74 "\t[-r <reporting interval>]\n",
76 "\t<candidate set input file> is the name of a stream of serialized\n",
77 "\t\tCandidateSet instances, or \"-\" for input from standard input\n",
78 "\t<input symbol table> is an optional input file containing a Symbols\n",
79 "\t\tinstance serialized as a sequence of Symbol messages\n",
80 "\t-d|--decompile indicates to decompile features\n",
81 "\t--clear-raw specified to clear each Candidate of its raw data string\n",
82 "\t--max-examples specifies the maximum number of examples to read from\n",
84 "\t--max-candidates specifies the maximum number of candidates to read\n",
86 "\t-r specifies the interval at which the CandidateSetReader reports how\n",
87 "\t\tmany candidate sets it has read (defaults to "
94 int usage_msg_len =
sizeof(
usage_msg)/
sizeof(
const char *);
95 for (
int i = 0; i < usage_msg_len; ++i) {
103 cerr <<
PROG_NAME <<
": error: " << err_msg << endl;
114 vector<string> input_files;
115 bool compile_or_decompile =
false;
116 bool decompile =
false;
117 bool clear_raw =
false;
118 string symbol_table_input_file =
"";
124 for (
int i = 1; i < argc; ++i) {
125 string arg = argv[i];
126 if (arg ==
"-i" || arg ==
"-input" || arg ==
"--input") {
127 string err_msg = string(
"no input files specified with ") + arg;
133 for ( ; i < argc; ++i) {
134 if (argv[i][0] ==
'-' && strlen(argv[i]) > 1) {
138 input_files.push_back(argv[i]);
140 }
else if (arg ==
"-input-symbols" || arg ==
"--input-symbols") {
142 string(
"no symbol table input file specified with ") + arg;
146 symbol_table_input_file = argv[++i];
147 compile_or_decompile =
true;
148 }
else if (arg ==
"-d" || arg ==
"-decompile" || arg ==
"--decompile") {
150 }
else if (arg ==
"-clear-raw" || arg ==
"--clear-raw") {
152 }
else if (arg ==
"-max-examples" || arg ==
"--max-examples") {
153 string err_msg = string(
"no arg specified with ") + arg;
157 max_examples = atoi(argv[++i]);
158 }
else if (arg ==
"-max-candidates" || arg ==
"--max-candidates") {
159 string err_msg = string(
"no arg specified with ") + arg;
163 max_candidates = atoi(argv[++i]);
164 }
else if (arg ==
"-r") {
165 string err_msg = string(
"no arg specified with ") + arg;
169 reporting_interval = atoi(argv[++i]);
170 }
else if (arg.size() > 0 && arg[0] ==
'-') {
171 cerr <<
PROG_NAME <<
": error: unrecognized option: " << arg << endl;
178 if (input_files.size() == 0) {
179 cerr <<
PROG_NAME <<
": error: no candidate set input files specified"
185 if (decompile && !compile_or_decompile) {
186 cerr <<
PROG_NAME <<
": error: cannot specify -d|--decompile without "
187 <<
"--input-symbols" << endl;
192 bool compressed =
true;
193 bool uncompressed =
false;
194 bool use_base64 =
true;
198 if (symbol_table_input_file !=
"") {
199 ConfusionProtoIO proto_reader(symbol_table_input_file,
200 ConfusionProtoIO::READ,
201 compressed, use_base64);
202 SymbolMessage symbol_message;
203 while (proto_reader.Read(&symbol_message)) {
204 symbols->SetIndex(symbol_message.symbol(), symbol_message.index());
206 proto_reader.Close();
211 if (compile_or_decompile) {
212 csw.
Open(
"-", uncompressed, use_base64);
216 shared_ptr<ExecutiveFeatureExtractor> null_efe;
239 if (compile_or_decompile) {
243 if (compile_or_decompile) {
249 it != symbols->end();
251 cout << it->first <<
"\n";
257 google::protobuf::ShutdownProtobufLibrary();
void Open(const string &filename, bool compressed, bool use_base64)
#define DEFAULT_MAX_CANDIDATES
Provides an interface and some implementations for iterating over CandidateSet instances.
An implementation of the CandidateSetIterator interface that iterates over CandidateSet instances tha...
int main(int argc, char **argv)
#define DEFAULT_REPORTING_INTERVAL
Provides the reranker::Symbols interface as well as the reranker::StaticSymbolTable implementation...
virtual CandidateSet & Next()
Returns the next CandidateSet.
bool CompileFeatures(Symbols *symbols, bool clear_features=false, bool clear_symbolic_features=true, bool force=false)
Compiles any symbolic features in this candidate set.
void ClearRawData()
Clears the raw data for all candidates in this set by setting each to be the empty string...
A symbol table that stores the mapping from symbols to ints and vice versa in local (non-static) dat...
void DecompileFeatures(Symbols *symbols, bool clear_symbolic_features=false, bool clear_features=true, bool force=false)
Decompiles any non-symbolic features in the candidates in this candidate set.
A class for writing streams of training or test instances, where each training or test instance is a ...
unordered_map< string, int >::const_iterator const_iterator
void TearDown()
A free-floating function (within the reranker namespace) that frees statically allocated objects...
A class to hold a set of candidates, either for training or test.
bool WriteNext(const CandidateSet &candidate_set)
#define DEFAULT_MAX_EXAMPLES
bool check_for_required_arg(int argc, int i, string err_msg)
void usage()
Emits usage message to standard output.
virtual bool HasNext() const
Returns whether this iterator contains another CandidateSet.
Class for writing streams of training or test instances, where each training or test instance is a re...
Class to hold a single training instance for a reranker, which is a set of examples, typically the n-best output of some input process, possibly including a gold-standard feature vector.