45 #include "../proto/dataio.h" 
   52 #define PROG_NAME "compile-features" 
   54 #define DEFAULT_MAX_EXAMPLES -1 
   55 #define DEFAULT_MAX_CANDIDATES -1 
   56 #define DEFAULT_REPORTING_INTERVAL 1000 
   59 #define XSTR(arg) STR(arg) 
   63 using namespace reranker;
 
   64 using confusion_learning::SymbolMessage;
 
   68   PROG_NAME " -i|--input <candidate set input file>+\n",
 
   69   "\t[-d|--decompile]\n",
 
   70   "\t[--input-symbols <input symbol table>]\n",
 
   72   "\t[--max-examples <max num examples>]\n",
 
   73   "\t[--max-candidates <max num candidates>]\n",
 
   74   "\t[-r <reporting interval>]\n",
 
   76   "\t<candidate set input file> is the name of a stream of serialized\n",
 
   77   "\t\tCandidateSet instances, or \"-\" for input from standard input\n",
 
   78   "\t<input symbol table> is an optional input file containing a Symbols\n",
 
   79   "\t\tinstance serialized as a sequence of Symbol messages\n",
 
   80   "\t-d|--decompile indicates to decompile features\n",
 
   81   "\t--clear-raw specified to clear each Candidate of its raw data string\n",
 
   82   "\t--max-examples specifies the maximum number of examples to read from\n",
 
   84   "\t--max-candidates specifies the maximum number of candidates to read\n",
 
   86   "\t-r specifies the interval at which the CandidateSetReader reports how\n",
 
   87   "\t\tmany candidate sets it has read (defaults to " 
   94   int usage_msg_len = 
sizeof(
usage_msg)/
sizeof(
const char *);
 
   95   for (
int i = 0; i < usage_msg_len; ++i) {
 
  103     cerr << 
PROG_NAME << 
": error: " << err_msg << endl;
 
  114   vector<string> input_files;
 
  115   bool compile_or_decompile = 
false;
 
  116   bool decompile = 
false;
 
  117   bool clear_raw = 
false;
 
  118   string symbol_table_input_file = 
"";
 
  124   for (
int i = 1; i < argc; ++i) {
 
  125     string arg = argv[i];
 
  126     if (arg == 
"-i" || arg == 
"-input" || arg == 
"--input") {
 
  127       string err_msg = string(
"no input files specified with ") + arg;
 
  133       for ( ; i < argc; ++i) {
 
  134         if (argv[i][0] == 
'-' && strlen(argv[i]) > 1) {
 
  138         input_files.push_back(argv[i]);
 
  140     } 
else if (arg == 
"-input-symbols" || arg == 
"--input-symbols") {
 
  142           string(
"no symbol table input file specified with ") + arg;
 
  146       symbol_table_input_file = argv[++i];
 
  147       compile_or_decompile = 
true;
 
  148     } 
else if (arg == 
"-d" || arg == 
"-decompile" || arg == 
"--decompile") {
 
  150     } 
else if (arg == 
"-clear-raw" || arg == 
"--clear-raw") {
 
  152     } 
else if (arg == 
"-max-examples" || arg == 
"--max-examples") {
 
  153       string err_msg = string(
"no arg specified with ") + arg;
 
  157       max_examples = atoi(argv[++i]);
 
  158     } 
else if (arg == 
"-max-candidates" || arg == 
"--max-candidates") {
 
  159       string err_msg = string(
"no arg specified with ") + arg;
 
  163       max_candidates = atoi(argv[++i]);
 
  164     } 
else if (arg == 
"-r") {
 
  165       string err_msg = string(
"no arg specified with ") + arg;
 
  169       reporting_interval = atoi(argv[++i]);
 
  170     } 
else if (arg.size() > 0 && arg[0] == 
'-') {
 
  171       cerr << 
PROG_NAME << 
": error: unrecognized option: " << arg << endl;
 
  178   if (input_files.size() == 0) {
 
  179     cerr << 
PROG_NAME << 
": error: no candidate set input files specified" 
  185   if (decompile && !compile_or_decompile) {
 
  186     cerr << 
PROG_NAME << 
": error: cannot specify -d|--decompile without " 
  187          << 
"--input-symbols" << endl;
 
  192   bool compressed = 
true;
 
  193   bool uncompressed = 
false;
 
  194   bool use_base64 = 
true;
 
  198   if (symbol_table_input_file != 
"") {
 
  199     ConfusionProtoIO proto_reader(symbol_table_input_file,
 
  200                                   ConfusionProtoIO::READ,
 
  201                                   compressed, use_base64);
 
  202     SymbolMessage symbol_message;
 
  203     while (proto_reader.Read(&symbol_message)) {
 
  204       symbols->SetIndex(symbol_message.symbol(), symbol_message.index());
 
  206     proto_reader.Close();
 
  211   if (compile_or_decompile) {
 
  212     csw.
Open(
"-", uncompressed, use_base64);
 
  216   shared_ptr<ExecutiveFeatureExtractor> null_efe;
 
  239     if (compile_or_decompile) {
 
  243   if (compile_or_decompile) {
 
  249          it != symbols->end();
 
  251       cout << it->first << 
"\n";
 
  257   google::protobuf::ShutdownProtobufLibrary();
 
void Open(const string &filename, bool compressed, bool use_base64)
 
#define DEFAULT_MAX_CANDIDATES
 
Provides an interface and some implementations for iterating over CandidateSet instances. 
 
An implementation of the CandidateSetIterator interface that iterates over CandidateSet instances tha...
 
int main(int argc, char **argv)
 
#define DEFAULT_REPORTING_INTERVAL
 
Provides the reranker::Symbols interface as well as the reranker::StaticSymbolTable implementation...
 
virtual CandidateSet & Next()
Returns the next CandidateSet. 
 
bool CompileFeatures(Symbols *symbols, bool clear_features=false, bool clear_symbolic_features=true, bool force=false)
Compiles any symbolic features in this candidate set. 
 
void ClearRawData()
Clears the raw data for all candidates in this set by setting each to be the empty string...
 
A symbol table that stores the mapping from symbols to ints and vice versa in local (non-static) dat...
 
void DecompileFeatures(Symbols *symbols, bool clear_symbolic_features=false, bool clear_features=true, bool force=false)
Decompiles any non-symbolic features in the candidates in this candidate set. 
 
A class for writing streams of training or test instances, where each training or test instance is a ...
 
unordered_map< string, int >::const_iterator const_iterator
 
void TearDown()
A free-floating function (within the reranker namespace) that frees statically allocated objects...
 
A class to hold a set of candidates, either for training or test. 
 
bool WriteNext(const CandidateSet &candidate_set)
 
#define DEFAULT_MAX_EXAMPLES
 
bool check_for_required_arg(int argc, int i, string err_msg)
 
void usage()
Emits usage message to standard output. 
 
virtual bool HasNext() const 
Returns whether this iterator contains another CandidateSet. 
 
Class for writing streams of training or test instances, where each training or test instance is a re...
 
Class to hold a single training instance for a reranker, which is a set of examples, typically the n-best output of some input process, possibly including a gold-standard feature vector.