45 #include "../proto/dataio.h" 
   52 #define PROG_NAME "extract-features" 
   54 #define DEFAULT_MAX_EXAMPLES -1 
   55 #define DEFAULT_MAX_CANDIDATES -1 
   56 #define DEFAULT_REPORTING_INTERVAL 1000 
   59 #define XSTR(arg) STR(arg) 
   63 using namespace reranker;
 
   64 using confusion_learning::SymbolMessage;
 
   68   PROG_NAME " [-c|--config <feature extractor config file>]\n",
 
   69   "\t-i|--input <candidate set input file>+\n",
 
   70   "\t-o|--output <output directory>\n",
 
   71   "\t[--input-symbols <input symbol table>]\n",
 
   72   "\t[--output-symbols <output symbol table>]\n",
 
   73   "\t[-u] [--no-base64] [--compile] [--clear-raw]\n",
 
   74   "\t[--max-examples <max num examples>]\n",
 
   75   "\t[--max-candidates <max num candidates>]\n",
 
   76   "\t[-r <reporting interval>]\n",
 
   78   "\t<feature extractor config file> is the name of a configuration file\n",
 
   79   "\t\tto be read by the ExecutiveFeatureExtractor class\n",
 
   80   "\t<candidate set input file> is the name of a stream of serialized\n",
 
   81   "\t\tCandidateSet instances, or \"-\" for input from standard input\n",
 
   82   "\t<output dirctory> is the directory to output each input file after\n",
 
   83   "\t\textracting features\n",
 
   84   "\t<input symbol table> is an optional input file containing a Symbols\n",
 
   85   "\t\tinstance serialized as a sequence of Symbol messages\n",
 
   86   "\t<output symbol table> is an optional output file to which a Symbols\n",
 
   87   "\t\tinstance will be serialized as a sequence of Symbol messages\n",
 
   88   "\t-u specifies that the input files should be uncompressed (compression\n",
 
   89   "\t\tis used by default)\n",
 
   90   "\t--no-base64 specifies not to use base64 encoding/decoding\n",
 
   91   "\t--compile specifies to compile features after each CandidateSet is read\n",
 
   92   "\t--clear-raw specified to clear each Candidate of its raw data string\n",
 
   93   "\t--max-examples specifies the maximum number of examples to read from\n",
 
   95   "\t--max-candidates specifies the maximum number of candidates to read\n",
 
   97   "\t-r specifies the interval at which the CandidateSetReader reports how\n",
 
   98   "\t\tmany candidate sets it has read (defaults to " 
  105   int usage_msg_len = 
sizeof(
usage_msg)/
sizeof(
const char *);
 
  106   for (
int i = 0; i < usage_msg_len; ++i) {
 
  114     cerr << 
PROG_NAME << 
": error: " << err_msg << endl;
 
  125   bool do_feature_extraction = 
false;
 
  126   string feature_extractor_config_file;
 
  127   vector<string> input_files;
 
  128   bool compressed = 
true;
 
  129   bool use_base64 = 
true;
 
  130   bool compile = 
false;
 
  131   bool clear_raw = 
false;
 
  133   string symbol_table_input_file = 
"";
 
  134   string symbol_table_output_file = 
"";
 
  140   for (
int i = 1; i < argc; ++i) {
 
  141     string arg = argv[i];
 
  142     if (arg == 
"-c" || arg == 
"-config" || arg == 
"--config") {
 
  144           string(
"no feature extractor config file specified with ") + arg;
 
  148       do_feature_extraction = 
true;
 
  149       feature_extractor_config_file = argv[++i];
 
  150     } 
else if (arg == 
"-i" || arg == 
"-input" || arg == 
"--input") {
 
  151       string err_msg = string(
"no input files specified with ") + arg;
 
  157       for ( ; i < argc; ++i) {
 
  158         if (argv[i][0] == 
'-') {
 
  162         input_files.push_back(argv[i]);
 
  164     } 
else if (arg == 
"-o" || arg == 
"-output" || arg == 
"--output") {
 
  165       string err_msg = string(
"no output directory specified with ") + arg;
 
  169       output_dir = argv[++i];
 
  171       if (output_dir.size() > 0 && output_dir[output_dir.size() - 1] == 
'/') {
 
  172         output_dir = output_dir.substr(0, output_dir.size() - 1);
 
  174     } 
else if (arg == 
"-input-symbols" || arg == 
"--input-symbols") {
 
  176           string(
"no symbol table input file specified with ") + arg;
 
  180       symbol_table_input_file = argv[++i];      
 
  181     } 
else if (arg == 
"-output-symbols" || arg == 
"--output-symbols") {
 
  183           string(
"no symbol table output file specified with ") + arg;
 
  187       symbol_table_output_file = argv[++i];      
 
  188     } 
else if (arg == 
"-u") {
 
  190     } 
else if (arg == 
"--no-base64") {
 
  192     } 
else if (arg == 
"-compile" || arg == 
"--compile") {
 
  194     } 
else if (arg == 
"-clear-raw" || arg == 
"--clear-raw") {
 
  196     } 
else if (arg == 
"-max-examples" || arg == 
"--max-examples") {
 
  197       string err_msg = string(
"no arg specified with ") + arg;
 
  201       max_examples = atoi(argv[++i]);
 
  202     } 
else if (arg == 
"-max-candidates" || arg == 
"--max-candidates") {
 
  203       string err_msg = string(
"no arg specified with ") + arg;
 
  207       max_candidates = atoi(argv[++i]);
 
  208     } 
else if (arg == 
"-r") {
 
  209       string err_msg = string(
"no arg specified with ") + arg;
 
  213       reporting_interval = atoi(argv[++i]);
 
  214     } 
else if (arg.size() > 0 && arg[0] == 
'-') {
 
  215       cerr << 
PROG_NAME << 
": error: unrecognized option: " << arg << endl;
 
  222   if (input_files.size() == 0) {
 
  223     cerr << 
PROG_NAME << 
": error: no candidate set input files specified" 
  229   if (output_dir == 
"") {
 
  230     cerr << 
PROG_NAME << 
": error: no output directory specified" << endl;
 
  236   shared_ptr<Symbols> symbols;
 
  237   if (symbol_table_input_file != 
"") {
 
  238     ConfusionProtoIO proto_reader(symbol_table_input_file,
 
  239                                   ConfusionProtoIO::READ,
 
  240                                   compressed, use_base64);
 
  241     SymbolMessage symbol_message;
 
  242     while (proto_reader.Read(&symbol_message)) {
 
  243       symbols->SetIndex(symbol_message.symbol(), symbol_message.index());
 
  245     proto_reader.Close();
 
  250   shared_ptr<ExecutiveFeatureExtractor> efe;
 
  251   if (do_feature_extraction) {
 
  253         ExecutiveFeatureExtractor::InitFromSpec(feature_extractor_config_file);
 
  269   string input_file(
"");
 
  273       if (input_file != 
"") {
 
  277       size_t slash_idx = input_file.find_last_of(
"/");
 
  279           input_file.substr(slash_idx != string::npos ? slash_idx + 1 : 0);
 
  280       string output_file = output_dir + 
"/" + tail;
 
  282       csw.
Open(output_file, compressed, use_base64);
 
  291     bool success = csw.
WriteNext(candidate_set);
 
  293       cerr << 
"Uh-oh! Couldn't write " << candidate_set.
reference_string() << endl;
 
  299   if (symbol_table_output_file != 
"") {
 
  300     cerr << 
"Writing out Symbol protocol buffer messages to file \"" 
  301          << symbol_table_output_file << 
"\"." << endl;
 
  302     ConfusionProtoIO proto_writer(symbol_table_output_file,
 
  303                                   ConfusionProtoIO::WRITE,
 
  304                                   compressed, use_base64);
 
  306          it != symbols->end();
 
  308       SymbolMessage symbol_message;
 
  309       symbol_message.set_symbol(it->first);
 
  310       symbol_message.set_index(it->second);
 
  311       proto_writer.Write(symbol_message);
 
  313     proto_writer.Close();
 
  317   google::protobuf::ShutdownProtobufLibrary();
 
void Open(const string &filename, bool compressed, bool use_base64)
 
const string & reference_string() const 
 
Provides an interface and some implementations for iterating over CandidateSet  instances. 
 
An implementation of the CandidateSetIterator interface that iterates over CandidateSet instances tha...
 
Provides the reranker::Symbols interface as well as the reranker::StaticSymbolTable implementation...
 
virtual CandidateSet & Next()
Returns the next CandidateSet. 
 
bool CompileFeatures(Symbols *symbols, bool clear_features=false, bool clear_symbolic_features=true, bool force=false)
Compiles any symbolic features in this candidate set. 
 
void ClearRawData()
Clears the raw data for all candidates in this set by setting each to be the empty string...
 
A symbol table that stores the mapping from symbols to ints and vice versa in local (non-static) dat...
 
void set_verbosity(int verbosity)
Sets the verbosity of this writer (mostly for debugging purposes). 
 
const string curr_file() const 
 
A class for writing streams of training or test instances, where each training or test instance is a ...
 
unordered_map< string, int >::const_iterator const_iterator
 
void Reset()
Resets this writer so that its internal count of the number of CandidateSet instances written goes back to ze...
 
void TearDown()
A free-floating function (within the reranker namespace) that frees statically allocated objects...
 
A class to hold a set of candidates, either for training or test. 
 
bool WriteNext(const CandidateSet &candidate_set)
 
virtual bool HasNext() const 
Returns whether this iterator contains another CandidateSet. 
 
Class for writing streams of training or test instances, where each training or test instance is a re...
 
Class to hold a single training instance for a reranker, which is a set of examples, typically the n-best output of some input process, possibly including a gold-standard feature vector.