37 #ifndef RERANKER_CANDIDATE_SET_READER_H_
38 #define RERANKER_CANDIDATE_SET_READER_H_
46 #include "../proto/dataio.h"
48 #define DEFAULT_READER_REPORTING_INTERVAL 100
56 using std::shared_ptr;
68 max_candidates_per_set_(-1),
70 num_read_from_file_(0),
73 reporting_interval_(reporting_interval) { }
75 int max_candidates_per_set,
76 long reporting_interval =
78 max_num_to_read_(max_num_to_read),
79 max_candidates_per_set_(max_candidates_per_set),
81 num_read_from_file_(0),
84 reporting_interval_(reporting_interval) { }
// Opens the named input for subsequent ReadNext calls.
//
// filename:       path of the file to read; the name "-" selects standard
//                 input.
// compressed:     forwarded to ConfusionProtoIO; ignored (forced to false)
//                 when reading from standard input.
// use_base64:     forwarded unchanged to ConfusionProtoIO.
// reset_counters: defaults to true. NOTE(review): its use is not visible in
//                 this excerpt; presumably it triggers Reset() -- confirm.
87 void Open(
const string &filename,
bool compressed,
bool use_base64,
88 bool reset_counters =
true) {
// Announce the file being opened when verbosity is at least 1.
92 if (verbosity_ >= 1) {
93 cerr <<
"CandidateSetReader: reading from file \"" << filename
// A filename of "-" means "read from standard input".
96 bool reading_from_stdin = filename ==
"-";
// Standard input uses READSTD mode; a regular file uses READ mode.
97 ConfusionProtoIO::Mode mode =
98 reading_from_stdin ? ConfusionProtoIO::READSTD : ConfusionProtoIO::READ;
// The caller's compression flag is overridden to false for standard input.
99 compressed = reading_from_stdin ?
false : compressed;
// NOTE(review): reader_ is a raw pointer assigned from `new`; no release of a
// previously held reader is visible here -- confirm it is deleted elsewhere
// before Open is called a second time.
100 reader_ =
new ConfusionProtoIO(filename, mode, compressed, use_base64);
// Remember the name for later log messages and restart the per-file count.
101 filename_ = filename;
102 num_read_from_file_ = 0;
// Opens the named input via Open and then calls ReadNext in a loop, appending
// the CandidateSet instances it returns to `examples`.
//
// The loop runs while ReadNext reports the underlying reader as valid.
// NOTE(review): the statements executed when ReadNext returns a null pointer
// are not visible in this excerpt; presumably the loop stops there -- confirm.
118 void Read(
const string &filename,
122 vector<shared_ptr<CandidateSet> > &examples) {
123 Open(filename, compressed, use_base64, reset_counters);
// ReadNext updates this flag through its reference parameter.
125 bool reader_valid =
true;
126 while (reader_valid) {
127 shared_ptr<CandidateSet> candidate_set =
ReadNext(reader_valid);
// A null pointer from ReadNext means no candidate set was produced.
128 if (candidate_set.get() == NULL) {
131 examples.push_back(candidate_set);
// Reads one CandidateSetMessage from reader_ and converts it into a newly
// allocated CandidateSet.
//
// reader_valid: output flag; set to the result of reader_->Read.
// Returns the new CandidateSet, or an empty shared_ptr when the read limit
// has been reached (and on a second path whose guard is not visible here).
136 shared_ptr<CandidateSet>
ReadNext(
bool &reader_valid) {
// Stop once the configured maximum number of reads has been reached.
137 if (num_read_ == max_num_to_read_) {
138 return shared_ptr<CandidateSet>();
// Pull the next serialized message from the underlying reader.
141 confusion_learning::CandidateSetMessage tmp_msg;
142 reader_valid = reader_->Read(&tmp_msg);
// At verbosity 3 or higher, dump the raw message for debugging.
144 if (verbosity_ >= 3) {
145 cerr <<
"CandidateSetReader: most recent CandidateSetMessage: "
146 << tmp_msg.Utf8DebugString();
// NOTE(review): the condition guarding this empty return is not visible in
// this excerpt; presumably it covers a failed read -- confirm.
149 return shared_ptr<CandidateSet>();
// Convert the message into a CandidateSet, passing the per-set candidate
// limit to the proto reader.
152 shared_ptr<CandidateSet> candidate_set(
new CandidateSet());
153 candidate_set_proto_reader_.
Read(tmp_msg, max_candidates_per_set_,
// At verbosity 2 or higher, print the freshly built candidate set.
156 if (verbosity_ >= 2) {
157 cerr <<
"CandidateSetReader: candidate set prior to "
158 <<
"feature compilation:" << endl << *(candidate_set);
// Count this candidate set against the currently open file.
162 ++num_read_from_file_;
// Emit a progress message when interval_read_ reaches the reporting interval
// (only at verbosity 1 or higher).
166 if (interval_read_ == reporting_interval_) {
167 if (verbosity_ >= 1) {
168 cerr <<
"CandidateSetReader: read " << num_read_
169 <<
" candidate sets." << endl;
173 return candidate_set;
177 if (verbosity_ >= 1) {
178 cerr <<
"CandidateSetReader: read " << num_read_from_file_
179 <<
" candidate sets from file \"" << filename_ <<
"\". Closing file."
// Underlying protocol-buffer reader; Open assigns it a `new ConfusionProtoIO`
// and ReadNext reads messages through it.
225 ConfusionProtoIO *reader_;
// ReadNext returns an empty pointer once num_read_ equals this value.
228 int max_num_to_read_;
// Passed to candidate_set_proto_reader_.Read for every message converted.
229 int max_candidates_per_set_;
// Candidate sets read from the current file; zeroed by Open and incremented
// by ReadNext.
231 long num_read_from_file_;
// NOTE(review): presumably the value returned by total_num_read(); its
// updates are not visible in this excerpt -- confirm.
232 long total_num_read_;
// ReadNext logs progress when interval_read_ equals this value.
234 long reporting_interval_;
void ClearStrings()
Invokes CandidateSetProtoReader::ClearStrings on the internal CandidateSetProtoReader instance...
void Read(const CandidateSetMessage &m, CandidateSet &set)
Fills in the specified CandidateSet based on the specified CandidateSetMessage, crucially constructin...
void set_verbosity(int verbosity)
Sets the verbosity of this reader (mostly for debugging purposes).
virtual ~CandidateSetReader()
CandidateSetReader(long reporting_interval=100)
Constructs a new instance.
#define DEFAULT_READER_REPORTING_INTERVAL
A class to hold a set of candidates, either for training or test.
void Read(const string &filename, bool compressed, bool use_base64, bool reset_counters, vector< shared_ptr< CandidateSet > > &examples)
Reads a stream of CandidateSet instances from the specified file or from standard input...
void Reset()
Resets this reader so that its internal count of the number of CandidateSet instances read goes back to zero...
Reads CandidateSetMessage instances and converts them to reranker::CandidateSet instances.
shared_ptr< CandidateSet > ReadNext(bool &reader_valid)
A class to fill in a CandidateSet based on a CandidateSetMessage, crucially constructing new Candidat...
CandidateSetReader(int max_num_to_read, int max_candidates_per_set, long reporting_interval=100)
long total_num_read()
Returns the total number of CandidateSet instances read.
long num_read()
Returns the number of CandidateSet instances read since the last invocation of the Reset method...
A class for reading streams of training or test instances, where each training or test instance is a ...
Class to hold a single training instance for a reranker, which is a set of examples, typically the n-best output of some input process, possibly including a gold-standard feature vector.
void Open(const string &filename, bool compressed, bool use_base64, bool reset_counters=true)