Reranker Framework (ReFr)
Reranking framework for structure prediction and discriminative language modeling
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
candidate-set-reader.H
Go to the documentation of this file.
1 // Copyright 2012, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 // * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 // -----------------------------------------------------------------------------
30 //
31 //
36 
37 #ifndef RERANKER_CANDIDATE_SET_READER_H_
38 #define RERANKER_CANDIDATE_SET_READER_H_
39 
40 #include <iostream>
41 #include <memory>
42 #include <vector>
43 
44 #include "candidate-set.H"
46 #include "../proto/dataio.h"
47 
48 #define DEFAULT_READER_REPORTING_INTERVAL 100
49 
50 namespace reranker {
51 
52 using std::cerr;
53 using std::endl;
54 using std::string;
55 using std::vector;
56 using std::shared_ptr;
57 
63  public:
65  CandidateSetReader(long reporting_interval =
67  max_num_to_read_(-1),
68  max_candidates_per_set_(-1),
69  num_read_(0),
70  num_read_from_file_(0),
71  total_num_read_(0),
72  interval_read_(0),
73  reporting_interval_(reporting_interval) { }
74  CandidateSetReader(int max_num_to_read,
75  int max_candidates_per_set,
76  long reporting_interval =
78  max_num_to_read_(max_num_to_read),
79  max_candidates_per_set_(max_candidates_per_set),
80  num_read_(0),
81  num_read_from_file_(0),
82  total_num_read_(0),
83  interval_read_(0),
84  reporting_interval_(reporting_interval) { }
85  virtual ~CandidateSetReader() { }
86 
87  void Open(const string &filename, bool compressed, bool use_base64,
88  bool reset_counters = true) {
89  if (reset_counters) {
90  Reset();
91  }
92  if (verbosity_ >= 1) {
93  cerr << "CandidateSetReader: reading from file \"" << filename
94  << "\"." << endl;
95  }
96  bool reading_from_stdin = filename == "-";
97  ConfusionProtoIO::Mode mode =
98  reading_from_stdin ? ConfusionProtoIO::READSTD : ConfusionProtoIO::READ;
99  compressed = reading_from_stdin ? false : compressed;
100  reader_ = new ConfusionProtoIO(filename, mode, compressed, use_base64);
101  filename_ = filename;
102  num_read_from_file_ = 0;
103  }
104 
118  void Read(const string &filename,
119  bool compressed,
120  bool use_base64,
121  bool reset_counters,
122  vector<shared_ptr<CandidateSet> > &examples) {
123  Open(filename, compressed, use_base64, reset_counters);
124 
125  bool reader_valid = true;
126  while (reader_valid) {
127  shared_ptr<CandidateSet> candidate_set = ReadNext(reader_valid);
128  if (candidate_set.get() == NULL) {
129  break;
130  }
131  examples.push_back(candidate_set);
132  }
133  Close();
134  }
135 
136  shared_ptr<CandidateSet> ReadNext(bool &reader_valid) {
137  if (num_read_ == max_num_to_read_) {
138  return shared_ptr<CandidateSet>();
139  }
140  // First, de-serialize next CandidateSetMessage from stream.
141  confusion_learning::CandidateSetMessage tmp_msg;
142  reader_valid = reader_->Read(&tmp_msg);
143  if (reader_valid) {
144  if (verbosity_ >= 3) {
145  cerr << "CandidateSetReader: most recent CandidateSetMessage: "
146  << tmp_msg.Utf8DebugString();
147  }
148  } else {
149  return shared_ptr<CandidateSet>();
150  }
151 
152  shared_ptr<CandidateSet> candidate_set(new CandidateSet());
153  candidate_set_proto_reader_.Read(tmp_msg, max_candidates_per_set_,
154  *candidate_set);
155 
156  if (verbosity_ >= 2) {
157  cerr << "CandidateSetReader: candidate set prior to "
158  << "feature compilation:" << endl << *(candidate_set);
159  }
160 
161  ++num_read_;
162  ++num_read_from_file_;
163  ++total_num_read_;
164  ++interval_read_;
165 
166  if (interval_read_ == reporting_interval_) {
167  if (verbosity_ >= 1) {
168  cerr << "CandidateSetReader: read " << num_read_
169  << " candidate sets." << endl;
170  }
171  interval_read_ = 0;
172  }
173  return candidate_set;
174  }
175 
176  void Close() {
177  if (verbosity_ >= 1) {
178  cerr << "CandidateSetReader: read " << num_read_from_file_
179  << " candidate sets from file \"" << filename_ << "\". Closing file."
180  << endl;
181  }
182  reader_->Close();
183  delete reader_;
184  }
185 
188  void Reset() {
189  num_read_ = 0;
190  interval_read_ = 0;
191  }
192 
195  void ClearStrings() {
196  candidate_set_proto_reader_.ClearStrings();
197  }
198 
199  // accessors
200 
203  long num_read() { return num_read_; }
204 
206  long total_num_read() { return total_num_read_; }
207 
208  // mutators
209 
221  void set_verbosity(int verbosity) { verbosity_ = verbosity; }
222 
223  private:
224  // data members
225  ConfusionProtoIO *reader_;  // stream reader; allocated in Open(), deleted in Close()
226  CandidateSetProtoReader candidate_set_proto_reader_;  // fills a CandidateSet from each message in ReadNext()
227  string filename_;  // name passed to the most recent Open(); used in Close()'s log message
228  int max_num_to_read_;  // ReadNext() returns an empty pointer once num_read_ equals this
229  int max_candidates_per_set_;  // forwarded to candidate_set_proto_reader_.Read()
230  long num_read_;  // candidate sets read since the last Reset()
231  long num_read_from_file_;  // candidate sets read since the last Open()
232  long total_num_read_;  // candidate sets read overall; never zeroed after construction
233  long interval_read_;  // candidate sets read since the last progress message or Reset()
234  long reporting_interval_;  // a progress message is due when interval_read_ reaches this
235  int verbosity_;  // logging level; assigned by set_verbosity()
236 };
237 
238 } // namespace reranker
239 
240 #endif
void ClearStrings()
Invokes CandidateSetProtoReader::ClearStrings on the internal CandidateSetProtoReader instance...
void Read(const CandidateSetMessage &m, CandidateSet &set)
Fills in the specified CandidateSet based on the specified CandidateSetMessage, crucially constructin...
void set_verbosity(int verbosity)
Sets the verbosity of this reader (mostly for debugging purposes).
CandidateSetReader(long reporting_interval=100)
Constructs a new instance.
#define DEFAULT_READER_REPORTING_INTERVAL
A class to hold a set of candidates, either for training or test.
Definition: candidate-set.H:62
void Read(const string &filename, bool compressed, bool use_base64, bool reset_counters, vector< shared_ptr< CandidateSet > > &examples)
Reads a stream of CandidateSet instances from the specified file or from standard input...
void Reset()
Resets this reader so that its internal count of the number of CandidateSet instances read goes back to zero...
Reads CandidateSetMessage instances and converts them to reranker::CandidateSet instances.
shared_ptr< CandidateSet > ReadNext(bool &reader_valid)
A class to fill in a CandidateSet based on a CandidateSetMessage, crucially constructing new Candidat...
CandidateSetReader(int max_num_to_read, int max_candidates_per_set, long reporting_interval=100)
long total_num_read()
Returns the total number of CandidateSet instances read.
long num_read()
Returns the number of CandidateSet instances read since the last invocation of the Reset method...
A class for reading streams of training or test instances, where each training or test instance is a ...
Class to hold a single training instance for a reranker, which is a set of examples, typically the n-best output of some input process, possibly including a gold-standard feature vector.
void Open(const string &filename, bool compressed, bool use_base64, bool reset_counters=true)