Reranker Framework (ReFr)
Reranking framework for structure prediction and discriminative language modeling
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
candidate-set-writer.H
Go to the documentation of this file.
1 // Copyright 2012, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 // * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 // -----------------------------------------------------------------------------
30 //
31 //
36 
37 #ifndef RERANKER_CANDIDATE_SET_WRITER_H_
38 #define RERANKER_CANDIDATE_SET_WRITER_H_
39 
40 #include <iostream>
41 #include <memory>
42 #include <vector>
43 
44 #include "candidate-set.H"
46 #include "../proto/dataio.h"
47 
48 #define DEFAULT_WRITER_REPORTING_INTERVAL 1000
49 
50 namespace reranker {
51 
52 using std::cerr;
53 using std::endl;
54 using std::string;
55 using std::vector;
56 using std::shared_ptr;
57 
63  public:
65  CandidateSetWriter(long reporting_interval =
67  max_num_to_write_(-1),
68  num_written_(0),
69  interval_written_(0),
70  reporting_interval_(reporting_interval),
71  verbosity_(0) { }
72  virtual ~CandidateSetWriter() { }
73 
74  void Open(const string &filename,
75  bool compressed,
76  bool use_base64) {
77  if (verbosity_ >= 1) {
78  cerr << "CandidateSetWriter: writing to file \"" << filename
79  << "\"." << endl;
80  }
81  bool writing_to_stdout = filename == "-";
82  ConfusionProtoIO::Mode mode =
83  writing_to_stdout ?
84  ConfusionProtoIO::WRITESTD : ConfusionProtoIO::WRITE;
85  compressed = writing_to_stdout ? false : compressed;
86  writer_ = new ConfusionProtoIO(filename, mode, compressed, use_base64);
87  }
88 
98  void Write(vector<shared_ptr<CandidateSet> > &examples,
99  const string &filename,
100  bool compressed,
101  bool use_base64) {
102  Open(filename, compressed, use_base64);
103  bool writer_valid = true;
104  for (vector<shared_ptr<CandidateSet> >::const_iterator it =
105  examples.begin();
106  writer_valid && it != examples.end();
107  ++it) {
108  // First, serialize current CandidateSet to a CandidateSetMessage.
109  const CandidateSet &candidate_set = *(*it);
110  writer_valid = WriteNext(candidate_set);
111  }
112  Close();
113  }
114 
115  bool WriteNext(const CandidateSet &candidate_set) {
116  if (num_written_ == max_num_to_write_) {
117  return false;
118  }
119 
120  confusion_learning::CandidateSetMessage tmp_msg;
121  candidate_set_proto_writer_.Write(candidate_set, &tmp_msg);
122 
123  // Now write it out using ConfusionProtoIO instance.
124  bool writer_valid = writer_->Write(tmp_msg);
125  if (writer_valid) {
126  if (verbosity_ >= 3) {
127  cerr << "CandidateSetWriter: most recent CandidateSetMessage: "
128  << tmp_msg.Utf8DebugString();
129  }
130 
131  if (verbosity_ >= 2) {
132  cerr << "CandidateSetWriter: candidate set " << candidate_set;
133  }
134 
135  ++num_written_;
136  ++interval_written_;
137 
138  if (interval_written_ == reporting_interval_) {
139  if (verbosity_ >= 1) {
140  cerr << "CandidateSetWriter: wrote " << num_written_
141  << " candidate sets." << endl;
142  }
143  interval_written_ = 0;
144  }
145  }
146  return writer_valid;
147  }
148 
149  void Close() {
150  writer_->Close();
151  delete writer_;
152  }
153 
156  void Reset() {
157  num_written_ = 0;
158  interval_written_ = 0;
159  }
160 
172  void set_verbosity(int verbosity) { verbosity_ = verbosity; }
173 
174  void set_max_num_to_write(int max_num_to_write) {
175  max_num_to_write_ = max_num_to_write;
176  }
177 
178  private:
179  // data members
180  ConfusionProtoIO *writer_;
181  CandidateSetProtoWriter candidate_set_proto_writer_;
182  int max_num_to_write_;
183  long num_written_;
184  long interval_written_;
185  long reporting_interval_;
186  int verbosity_;
187 };
188 
189 } // namespace reranker
190 
191 #endif
void Open(const string &filename, bool compressed, bool use_base64)
void Write(const CandidateSet &set, CandidateSetMessage *candidate_set_message) const
Serializes a CandidateSet instance to a CandidateSetMessage.
#define DEFAULT_WRITER_REPORTING_INTERVAL
A class to construct a CandidateSetMessage from a CandidateSet instance.
void Write(vector< shared_ptr< CandidateSet > > &examples, const string &filename, bool compressed, bool use_base64)
Writes a stream of CandidateSet instances to the specified file or to standard output.
void set_verbosity(int verbosity)
Sets the verbosity of this writer (mostly for debugging purposes).
A class for writing streams of training or test instances, where each training or test instance is a ...
CandidateSetWriter(long reporting_interval=1000)
Constructs a new instance.
Serializer for reranker::CandidateSet instances to CandidateSetMessage instances. ...
void Reset()
Resets this writer so that its internal count of the number of CandidateSets written goes back to zero.
A class to hold a set of candidates, either for training or test.
Definition: candidate-set.H:62
bool WriteNext(const CandidateSet &candidate_set)
void set_max_num_to_write(int max_num_to_write)
Class to hold a single training instance for a reranker, which is a set of examples, typically the n-best output of some input process, possibly including a gold-standard feature vector.