Reranker Framework (ReFr)
Reranking framework for structure prediction and discriminative language modeling
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
stream-tokenizer.H
Go to the documentation of this file.
1 // Copyright 2012, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 // * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 // -----------------------------------------------------------------------------
30 //
31 //
35 
36 #ifndef RERANKER_STREAM_TOKENIZER_H_
37 #define RERANKER_STREAM_TOKENIZER_H_
38 
39 #include <iostream>
40 #include <set>
41 #include <sstream>
42 #include <stdexcept>
43 #include <string>
44 #include <string.h>
45 #include <vector>
46 
47 namespace reranker {
48 
49 using std::istream;
50 using std::istringstream;
51 using std::ostringstream;
52 using std::set;
53 using std::streampos;
54 using std::string;
55 using std::vector;
56 using std::cerr;
57 using std::endl;
58 
// Words that a StreamTokenizer treats as reserved unless the set is replaced
// via set_reserved_words. StreamTokenizer::Init inserts every entry of this
// table into the tokenizer's reserved-word set; the entry count is derived
// from the array size, so entries may be added or removed freely.
static const char *default_reserved_words[] = {
  // Special words and literals.
  "-", "nullptr", "NULL", "false", "true",
  // Scalar type names.
  "bool", "int", "double", "string",
  // Array type names.
  "bool[]", "int[]", "double[]", "string[]",
};
78 
// Default set of reserved characters for the StreamTokenizer class; used as
// the default value of the `reserved_chars` argument of both constructors.
80 #define DEFAULT_RESERVED_CHARS "(){},=;/"
81 
88  public:
92  enum TokenType {
99  };
100 
102  static const char *TypeName(TokenType token_type) {
103  static const char *names[] = {
104  "EOF", "RESERVED_CHAR", "RESERVED_WORD", "STRING", "NUMBER", "IDENTIFIER"
105  };
106  return names[token_type];
107  }
108 
// Information about a token read from the underlying stream.
// NOTE(review): other members of the class read a `type` field from Token
// (token_[...].type); its declaration is not visible in this listing.
110  struct Token {
// The token itself.
112  string tok;
115 
116  // The following three fields capture information about the underlying
117  // byte stream at the time this token was read from it.
118 
// The starting byte of the token in the underlying stream.
120  size_t start;
// The line number of the first byte of the token in the underlying stream.
122  size_t line_number;
// The current position in the underlying stream just after reading this
// token.
125  size_t curr_pos;
126  };
127 
134  StreamTokenizer(istream &is,
135  const char *reserved_chars = DEFAULT_RESERVED_CHARS) :
136  is_(is), num_read_(0), line_number_(0), eof_reached_(false),
137  next_token_idx_(0) {
138  Init(reserved_chars);
139  }
140 
147  StreamTokenizer(const string &s,
148  const char *reserved_chars = DEFAULT_RESERVED_CHARS) :
149  sstream_(s), is_(sstream_), num_read_(0), line_number_(0),
150  eof_reached_(false), next_token_idx_(0) {
151  Init(reserved_chars);
152  }
153 
156  void set_reserved_words(set<string> &reserved_words) {
157  reserved_words_ = reserved_words;
158  }
159 
// Destroys this instance, releasing the reserved-character buffer that Init
// allocated with new[].
161  virtual ~StreamTokenizer() {
162  delete[] reserved_chars_;
163  }
164 
167  string str() { return oss_.str(); }
168 
172  size_t tellg() const {
173  return HasPrev() ? token_[next_token_idx_ - 1].curr_pos : 0;
174  }
175 
179  size_t line_number() const {
180  return HasNext() ? token_[next_token_idx_].line_number : line_number_;
181  }
182 
184  bool HasNext() const { return next_token_idx_ < token_.size(); }
185 
186  bool HasPrev() const { return next_token_idx_ > 0; }
187 
188  string PeekPrev() const {
189  return HasPrev() ? token_[next_token_idx_ - 1].tok : "";
190  }
191 
192  size_t PeekPrevTokenStart() const {
193  return HasPrev() ? token_[next_token_idx_ - 1].start : 0;
194  }
195 
197  return HasPrev() ? token_[next_token_idx_ - 1].type : EOF_TYPE;
198  }
199 
201  string Next() {
202  if (!HasNext()) {
203  // Error.
204  throw std::runtime_error("invoking StreamTokenizer::Next when HasNext "
205  "returns false");
206  }
207 
208  size_t curr_token_idx = next_token_idx_;
209 
210  // Try to get the next token of the stream if we're about to run out of
211  // tokens.
212  if (!eof_reached_ && next_token_idx_ + 1 == token_.size()) {
213  Token next;
214  if (GetNext(&next)) {
215  token_.push_back(next);
216  }
217  }
218  // Ensure that we only advance if we haven't already reached token_.size().
219  if (next_token_idx_ < token_.size()) {
220  ++next_token_idx_;
221  }
222 
223  return token_[curr_token_idx].tok;
224  }
225 
// Rewinds this token stream to the beginning: the next token returned is
// the first one in the token buffer.
228  void Rewind() {
229  next_token_idx_ = 0;
230  }
231 
236  void Rewind(size_t num_tokens) {
237  // Cannot rewind more than the number of tokens read so far.
238  if (num_tokens > next_token_idx_) {
239  num_tokens = next_token_idx_;
240  }
241  next_token_idx_ -= num_tokens;
242  }
243 
245  void Putback() {
246  Rewind(1);
247  }
248 
251  size_t PeekTokenStart() const {
252  return HasNext() ? token_[next_token_idx_].start : num_read_;
253  }
254 
258  return HasNext() ? token_[next_token_idx_].type : EOF_TYPE;
259  }
260 
264  size_t PeekTokenLineNumber() const {
265  return HasNext() ? token_[next_token_idx_].line_number : line_number_;
266  }
267 
271  string Peek() const { return HasNext() ? token_[next_token_idx_].tok : ""; }
272 
273  private:
274  void Init(const char *reserved_chars) {
275  num_reserved_chars_ = strlen(reserved_chars);
276  reserved_chars_ = new char[num_reserved_chars_];
277  strcpy(reserved_chars_, reserved_chars);
278  int num_reserved_words = sizeof(default_reserved_words)/sizeof(const char*);
279  for (int i = 0; i < num_reserved_words; ++i) {
280  reserved_words_.insert(string(default_reserved_words[i]));
281  }
282  Token next;
283  if (GetNext(&next)) {
284  token_.push_back(next);
285  }
286  }
287 
// NOTE(review): defined outside this header listing; presumably records a
// character that has been read from the stream — confirm against the
// definition.
288  void ConsumeChar(char c);
289 
// NOTE(review): defined outside this header listing; presumably reads one
// character from the underlying stream into *c and reports success — confirm
// against the definition.
290  bool ReadChar(char *c);
291 
// Attempts to produce the next token, storing it in *next. Callers in this
// class (Init and Next) append *next to the token buffer only when this
// returns true. The definition is not part of this listing.
299  bool GetNext(Token *next);
300 
303  bool ReservedChar(char c) const {
304  for (size_t i = 0; i < num_reserved_chars_; ++i) {
305  if (c == reserved_chars_[i]) {
306  return true;
307  }
308  }
309  return false;
310  }
311 
312  // data members
313 
314  // The stream itself.
317  istringstream sstream_;
319  istream &is_;
320 
321  // Information about special tokens.
322  char *reserved_chars_;
323  size_t num_reserved_chars_;
324  set<string> reserved_words_;
325 
326  // Information about the current state of the underlying byte stream.
327  size_t num_read_;
328  size_t line_number_;
329  bool eof_reached_;
330  ostringstream oss_;
331 
332  // The sequence of tokens read so far.
333  vector<Token> token_;
334 
335  // The index of the next token in this stream in token_, or token_.size()
336  // if there are no more tokens left in this stream. Note that invocations
337  // of the Rewind and Putback methods alter this data member.
338  size_t next_token_idx_;
339 };
340 
341 } // namespace reranker
342 
343 #endif
A simple class for tokenizing a stream of tokens for the formally specified language used to construc...
string str()
Returns the entire sequence of characters read so far by this stream tokenizer as a newly constructed...
size_t line_number() const
Returns the number of lines read from the underlying byte stream, where a line is any number of bytes...
#define DEFAULT_RESERVED_CHARS
Default set of reserved characters for the StreamTokenizer class.
StreamTokenizer(const string &s, const char *reserved_chars="(){},=;/")
Constructs a new instance around the specified string.
size_t PeekTokenStart() const
Returns the next token’s start position, or the byte position of the underlying byte stream if there ...
size_t start
The starting byte of the token in the underlying stream.
void set_reserved_words(set< string > &reserved_words)
Sets the set of “reserved words” used by this stream tokenizer.
static const char * TypeName(TokenType token_type)
Returns a string type name for the specified TokenType constant.
size_t PeekPrevTokenStart() const
TokenType type
The token’s type.
bool HasNext() const
Returns whether there is another token in the token stream.
string Next()
Returns the next token in the token stream.
void Rewind()
Rewinds this token stream to the beginning.
size_t tellg() const
Returns the number of bytes read from the underlying byte stream just after scanning the most recent ...
size_t PeekTokenLineNumber() const
Returns the line number of the first byte of the next token, or the current line number of the underl...
Information about a token read from the underlying stream.
size_t line_number
The line number of the first byte of the token in the underlying stream.
TokenType
The set of types of tokens read by this stream tokenizer.
string Peek() const
Returns the next token that would be returned by the Next method.
TokenType PeekPrevTokenType() const
string tok
The token itself.
virtual ~StreamTokenizer()
Destroys this instance.
size_t curr_pos
The current position in the underlying stream just after reading this token.
StreamTokenizer(istream &is, const char *reserved_chars="(){},=;/")
Constructs a new instance around the specified byte stream.
void Putback()
A synonym for Rewind(1).
void Rewind(size_t num_tokens)
Rewinds this token stream by the specified number of tokens.
TokenType PeekTokenType() const
Returns the type of the next token, or EOF_TYPE if there is no next token.