InFact
Interpreter and factory for easily creating C++ objects at run-time
stream-tokenizer.h
// Copyright 2014, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// -----------------------------------------------------------------------------
//
//

#ifndef INFACT_STREAM_TOKENIZER_H_
#define INFACT_STREAM_TOKENIZER_H_

#include <iostream>
#include <set>
#include <sstream>
#include <stdexcept>
#include <string>
#include <string.h>
#include <vector>

#include "error.h"

namespace infact {

using std::istream;
using std::istringstream;
using std::ostringstream;
using std::set;
using std::streampos;
using std::string;
using std::vector;
using std::cerr;
using std::endl;

/// The default set of reserved words used by the StreamTokenizer class.
static const char *default_reserved_words[] = {
  "-",
  "nullptr",
  "NULL",
  "import",
  "false",
  "true",
  "bool",
  "int",
  "double",
  "string",
  "bool[]",
  "int[]",
  "double[]",
  "string[]",
};

/// Default set of reserved characters for the StreamTokenizer class.
#define DEFAULT_RESERVED_CHARS "(){},=;/"

/// A simple class for tokenizing a stream of tokens for the formally
/// specified language used to construct objects.
class StreamTokenizer {
 public:
  /// The set of types of tokens read by this stream tokenizer.
  enum TokenType {
    EOF_TYPE,
    RESERVED_CHAR,
    RESERVED_WORD,
    STRING,
    NUMBER,
    IDENTIFIER
  };

  /// Returns a string type name for the specified TokenType constant.
  static const char *TypeName(TokenType token_type) {
    // Must be kept in one-to-one correspondence with the TokenType enum.
    static const char *names[] = {
      "EOF", "RESERVED_CHAR", "RESERVED_WORD", "STRING", "NUMBER", "IDENTIFIER"
    };
    return names[token_type];
  }

  /// Information about a token read from the underlying stream.
  struct Token {
    /// The token itself.
    string tok;
    /// The token's type.
    TokenType type;

    // The following fields capture information about the underlying
    // byte stream at the time this token was read from it.

    /// The starting byte of the token in the underlying stream.
    size_t start;
    /// The line number of the first byte of the token in the underlying
    /// stream.
    size_t line_number;
    /// The stream position of the start of the line of this token.
    size_t line_start_pos;
    /// The current position in the underlying stream just after reading
    /// this token.
    size_t curr_pos;
  };

  /// Constructs a new instance around the specified byte stream.
  StreamTokenizer(istream &is,
                  const char *reserved_chars = DEFAULT_RESERVED_CHARS) :
      is_(is) {
    Init(reserved_chars);
  }

  /// Constructs a new instance around the specified string.
  StreamTokenizer(const string &s,
                  const char *reserved_chars = DEFAULT_RESERVED_CHARS) :
      sstream_(s), is_(sstream_) {
    Init(reserved_chars);
  }

  /// Sets the set of "reserved words" used by this stream tokenizer.
  void set_reserved_words(set<string> &reserved_words) {
    reserved_words_ = reserved_words;
  }

  /// Destroys this instance.
  virtual ~StreamTokenizer() {
    delete[] reserved_chars_;
  }

  /// Returns the entire sequence of characters read so far by this stream
  /// tokenizer as a newly constructed string.
  string str() { return oss_.str(); }

  /// Returns the number of bytes read from the underlying byte stream just
  /// after scanning the most recent token.
  size_t tellg() const {
    return HasPrev() ? token_[next_token_idx_ - 1].curr_pos : 0;
  }

  /// Returns the number of lines read from the underlying byte stream,
  /// where a line is any number of bytes followed by a newline.
  size_t line_number() const {
    return HasNext() ? token_[next_token_idx_].line_number : line_number_;
  }

  /// Returns a string consisting of the characters read so far of the
  /// current line containing the most recently read token.
  string line();

  /// Returns the stream position of the current line in the underlying
  /// byte stream.
  size_t line_start() {
    return HasNext() ? token_[next_token_idx_].line_start_pos : 0;
  }

  /// Returns whether there is another token in the token stream.
  bool HasNext() const { return next_token_idx_ < token_.size(); }

  bool HasPrev() const { return next_token_idx_ > 0; }

  string PeekPrev() const {
    return HasPrev() ? token_[next_token_idx_ - 1].tok : "";
  }

  /// Returns the stream position of the most recent line start of the
  /// previous token, or 0 if this stream has no previous token.
  size_t PeekPrevTokenLineStart() const {
    return HasPrev() ? token_[next_token_idx_ - 1].line_start_pos : 0;
  }

  size_t PeekPrevTokenStart() const {
    return HasPrev() ? token_[next_token_idx_ - 1].start : 0;
  }

  TokenType PeekPrevTokenType() const {
    return HasPrev() ? token_[next_token_idx_ - 1].type : EOF_TYPE;
  }

  /// Returns the next token in the token stream.
  string Next() {
    if (!HasNext()) {
      Error("invoking StreamTokenizer::Next when HasNext returns false");
    }

    size_t curr_token_idx = next_token_idx_;

    // Try to get the next token of the stream if we're about to run out of
    // tokens.
    if (!eof_reached_ && next_token_idx_ + 1 == token_.size()) {
      Token next;
      if (GetNext(&next)) {
        token_.push_back(next);
      }
    }
    // Ensure that we only advance if we haven't already reached token_.size().
    if (next_token_idx_ < token_.size()) {
      ++next_token_idx_;
    }

    return token_[curr_token_idx].tok;
  }

  /// Rewinds this token stream to the beginning.
  void Rewind() {
    next_token_idx_ = 0;
  }

  /// Rewinds this token stream by the specified number of tokens.
  void Rewind(size_t num_tokens) {
    // Cannot rewind more than the number of tokens read so far.
    if (num_tokens > next_token_idx_) {
      num_tokens = next_token_idx_;
    }
    next_token_idx_ -= num_tokens;
  }

  /// A synonym for Rewind(1).
  void Putback() {
    Rewind(1);
  }

  /// Returns the next token's start position, or the byte position of the
  /// underlying byte stream if there is no next token.
  size_t PeekTokenStart() const {
    return HasNext() ? token_[next_token_idx_].start : num_read_;
  }

  /// Returns the type of the next token, or EOF_TYPE if there is no next
  /// token.
  TokenType PeekTokenType() const {
    return HasNext() ? token_[next_token_idx_].type : EOF_TYPE;
  }

  /// Returns the line number of the first byte of the next token, or the
  /// current line number of the underlying byte stream if there is no next
  /// token.
  size_t PeekTokenLineNumber() const {
    return HasNext() ? token_[next_token_idx_].line_number : line_number_;
  }

  /// Returns the next token that would be returned by the Next method.
  string Peek() const { return HasNext() ? token_[next_token_idx_].tok : ""; }

 private:
  void Init(const char *reserved_chars) {
    num_reserved_chars_ = strlen(reserved_chars);
    reserved_chars_ = new char[num_reserved_chars_ + 1];
    strcpy(reserved_chars_, reserved_chars);
    int num_reserved_words = sizeof(default_reserved_words)/sizeof(const char*);
    for (int i = 0; i < num_reserved_words; ++i) {
      reserved_words_.insert(string(default_reserved_words[i]));
    }
    Token next;
    if (GetNext(&next)) {
      token_.push_back(next);
    }
  }

  void ConsumeChar(char c);

  bool ReadChar(char *c);

  // Retrieves the next token from the underlying byte stream, returning
  // false if the end of the stream has been reached.
  bool GetNext(Token *next);

  // Returns whether the specified character is one of this tokenizer's
  // reserved characters.
  bool ReservedChar(char c) const {
    for (size_t i = 0; i < num_reserved_chars_; ++i) {
      if (c == reserved_chars_[i]) {
        return true;
      }
    }
    return false;
  }

  // data members

  // The stream itself.
  // sstream_ is used only when this tokenizer is constructed from a string.
  istringstream sstream_;
  istream &is_;

  // Information about special tokens.
  char *reserved_chars_;
  size_t num_reserved_chars_;
  set<string> reserved_words_;

  // Information about the current state of the underlying byte stream.
  size_t num_read_ = 0;
  size_t line_number_ = 0;
  size_t line_start_pos_ = 0;
  bool eof_reached_ = false;
  ostringstream oss_;

  // The sequence of tokens read so far.
  vector<Token> token_;

  // The index of the next token in this stream in token_, or token_.size()
  // if there are no more tokens left in this stream. Note that invocations
  // of the Rewind and Putback methods alter this data member.
  size_t next_token_idx_ = 0;
};

}  // namespace infact

#endif
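
Usage sketch (not part of the header): the fragment below is a hypothetical example.cc that tokenizes a short string and prints each token alongside its type name. It assumes this header and the accompanying error implementation are compiled and linked into the program; the input string and file name are illustrative only.

// example.cc -- hypothetical usage sketch for StreamTokenizer.
#include <iostream>

#include "stream-tokenizer.h"

int main() {
  // Construct a tokenizer around a string, using the default reserved
  // characters and reserved words.
  infact::StreamTokenizer tokenizer("b = true;");

  // Consume every token, printing its type name followed by its text.
  while (tokenizer.HasNext()) {
    std::cout << infact::StreamTokenizer::TypeName(tokenizer.PeekTokenType())
              << "\t" << tokenizer.Next() << std::endl;
  }
  return 0;
}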