36 #ifndef RERANKER_STREAM_TOKENIZER_H_
37 #define RERANKER_STREAM_TOKENIZER_H_
50 using std::istringstream;
51 using std::ostringstream;
63 static const char *default_reserved_words[] = {
80 #define DEFAULT_RESERVED_CHARS "(){},=;/"
103 static const char *names[] = {
104 "EOF",
"RESERVED_CHAR",
"RESERVED_WORD",
"STRING",
"NUMBER",
"IDENTIFIER"
106 return names[token_type];
136 is_(is), num_read_(0), line_number_(0), eof_reached_(false),
138 Init(reserved_chars);
149 sstream_(s), is_(sstream_), num_read_(0), line_number_(0),
150 eof_reached_(false), next_token_idx_(0) {
151 Init(reserved_chars);
157 reserved_words_ = reserved_words;
162 delete[] reserved_chars_;
167 string str() {
return oss_.str(); }
173 return HasPrev() ? token_[next_token_idx_ - 1].curr_pos : 0;
180 return HasNext() ? token_[next_token_idx_].line_number : line_number_;
184 bool HasNext()
const {
return next_token_idx_ < token_.size(); }
186 bool HasPrev()
const {
return next_token_idx_ > 0; }
189 return HasPrev() ? token_[next_token_idx_ - 1].tok :
"";
193 return HasPrev() ? token_[next_token_idx_ - 1].start : 0;
204 throw std::runtime_error(
"invoking StreamTokenizer::Next when HasNext "
208 size_t curr_token_idx = next_token_idx_;
212 if (!eof_reached_ && next_token_idx_ + 1 == token_.size()) {
214 if (GetNext(&next)) {
215 token_.push_back(next);
219 if (next_token_idx_ < token_.size()) {
223 return token_[curr_token_idx].tok;
238 if (num_tokens > next_token_idx_) {
239 num_tokens = next_token_idx_;
241 next_token_idx_ -= num_tokens;
252 return HasNext() ? token_[next_token_idx_].start : num_read_;
265 return HasNext() ? token_[next_token_idx_].line_number : line_number_;
271 string Peek()
const {
return HasNext() ? token_[next_token_idx_].tok :
""; }
274 void Init(
const char *reserved_chars) {
275 num_reserved_chars_ = strlen(reserved_chars);
276 reserved_chars_ =
new char[num_reserved_chars_];
277 strcpy(reserved_chars_, reserved_chars);
278 int num_reserved_words =
sizeof(default_reserved_words)/
sizeof(
const char*);
279 for (
int i = 0; i < num_reserved_words; ++i) {
280 reserved_words_.insert(
string(default_reserved_words[i]));
283 if (GetNext(&next)) {
284 token_.push_back(next);
// Declaration only; the definition is not part of this listing.
// NOTE(review): presumably records the character c as consumed from the
// underlying stream -- confirm against the definition.
288 void ConsumeChar(
char c);
// Declaration only; the definition is not part of this listing.
// NOTE(review): presumably stores the next input character in *c and
// returns whether one was available -- confirm against the definition.
290 bool ReadChar(
char *c);
// Scans for the next token and writes it into *next. Callers in this
// header append *next to token_ only when this returns true, so a false
// return means no token was produced.
299 bool GetNext(Token *next);
303 bool ReservedChar(
char c)
const {
304 for (
size_t i = 0; i < num_reserved_chars_; ++i) {
305 if (c == reserved_chars_[i]) {
317 istringstream sstream_;
322 char *reserved_chars_;
323 size_t num_reserved_chars_;
324 set<string> reserved_words_;
333 vector<Token> token_;
338 size_t next_token_idx_;
A simple class for tokenizing a stream of tokens for the formally specified language used to construc...
string str()
Returns the entire sequence of characters read so far by this stream tokenizer as a newly constructed...
size_t line_number() const
Returns the number of lines read from the underlying byte stream, where a line is any number of bytes...
#define DEFAULT_RESERVED_CHARS
Default set of reserved characters for the StreamTokenizer class.
StreamTokenizer(const string &s, const char *reserved_chars="(){},=;/")
Constructs a new instance around the specified string.
size_t PeekTokenStart() const
Returns the next token’s start position, or the byte position of the underlying byte stream if there ...
size_t start
The starting byte of the token in the underlying stream.
void set_reserved_words(set< string > &reserved_words)
Sets the set of “reserved words” used by this stream tokenizer.
static const char * TypeName(TokenType token_type)
Returns a string type name for the specified TokenType constant.
size_t PeekPrevTokenStart() const
TokenType type
The token’s type.
bool HasNext() const
Returns whether there is another token in the token stream.
string Next()
Returns the next token in the token stream.
void Rewind()
Rewinds this token stream to the beginning.
size_t tellg() const
Returns the number of bytes read from the underlying byte stream just after scanning the most recent ...
size_t PeekTokenLineNumber() const
Returns the line number of the first byte of the next token, or the current line number of the underl...
Information about a token read from the underlying stream.
size_t line_number
The line number of the first byte of the token in the underlying stream.
TokenType
The set of types of tokens read by this stream tokenizer.
string Peek() const
Returns the next token that would be returned by the Next method.
TokenType PeekPrevTokenType() const
string tok
The token itself.
virtual ~StreamTokenizer()
Destroys this instance.
size_t curr_pos
The current position in the underlying stream just after reading this token.
StreamTokenizer(istream &is, const char *reserved_chars="(){},=;/")
Constructs a new instance around the specified byte stream.
void Putback()
A synonym for Rewind(1).
void Rewind(size_t num_tokens)
Rewinds this token stream by the specified number of tokens.
TokenType PeekTokenType() const
Returns the type of the next token, or EOF_TYPE if there is no next token.