Reranker Framework (ReFr)
Reranking framework for structure prediction and discriminative language modeling
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
stream-tokenizer.C
Go to the documentation of this file.
1 // Copyright 2012, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 // * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 // -----------------------------------------------------------------------------
30 //
31 //
36 
37 #include <ctype.h>
38 #include <stdexcept>
39 
40 #include "stream-tokenizer.H"
41 
42 namespace reranker {
43 
44 void
45 StreamTokenizer::ConsumeChar(char c) {
46  oss_ << c;
47  ++num_read_;
48  if (c == '\n') {
49  ++line_number_;
50  }
51 }
52 
53 bool
54 StreamTokenizer::ReadChar(char *c) {
55  (*c) = is_.get();
56  if (!is_.good()) {
57  eof_reached_ = true;
58  return false;
59  } else {
60  ConsumeChar(*c);
61  return true;
62  }
63 }
64 
65 bool
66 StreamTokenizer::GetNext(Token *next) {
67  if (!is_.good()) {
68  eof_reached_ = true;
69  return false;
70  }
71  // Get first character of next token.
72  char c;
73  bool is_whitespace = true;
74  while (is_whitespace) {
75  if (!ReadChar(&c)) {
76  return false;
77  }
78  is_whitespace = isspace(c);
79 
80  // If we find a comment character, then read to the end of the line.
81  if (!is_whitespace && c == '/' && is_.peek() == '/') {
82  while (c != '\n') {
83  if (!ReadChar(&c)) {
84  return false;
85  }
86  }
87  is_whitespace = true;
88  }
89  }
90 
91  // In case we're successful in reading the next token, we fill in
92  // the two stream state data available now.
93  next->start = num_read_ - 1;
94  next->line_number = line_number_;
95 
96  bool next_tok_complete = false;
97  next->tok.clear();
98  if (ReservedChar(c)) {
99  next->tok += c;
100  next_tok_complete = true;
101  next->type = RESERVED_CHAR;
102  } else if (c == '"') {
103  // We've got a string literal, so keep reading characters,
104  // until hitting a non-escaped double quote.
105  streampos string_literal_start_pos = num_read_ - 1;
106  bool found_closing_quote = false;
107  while (is_.good()) {
108  bool success = ReadChar(&c);
109  if (success) {
110  if (c == '"') {
111  found_closing_quote = true;
112  break;
113  } else if (c == '\\') {
114  success = ReadChar(&c);
115  }
116  }
117  if (success) {
118  next->tok += c;
119  }
120  }
121  if (!found_closing_quote) {
122  cerr << "StreamTokenizer: error: could not find closing "
123  << "double quote for string literal beginning at stream index "
124  << string_literal_start_pos
125  << "; partial string literal read: \""
126  << next->tok << endl;
127  throw std::runtime_error("unclosed string literal");
128  }
129  next_tok_complete = true;
130  next->type = STRING;
131  } else {
132  // This is a number, a reserved word or C++ identifier token, so
133  // add first character; the remainder of the token will be handled
134  // in the next block.
135  next->tok += c;
136  next->type = (c == '-' || (c >= '0' && c <= '9')) ? NUMBER : IDENTIFIER;
137  }
138  if (!next_tok_complete) {
139  // The current token is a number, a reserved word or C++
140  // identifier, so we keep reading characters until hitting a
141  // "reserved character", a whitespace character or EOF.
142  bool done = false;
143  while (!done && is_.good()) {
144  // We don't call ReadChar below because the next character might
145  // tell us that the current token has ended (i.e., if it's a
146  // reserved character, a double quote or a whitespace
147  // character).
148  int peek = is_.peek();
149  if (peek != EOF) {
150  char next_char = static_cast<char>(peek);
151  if (ReservedChar(next_char) || next_char == '"' || isspace(next_char)) {
152  // Now that we've finished reading something that is not a
153  // string literal, change its type to be RESERVED_WORD if it
154  // exactly matches something in the set of reserved words.
155  if (reserved_words_.count(next->tok) != 0) {
156  next->type = RESERVED_WORD;
157  }
158  done = true;
159  } else {
160  ReadChar(&c);
161  next->tok += c;
162  }
163  } else {
164  eof_reached_ = true;
165  }
166  }
167  }
168  // We're about to return a successfully read token, so we make sure to record
169  // the stream position at this point in the Token object.
170  next->curr_pos = num_read_;
171 
172  return true;
173 }
174 
175 } // namespace reranker
Provides the StreamTokenizer class.