Reranker Framework (ReFr)
Reranking framework for structure prediction and discriminative language modeling
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
tokenizer.H
Go to the documentation of this file.
1 // Copyright 2012, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 // * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 // -----------------------------------------------------------------------------
30 //
34 
35 #ifndef RERANKER_TOKENIZER_H_
36 #define RERANKER_TOKENIZER_H_
37 
38 #include <string>
39 #include <vector>
40 
namespace reranker {

/// The set of characters treated as whitespace when parsing specification
/// strings; also the default delimiter set for Tokenizer::Tokenize.
#define SPACE_CHARS " \t"

using std::string;
using std::vector;

/// A very simple tokenizer class.  The class holds no state, so both of its
/// methods are \c const and may be invoked on a \c const instance.
class Tokenizer {
 public:
  /// Tokenizes the specified string, appending each token to the specified
  /// vector.  A token is a maximal run of characters none of which appears
  /// in \a delimiters; empty tokens are never produced.
  ///
  /// \param[in]  s          the string to be tokenized
  /// \param[out] toks       the vector to which tokens are appended (any
  ///                        existing elements are left in place)
  /// \param[in]  delimiters a C string containing the set of delimiter
  ///                        characters (defaults to space and tab)
  void Tokenize(const string &s, vector<string> &toks,
                const char *delimiters = SPACE_CHARS) const {
    string::size_type begin_pos = s.find_first_not_of(delimiters);
    while (begin_pos != string::npos) {
      const string::size_type end_pos = s.find_first_of(delimiters, begin_pos);
      if (end_pos == string::npos) {
        // The final token runs to the end of the string.
        toks.push_back(s.substr(begin_pos));
        break;
      }
      toks.push_back(s.substr(begin_pos, end_pos - begin_pos));
      begin_pos = s.find_first_not_of(delimiters, end_pos);
    }
  }

  /// Parses a specification string of the form
  /// <tt>"ClassName(init_string)"</tt>, depositing the results into the
  /// specified string reference parameters.
  ///
  /// Whitespace (space or tab) before the class name and between the class
  /// name and the opening parenthesis is discarded.  The init string is
  /// everything between the first opening parenthesis that follows the first
  /// character of the class name and the last closing parenthesis in the
  /// string; it is copied verbatim.  Anything after the last closing
  /// parenthesis is ignored.
  ///
  /// \param[in]  spec        the specification string to be parsed
  /// \param[out] class_name  set to the class name on success; untouched
  ///                         otherwise
  /// \param[out] init_string set to the text between the parentheses on
  ///                         success; untouched otherwise
  /// \param[out] error       set to \c true if \a spec is not blank but is
  ///                         malformed, and to \c false in all other cases
  /// \return whether a specification was successfully parsed; a string that
  ///         is empty or entirely whitespace yields \c false with \a error
  ///         set to \c false
  bool ParseSpecString(const string &spec,
                       string &class_name,
                       string &init_string,
                       bool &error) const {
    error = false;
    const string::size_type name_begin = spec.find_first_not_of(SPACE_CHARS);
    if (name_begin == string::npos) {
      // Blank input: nothing to parse, but not an error either.
      return false;
    }

    // The search for '(' starts one past the first non-whitespace character,
    // which guarantees a non-empty class name.
    const string::size_type open_paren = spec.find('(', name_begin + 1);
    const string::size_type close_paren = spec.rfind(')');
    if (open_paren == string::npos ||
        close_paren == string::npos ||
        close_paren < open_paren) {
      error = true;
      return false;
    }

    // Drop any whitespace separating the class name from the parenthesis.
    // The character at name_begin is known to be non-whitespace, so this
    // backward search always succeeds at or after name_begin.
    const string::size_type name_last =
        spec.find_last_not_of(SPACE_CHARS, open_paren - 1);
    class_name = spec.substr(name_begin, name_last - name_begin + 1);
    init_string = spec.substr(open_paren + 1, close_paren - open_paren - 1);
    return true;
  }
};

}  // namespace reranker
127 
128 #endif
void Tokenize(const string &s, vector< string > &toks, const char *delimiters=" \t") const
Tokenizes the specified string, appending the resulting tokens to the specified vector.
Definition: tokenizer.H:62
A very simple tokenizer class.
Definition: tokenizer.H:49
#define SPACE_CHARS
Definition: tokenizer.H:43
bool ParseSpecString(const string &spec, string &class_name, string &init_string, bool &error)
Parses a specification string of the form "ClassName(init_string)", depositing the results into the specified string reference parameters.
Definition: tokenizer.H:97