40 using std::stringstream;
49 int tokens_len = (int)tokens.size();
50 int last_token_index = tokens_len - 1;
51 for (
int i = 1; i < tokens_len; ++i) {
53 for (
int prev_index = ((i - n + 1) < 0) ? 0 : (i - n + 1);
54 prev_index <= max_prev;
57 if (max_prev == last_token_index && max_prev == prev_index) {
60 stringstream symbol_ss;
62 symbol_ss << n <<
"g_ng{";
64 symbol_ss << prefix <<
"{";
66 for (
int j = prev_index; j <= max_prev; ++j) {
67 symbol_ss << tokens[j] << ((j < max_prev) ?
"," :
"}");
69 symbolic_features.IncrementWeight(symbol_ss.str(), 1.0);
78 vector<string> tokens;
79 tokens.push_back(
"<s>");
81 tokens.push_back(
"</s>");
82 ngram_extractor_.
Extract(tokens, n_, prefix_, symbolic_features);
const string & raw_data() const
Returns the raw data (typically the sentence) for this candidate.
void Tokenize(const string &s, vector< string > &toks, const char *delimiters=" \t") const
Tokenizes the specified string, depositing the results into the specified vector. ...
A class to represent a candidate in a set of candidates that constitutes a training instance for a reranker.
A class to represent a feature vector, where features are represented by unique identifiers, and feature values are represented by the template type.