Reranker Framework (ReFr)
Reranking framework for structure prediction and discriminative language modeling
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
ngram-feature-extractor.C
Go to the documentation of this file.
1 // Copyright 2012, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 // * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 // -----------------------------------------------------------------------------
30 //
31 //
35 
37 
38 namespace reranker {
39 
40 using std::stringstream;
41 
43 
44 void
45 NgramExtractor::Extract(const vector<string> &tokens,
46  const int n,
47  const string &prefix,
48  FeatureVector<string,double> &symbolic_features) const {
49  int tokens_len = (int)tokens.size();
50  int last_token_index = tokens_len - 1;
51  for (int i = 1; i < tokens_len; ++i) {
52  int max_prev = i;
53  for (int prev_index = ((i - n + 1) < 0) ? 0 : (i - n + 1);
54  prev_index <= max_prev;
55  ++prev_index) {
56  // No need to output a feature consisting solely of "</s>" token.
57  if (max_prev == last_token_index && max_prev == prev_index) {
58  break;
59  }
60  stringstream symbol_ss;
61  if (prefix.empty()) {
62  symbol_ss << n << "g_ng{";
63  } else {
64  symbol_ss << prefix << "{";
65  }
66  for (int j = prev_index; j <= max_prev; ++j) {
67  symbol_ss << tokens[j] << ((j < max_prev) ? "," : "}");
68  }
69  symbolic_features.IncrementWeight(symbol_ss.str(), 1.0);
70  }
71  }
72 }
73 
// Extracts n-gram features for a candidate: the candidate's raw data is
// tokenized, wrapped in "<s>" / "</s>" boundary tokens, and handed to
// ngram_extractor_ together with the configured order n_ and prefix_.
// The resulting features are accumulated into symbolic_features.
// NOTE(review): the listing jumps from line 74 to line 77, so the function
// name and its first parameter are not visible here -- confirm against the
// original source file before editing the signature.
74 void
77  symbolic_features) {
78  vector<string> tokens;
// Sentence-start marker goes in before the candidate's own tokens.
79  tokens.push_back("<s>");
// Tokenize appends the tokens of the raw data to the vector passed in
// (presumably splitting on the tokenizer's default delimiters -- verify
// against tokenizer.H).
80  tokenizer_.Tokenize(candidate.raw_data(), tokens);
// Sentence-end marker closes the sequence.
81  tokens.push_back("</s>");
82  ngram_extractor_.Extract(tokens, n_, prefix_, symbolic_features);
83 }
84 
85 
86 } // namespace reranker
virtual void ExtractSymbolic(Candidate &candidate, FeatureVector< string, double > &symbolic_features)
Extracts n-gram features according to the n-gram order specified via the Init method.
const string & raw_data() const
Returns the raw data (typically the sentence) for this candidate.
Definition: candidate.H:143
void Tokenize(const string &s, vector< string > &toks, const char *delimiters=" \t") const
Tokenizes the specified string, depositing the results into the specified vector. ...
Definition: tokenizer.H:62
Extracts n-gram features for candidate hypotheses on the fly.
#define REGISTER_FEATURE_EXTRACTOR(TYPE)
Registers the FeatureExtractor with the specified subtype TYPE with the FeatureExtractor Factory...
Extracts n-gram features from an arbitrary vector of string tokens.
A class to represent a candidate in a set of candidates that constitutes a training instance for a re...
Definition: candidate.H:60
void Extract(const vector< string > &tokens, const int n, const string &prefix, FeatureVector< string, double > &symbolic_features) const
Provides the reranker::NgramFeatureExtractor class.
A class to represent a feature vector, where features are represented by unique identifiers, and feature values are represented by the template type.