40 #include <unordered_set> 
   50 using std::unordered_set;
 
   57                                       PerceptronModelDefaultUpdatePredicate)
 
   60                              PerceptronModelDefaultUpdater)
 
   72   initializers.Add(
"name", &name_, required);
 
   73   initializers.Add(
"score_comparator", &score_comparator_);
 
   74   initializers.Add(
"gold_comparator", &gold_comparator_);
 
   75   initializers.Add(
"candidate_set_scorer", &candidate_set_scorer_);
 
   76   initializers.Add(
"update_predicate", &update_predicate_);
 
   77   initializers.Add(
"updater", &updater_);
 
   78   initializers.Add(
"step_size", &step_size_);
 
  107     cerr << 
"Final best raw model: " 
  109          << 
"Final best averaged model: " 
  121         cerr << 
"Training because we have trained only " << num_epochs
 
  122              << 
" epochs but max epochs is " << 
max_epochs() << 
"." << endl;
 
  125         cerr << 
"Stopping training because we have trained " 
  126              << num_epochs << 
" epochs and max epochs is " 
  138         cerr << 
"Training because we have trained " << num_epochs
 
  139              << 
" epochs but min epochs is " << 
min_epochs() << 
"." << endl;
 
  149       cerr << 
"Training because num epochs in decline is " 
  153       cerr << 
"Stopping training because num epochs in decline is " 
  191     int num_training_errors_this_epoch =
 
  193     double percent_training_errors_this_epoch =
 
  194         ((double)num_training_errors_this_epoch / 
time_.
index()) * 100.0;
 
  195     cerr << 
"Epoch " << 
time_.
epoch() << 
": number of training errors: " 
  196          << num_training_errors_this_epoch << 
" (" 
  197          << percent_training_errors_this_epoch << 
"%)" << endl;
 
  210   bool training = 
true;
 
  215       cerr << 
"Time:" << 
time_.
to_string() << 
": need to update because " 
  217            << 
" is not equal to gold index " << example.
gold_index() << endl;
 
  231 PerceptronModel::DefaultUpdatePredicate::NeedToUpdate(
Model *model,
 
  245   unordered_set<int> gold_features;
 
  246   unordered_set<int> best_scoring_features;
 
  251                                                        best_scoring_features);
 
  258     cerr << 
"Updating weights for gold features [";
 
  259     for (unordered_set<int>::const_iterator it = gold_features.begin();
 
  260          it != gold_features.end(); ++it) {
 
  263     cerr << 
"] from\n\t" << example.
GetGold() << endl;
 
  265     cerr << 
"Updating weights for best scoring features [";
 
  266     for (unordered_set<int>::const_iterator it = best_scoring_features.begin();
 
  267          it != best_scoring_features.end(); ++it) {
 
  274   double positive_step = step_size;
 
  277   double negative_step = -step_size;
 
  289   double total_weight = 0.0;
 
  290   double total_weighted_loss = 0.0;
 
  291   double total_oracle_loss = 0.0;
 
  292   double total_baseline_loss = 0.0;
 
  295   bool not_training = 
false;
 
  296   size_t development_test_size = 0;
 
  297   development_test.
Reset();
 
  298   while (development_test.
HasNext()) {
 
  299     ++development_test_size;
 
  307     total_weight += loss_weight;
 
  309     total_oracle_loss += loss_weight * candidate_set.
GetGold().
loss();
 
  312     total_baseline_loss += loss_weight * candidate_set.
Get(0).
loss();
 
  318   double loss_this_epoch = total_weighted_loss / total_weight;
 
  321   int num_testing_errors_this_epoch =
 
  323   double percent_testing_errors_this_epoch =
 
  324       ((double)num_testing_errors_this_epoch / development_test_size) * 100.0;
 
  325   double oracle_loss = total_oracle_loss / total_weight;
 
  326   double baseline_loss = total_baseline_loss / total_weight;
 
  327   cerr << 
"Epoch " << 
time_.
epoch() << 
": oracle loss: " << oracle_loss << endl;
 
  328   cerr << 
"Epoch " << 
time_.
epoch() << 
": baseline loss: " << baseline_loss << endl;
 
  329   cerr << 
"Epoch " << 
time_.
epoch() << 
": average devtest loss: " 
  330        << loss_this_epoch << endl;
 
  331   cerr << 
"Epoch " << 
time_.
epoch() << 
": number of testing errors: " 
  332        << num_testing_errors_this_epoch << 
" (" 
  333        << percent_testing_errors_this_epoch << 
"%)" << endl;
 
  350   return loss_this_epoch;
 
  360   bool use_raw = training;
 
  365          << candidate << 
" with " << (use_raw ? 
"raw" : 
"avg")
 
  366          << 
" model: " << model << endl
 
  367          << 
"\tscore: " << score << endl;
 
  377   unordered_set<int> old_uids;
 
  380   unordered_map<int, int> old_to_new_uids;
 
  382   for (unordered_set<int>::const_iterator it = old_uids.begin();
 
  383        it != old_uids.end();
 
  385     old_to_new_uids[*it] = new_uid++;
 
  394          it != old_symbols->
end();
 
  396       unordered_map<int, int>::const_iterator old_to_new_uid_it =
 
  397           old_to_new_uids.find(it->second);
 
  398       if (old_to_new_uid_it != old_to_new_uids.end()) {
 
  399         int new_uid = old_to_new_uid_it->second;
 
  400         const string &symbol = it->first;
 
  411                                          gold_features_to_update,
 
  413                                          best_scoring_features_to_update)
 
  422     cerr << 
"Gold index: " << example.
gold_index()
 
  425     cerr << 
"Original gold features: " << gold_features << endl
 
  426          << 
"Original best scoring features: " << best_scoring_features << endl;
 
  431                                             gold_features_to_update);
 
  435     for (unordered_set<int>::const_iterator it =
 
  436              gold_features_to_update.begin();
 
  437          it != gold_features_to_update.end();
 
  446                                     best_scoring_features_to_update);
 
  448     cerr << 
"Time:" << 
time_.
to_string() << 
": new best scoring features: [";
 
  449     for (unordered_set<int>::const_iterator it =
 
  450              best_scoring_features_to_update.begin();
 
  451          it != best_scoring_features_to_update.end();
 
void UpdateAllFeatureAverages(const Time &time)
 
virtual void EndOfEpoch()
 
void NewEpoch()
Increments the epoch counter. 
 
Provides the reranker::PerceptronModel reranker class. 
 
const Candidate & GetGold() const 
 
virtual double ScoreCandidate(Candidate &candidate, bool training)
Scores a candidate according to either the raw or averaged version of this perceptron model...
 
Model is an interface for reranking models. 
 
An interface specifying iteration over CandidateSet instances, using Java-style semantics (sorry...
 
unordered_set< K > & RemoveEqualFeatures(const FeatureVector< K, V > &other, unordered_set< K > &set) const 
Removes from the specified set the uid's of features with weights equal in this vector to their weight...
 
TrainingVectorSet best_models_
The best models seen so far during training, according to evaluation on the held-out development test...
 
void RemapFeatureUids(const unordered_map< int, int > &old_to_new_uids)
 
virtual Symbols * Clone() const =0
Creates a newly-constructed clone of this Symbols instance that has the same runtime type...
 
#define REGISTER_MODEL(TYPE)
Registers the Model  implementation with the specified subtype TYPE with the Model  Factory...
 
const Candidate & GetBestScoring() const 
 
virtual void Clear()=0
Clears all symbols from this symbol table. 
 
bool CompileFeatures(Symbols *symbols, bool clear_features=false, bool clear_symbolic_features=true, bool force=false)
Compiles any symbolic features in this candidate set. 
 
virtual const_iterator end()=0
 
vector< double > loss_per_epoch_
The average loss per epoch. 
 
TrainingVectorSet models_
The feature vectors representing this model. 
 
int num_epochs_in_decline_
The current number of training epochs in which the model has been degrading in development set perfor...
 
virtual void CompactifyFeatureUids()
Renumbers the potentially sparse feature uid’s so that they occupy the interval [0,n-1] densely, for n non-zero features in use by this model. 
 
shared_ptr< CandidateSet::Scorer > candidate_set_scorer_
A scorer for CandidateSet instances. 
 
virtual void Do(Model *model)=0
The function to be executed by the Model that wraps this hook. 
 
This class implements a perceptron model reranker. 
 
virtual void TrainOnExample(CandidateSet &example)
Trains this model on the specified training example. 
 
virtual void TrainOneEpoch(CandidateSetIterator &examples)
Trains this model for one epoch, i.e., a single pass through the specified set of training examples...
 
Symbols * symbols_
The symbol table for this model (may be NULL). 
 
double loss_weight() const 
Returns the weight of the loss for this candidate set’s reference. 
 
virtual double Evaluate(CandidateSetIterator &development_test)
Evaluates this model on the specified set of held-out development test data. 
 
virtual bool HasNext() const =0
Returns whether this iterator contains another CandidateSet. 
 
vector< int > num_testing_errors_per_epoch_
The number of testing errors made on held-out development test data for each epoch. 
 
virtual double Apply(const FeatureVector< int, double > &fv1, const FeatureVector< int, double > &fv2)
Applies this kernel function to the specified feature vectors. 
 
#define REGISTER_NAMED_MODEL_UPDATE_PREDICATE(TYPE, NAME)
Registers the Model::UpdatePredicate  implementation with the specified subtype TYPE and NAME with th...
 
unordered_map< string, int >::const_iterator const_iterator
 
A class to construct a PerceptronModel from a ModelMessage instance. 
 
Time time_
The tiny object that holds the "training time" for this model (epoch, index and absolute time index)...
 
virtual void Reset()=0
Resets this iterator back to the beginning of its backing collection. 
 
const FeatureVector< int, double > & average_weights() const 
Returns the feature vector corresponding to the averaged perceptron. 
 
vector< int > num_training_errors_per_epoch_
The number of errors made on training examples during each epoch. 
 
A class to hold a set of candidates, either for training or test. 
 
double absolute_seconds() const 
 
int max_epochs() const 
Returns the maximum number of epochs to train. 
 
#define REGISTER_NAMED_MODEL_UPDATER(TYPE, NAME)
Registers the Model::Updater  implementation with the specified subtype TYPE and NAME with the Model:...
 
KernelFunction * kernel_fn_
Yes, this is an interface, but we add the kernel function as a data member. 
 
An interface specifying a converter from symbols (strings) to int indices. 
 
virtual bool use_weighted_loss()
 
double seconds_since_last_epoch() const 
 
const Time & time() const 
Returns the current training time of this model: number of epochs, number of time steps in the curren...
 
An interface for an environment in which variables of various types are mapped to their values...
 
A class to represent a candidate in a set of candidates that constitutes a training instance for a re...
 
int num_training_errors_
The number of errors made on training examples. 
 
void Tick()
Increments both the time index for the current epoch and the absolute time index. ...
 
void set_score(double score)
Sets the score of this candidate. 
 
virtual void ScoreCandidates(CandidateSet &candidates, bool training)
Scores the specified set of candidates according to either the raw or averaged version of this percep...
 
virtual void ComputeFeaturesToUpdate(const CandidateSet &example, unordered_set< int > &gold_features_to_update, unordered_set< int > &best_scoring_features_to_update) const 
Computes the features to be updated for the gold candidate and the best-scoring candidate. 
 
const FeatureVector< int, double > & weights() const 
Returns the "raw" feature weights computed during training. 
 
size_t best_scoring_index() const 
 
Candidate & Get(size_t idx)
 
double loss() const 
Returns the loss of this candidate. 
 
virtual void Update(CandidateSet &example)
Updates the current model based on the specified set of candidates. 
 
const FeatureVector< int, double > & GetModel(bool raw) const 
Returns either the raw or averaged feature vector, depending on the argument. 
 
int epoch() const 
Returns the index of the current epoch. 
 
A class to construct a ModelMessage from a PerceptronModel instance. 
 
virtual void Init(const Environment *env, const string &arg)
Initializes this instance. 
 
virtual void Train(CandidateSetIterator &examples, CandidateSetIterator &development_test)
Trains this model on a collection of training examples, where each training example is a set of candi...
 
virtual double ComputeStepSize(const unordered_set< int > &gold_features, const unordered_set< int > &best_scoring_features, const CandidateSet &example)
Computes the step size for the next update, and, as a side effect, caches this value in step_size_...
 
shared_ptr< UpdatePredicate > update_predicate_
The update predicate for this model. 
 
int index() const 
Returns the index of the current training example within the current epoch. 
 
int min_epochs() const 
Returns the minimum number of epochs to train. 
 
void UpdateGoldAndCandidateFeatureAverages(const Time &time, const Collection &gold_feature_uids, const Collection &candidate_feature_uids)
Updates the feature averages for the specified pair of feature uid collections, one for a gold reference ...
 
unordered_set< K > & GetNonZeroFeatures(unordered_set< K > &set) const 
Inserts the uid's of features with non-zero weights into the specified set. 
 
int best_model_epoch_
The epoch of the best models seen so far during training. 
 
void UpdateWeights(const Time &time, const Collection &feature_uids, const FeatureVector< int, double > &feature_vector, double scalar)
Increments the weights for the specified collection of features. 
 
virtual const_iterator begin()=0
 
int num_updates_
The number of times an update was performed on this model during training. 
 
int max_epochs_in_decline_
The maximum number of training epochs to keep training after the model starts to degrade (i...
 
virtual void SetIndex(const string &symbol, int index)=0
 
virtual bool NeedToKeepTraining()
Returns whether more training epochs are required for this model. 
 
Class to hold a single training instance for a reranker, which is a set of examples, typically the n-best output of some input process, possibly including a gold-standard feature vector. 
 
virtual CandidateSet & Next()=0
Returns the next CandidateSet. 
 
virtual bool NeedToUpdate(CandidateSet &example)
Indicates whether the current model needs to be updated; the implementation here simply returns true ...
 
shared_ptr< Updater > updater_
The updater for this model. 
 
Hook * end_of_epoch_hook_
A hook to be performed at the end of every epoch. 
 
A container for all the member initializers for a particular Factory-constructible instance...
 
Provides the reranker::Time class, which holds the three notions of training time: current epoch...
 
size_t gold_index() const 
 
const FeatureVector< int, double > & features() const 
Returns the feature vector for this candidate.