uisrnn
The module for Unbounded Interleaved-State Recurrent Neural Network.
An introduction is available at [README.md](https://github.com/google/uis-rnn/blob/master/README.md).
1# Copyright 2018 Google LLC 2# 3# Licensed under the Apache License, Version 2.0 (the "License"); 4# you may not use this file except in compliance with the License. 5# You may obtain a copy of the License at 6# 7# https://www.apache.org/licenses/LICENSE-2.0 8# 9# Unless required by applicable law or agreed to in writing, software 10# distributed under the License is distributed on an "AS IS" BASIS, 11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12# See the License for the specific language governing permissions and 13# limitations under the License. 14"""The module for Unbounded Interleaved-State Recurrent Neural Network. 15 16An introduction is available at [README.md]. 17 18[README.md]: https://github.com/google/uis-rnn/blob/master/README.md 19""" 20 21from . import arguments 22from . import evals 23from . import loss_func 24from . import uisrnn 25from . import utils 26 27#pylint: disable=C0103 28parse_arguments = arguments.parse_arguments 29compute_sequence_match_accuracy = evals.compute_sequence_match_accuracy 30output_result = utils.output_result 31UISRNN = uisrnn.UISRNN 32parallel_predict = uisrnn.parallel_predict
def parse_arguments(argv=None):
  """Parse arguments into model, training and inference groups.

  Three independent parsers are built (one per group). A combined parser is
  run first so that unknown flags are rejected and `--help` lists every
  option; each group is then extracted with `parse_known_args()`, which
  ignores the flags that belong to the other groups.

  Args:
    argv: Optional list of argument strings to parse. When None (the default)
      argparse reads `sys.argv[1:]`, which is what this function always did
      before the parameter existed.

  Returns:
    A tuple of:

    - `model_args`: model arguments
    - `training_args`: training arguments
    - `inference_args`: inference arguments
  """
  # model configurations
  model_parser = argparse.ArgumentParser(
      description='Model configurations.', add_help=False)

  model_parser.add_argument(
      '--observation_dim',
      default=_DEFAULT_OBSERVATION_DIM,
      type=int,
      help='The dimension of the embeddings (e.g. d-vectors).')

  model_parser.add_argument(
      '--rnn_hidden_size',
      default=512,
      type=int,
      help='The number of nodes for each RNN layer.')
  model_parser.add_argument(
      '--rnn_depth',
      default=1,
      type=int,
      help='The number of RNN layers.')
  model_parser.add_argument(
      '--rnn_dropout',
      default=0.2,
      type=float,
      help='The dropout rate for all RNN layers.')
  model_parser.add_argument(
      '--transition_bias',
      default=None,
      type=float,
      help='The value of p0, corresponding to Eq. (6) in the '
      'paper. If the value is given, we will fix to this value. If the '
      'value is None, we will estimate it from training data '
      'using Eq. (13) in the paper.')
  model_parser.add_argument(
      '--crp_alpha',
      default=1.0,
      type=float,
      help='The value of alpha for the Chinese restaurant process (CRP), '
      'corresponding to Eq. (7) in the paper. In this open source '
      'implementation, currently we only support using a given value '
      'of crp_alpha.')
  model_parser.add_argument(
      '--sigma2',
      default=None,
      type=float,
      help='The value of sigma squared, corresponding to Eq. (11) in the '
      'paper. If the value is given, we will fix to this value. If the '
      'value is None, we will estimate it from training data.')
  model_parser.add_argument(
      '--verbosity',
      default=2,
      type=int,
      help='How verbose will the logging information be. Higher value '
      'represents more verbose information. A general guideline: '
      '0 for errors; 1 for finishing important steps; '
      '2 for finishing less important steps; 3 or above for debugging '
      'information.')
  model_parser.add_argument(
      '--enable_cuda',
      default=True,
      type=str2bool,
      help='Whether we should use CUDA if it is available. If False, we will '
      'always use CPU.')

  # training configurations
  training_parser = argparse.ArgumentParser(
      description='Training configurations.', add_help=False)

  training_parser.add_argument(
      '--optimizer',
      '-o',
      default='adam',
      choices=['adam'],
      help='The optimizer for training.')
  training_parser.add_argument(
      '--learning_rate',
      '-l',
      default=1e-3,
      type=float,
      help='The learning rate for training.')
  training_parser.add_argument(
      '--train_iteration',
      '-t',
      default=20000,
      type=int,
      help='The total number of training iterations.')
  training_parser.add_argument(
      '--batch_size',
      '-b',
      default=10,
      type=int,
      help='The batch size for training.')
  training_parser.add_argument(
      '--num_permutations',
      default=10,
      type=int,
      help='The number of permutations per utterance sampled in the training '
      'data.')
  training_parser.add_argument(
      '--sigma_alpha',
      default=1.0,
      type=float,
      help='The inverse gamma shape for estimating sigma2. This value is only '
      'meaningful when sigma2 is not given, and estimated from data.')
  training_parser.add_argument(
      '--sigma_beta',
      default=1.0,
      type=float,
      help='The inverse gamma scale for estimating sigma2. This value is only '
      'meaningful when sigma2 is not given, and estimated from data.')
  training_parser.add_argument(
      '--regularization_weight',
      '-r',
      default=1e-5,
      type=float,
      help='The network regularization multiplicative.')
  training_parser.add_argument(
      '--grad_max_norm',
      default=5.0,
      type=float,
      help='Max norm of the gradient.')
  training_parser.add_argument(
      '--enforce_cluster_id_uniqueness',
      default=True,
      type=str2bool,
      help='Whether to enforce cluster ID uniqueness across different '
      'training sequences. Only effective when the first input to fit() '
      'is a list of sequences. In general, assume the cluster IDs for two '
      'sequences are [a, b] and [a, c]. If the `a` from the two sequences '
      'are not the same label, then this arg should be True.')

  # inference configurations
  inference_parser = argparse.ArgumentParser(
      description='Inference configurations.', add_help=False)

  inference_parser.add_argument(
      '--beam_size',
      '-s',
      default=10,
      type=int,
      help='The beam search size for inference.')
  inference_parser.add_argument(
      '--look_ahead',
      default=1,
      type=int,
      help='The number of look ahead steps during inference.')
  inference_parser.add_argument(
      '--test_iteration',
      default=2,
      type=int,
      help='During inference, we concatenate M duplicates of the test '
      'sequence, and run inference on this concatenated sequence. '
      'Then we return the inference results on the last duplicate as the '
      'final prediction for the test sequence.')

  # a super parser for sanity checks: it knows every flag, so it is the one
  # that reports unknown options and serves `--help`; its result is discarded
  super_parser = argparse.ArgumentParser(
      parents=[model_parser, training_parser, inference_parser])

  # get arguments
  super_parser.parse_args(argv)
  model_args, _ = model_parser.parse_known_args(argv)
  training_args, _ = training_parser.parse_known_args(argv)
  inference_args, _ = inference_parser.parse_known_args(argv)

  return (model_args, training_args, inference_args)
Parse arguments.
Returns: A tuple of:
- `model_args`: model arguments
- `training_args`: training arguments
- `inference_args`: inference arguments
def compute_sequence_match_accuracy(sequence1, sequence2):
  """Score how well two label sequences agree under the best label mapping.

  The labels of the two sequences do not need to use the same names: a
  co-occurrence table of (label in sequence1, label in sequence2) pairs is
  built, and the one-to-one label assignment that maximizes the number of
  matching positions is selected with `optimize.linear_sum_assignment`.

  Args:
    sequence1: A list of integers or strings.
    sequence2: A list of integers or strings.

  Returns:
    accuracy: sequence matching accuracy as a number in [0.0, 1.0]

  Raises:
    TypeError: If sequence1 or sequence2 is not a list.
    ValueError: If the sequences are empty or differ in length.
  """
  if not (isinstance(sequence1, list) and isinstance(sequence2, list)):
    raise TypeError('sequence1 and sequence2 must be lists')
  total = len(sequence1)
  if total == 0 or total != len(sequence2):
    raise ValueError(
        'sequence1 and sequence2 must have the same non-zero length')

  # distinct labels of each sequence, and the table index assigned to each
  labels1 = sorted(set(sequence1))
  labels2 = sorted(set(sequence2))
  row_of = get_list_inverse_index(labels1)
  col_of = get_list_inverse_index(labels2)

  # co_occurrence[r, c]: number of positions carrying label r and label c
  co_occurrence = np.zeros((len(labels1), len(labels2)))
  for label1, label2 in zip(sequence1, sequence2):
    co_occurrence[row_of[label1], col_of[label2]] += 1.0

  # linear_sum_assignment minimizes cost, so negate to maximize matches
  rows, cols = optimize.linear_sum_assignment(-co_occurrence)
  matched = co_occurrence[rows, cols].sum()
  return matched / total
Compute the accuracy between two sequences by finding optimal matching.
Args: sequence1: A list of integers or strings. sequence2: A list of integers or strings.
Returns: accuracy: sequence matching accuracy as a number in [0.0, 1.0]
Raises: TypeError: If sequence1 or sequence2 is not a list. ValueError: If sequence1 and sequence2 are empty or do not have the same length.
def output_result(model_args, training_args, test_record):
  """Summarize an experiment as text, append it to a file and return it.

  The report lists a few hyper-parameters, the mean accuracy and the accuracy
  of every test sequence. It is appended to
  `layer_<rnn_hidden_size>_<rnn_depth>_<rnn_dropout>_result.txt` in the
  current working directory.

  Args:
    model_args: model arguments; `crp_alpha`, `rnn_hidden_size`, `rnn_depth`
      and `rnn_dropout` are read.
    training_args: training arguments; `sigma_alpha`, `sigma_beta`,
      `learning_rate`, `regularization_weight` and `batch_size` are read.
    test_record: a non-empty iterable of pairs whose first element is the
      accuracy of one test sequence; the second element is ignored.

  Returns:
    The report that was appended to the result file.
  """
  accuracies, _ = zip(*test_record)
  mean_accuracy = np.mean(accuracies)

  # NOTE(review): the reviewed copy of this function had its whitespace
  # collapsed, so the indentation inside these report strings is reconstructed
  # - confirm it against the original file before merging.
  template = '\n'.join([
      'Config:',
      '  sigma_alpha: {}',
      '  sigma_beta: {}',
      '  crp_alpha: {}',
      '  learning rate: {}',
      '  regularization: {}',
      '  batch size: {}',
      '',
      'Performance:',
      '  averaged accuracy: {:.6f}',
      '  accuracy numbers for all testing sequences:',
  ])
  header = template.format(
      training_args.sigma_alpha,
      training_args.sigma_beta,
      model_args.crp_alpha,
      training_args.learning_rate,
      training_args.regularization_weight,
      training_args.batch_size,
      mean_accuracy)
  per_sequence = ''.join(
      '\n {:.6f}'.format(value) for value in accuracies)
  footer = '\n' + '=' * 80 + '\n'
  report = header + per_sequence + footer

  result_path = 'layer_{}_{}_{:.1f}_result.txt'.format(
      model_args.rnn_hidden_size,
      model_args.rnn_depth, model_args.rnn_dropout)
  with open(result_path, 'a') as result_file:
    result_file.write(report)
  return report
Produce a string to summarize the experiment.
class UISRNN:
  """Unbounded Interleaved-State Recurrent Neural Networks.

  Bundles a `CoreRNN` with the model variables used around it (`sigma2`,
  `transition_bias`, `crp_alpha`) and exposes training (`fit`), beam-search
  inference (`predict`) and checkpointing (`save` / `load`).
  """

  def __init__(self, args):
    """Construct the UISRNN object.

    Args:
      args: Model configurations. See `arguments.py` for details.
    """
    self.observation_dim = args.observation_dim
    self.device = torch.device(
        'cuda:0' if (torch.cuda.is_available() and args.enable_cuda) else 'cpu')
    self.rnn_model = CoreRNN(self.observation_dim, args.rnn_hidden_size,
                             args.rnn_depth, self.observation_dim,
                             args.rnn_dropout).to(self.device)
    self.rnn_init_hidden = nn.Parameter(
        torch.zeros(args.rnn_depth, 1, args.rnn_hidden_size).to(self.device))
    # booleans indicating which variables are trainable
    self.estimate_sigma2 = (args.sigma2 is None)
    self.estimate_transition_bias = (args.transition_bias is None)
    # initial values of variables
    sigma2 = _INITIAL_SIGMA2_VALUE if self.estimate_sigma2 else args.sigma2
    self.sigma2 = nn.Parameter(
        sigma2 * torch.ones(self.observation_dim).to(self.device))
    self.transition_bias = args.transition_bias
    # weight of the current transition_bias estimate; used by fit() to merge
    # it with the estimate obtained from a later call
    self.transition_bias_denominator = 0.0
    self.crp_alpha = args.crp_alpha
    self.logger = utils.Logger(args.verbosity)

  def _get_optimizer(self, optimizer, learning_rate):
    """Get optimizer for UISRNN.

    Args:
      optimizer: string - name of the optimizer.
      learning_rate: - learning rate for the entire model.
        We do not customize learning rate for separate parts.

    Returns:
      a pytorch "optim" object
    """
    params = [
        {
            'params': self.rnn_model.parameters()
        },  # rnn parameters
        {
            'params': self.rnn_init_hidden
        }  # rnn initial hidden state
    ]
    if self.estimate_sigma2:  # train sigma2
      params.append({
          'params': self.sigma2
      })  # variance parameters
    assert optimizer == 'adam', 'Only adam optimizer is supported.'
    return optim.Adam(params, lr=learning_rate)

  def save(self, filepath):
    """Save the model to a file.

    The RNN weights are stored as a state dict; `rnn_init_hidden` and `sigma2`
    are stored as numpy arrays; the remaining entries are stored as they are.

    Args:
      filepath: the path of the file.
    """
    torch.save({
        'rnn_state_dict': self.rnn_model.state_dict(),
        'rnn_init_hidden': self.rnn_init_hidden.detach().cpu().numpy(),
        'transition_bias': self.transition_bias,
        'transition_bias_denominator': self.transition_bias_denominator,
        'crp_alpha': self.crp_alpha,
        'sigma2': self.sigma2.detach().cpu().numpy()}, filepath)

  def load(self, filepath):
    """Load the model from a file written by `save()`.

    Only load files from a trusted source: the checkpoint holds numpy arrays
    and is therefore read with full unpickling, which can run arbitrary code.

    Args:
      filepath: the path of the file.
    """
    try:
      # map_location: a checkpoint saved on GPU must load on a CPU-only host.
      # weights_only=False: save() stores numpy arrays, which the restricted
      # unpickler (the default since PyTorch 2.6) rejects.
      var_dict = torch.load(
          filepath, map_location=self.device, weights_only=False)
    except TypeError:
      # PyTorch versions that predate the `weights_only` argument
      var_dict = torch.load(filepath, map_location=self.device)
    self.rnn_model.load_state_dict(var_dict['rnn_state_dict'])
    self.rnn_init_hidden = nn.Parameter(
        torch.from_numpy(var_dict['rnn_init_hidden']).to(self.device))
    # transition_bias is None in a checkpoint of a model that was saved
    # before the bias was given or estimated
    transition_bias = var_dict['transition_bias']
    self.transition_bias = (
        None if transition_bias is None else float(transition_bias))
    self.transition_bias_denominator = float(
        var_dict['transition_bias_denominator'])
    self.crp_alpha = float(var_dict['crp_alpha'])
    self.sigma2 = nn.Parameter(
        torch.from_numpy(var_dict['sigma2']).to(self.device))

    self.logger.print(
        3, 'Loaded model with transition_bias={}, crp_alpha={}, sigma2={}, '
        'rnn_init_hidden={}'.format(
            self.transition_bias, self.crp_alpha, var_dict['sigma2'],
            var_dict['rnn_init_hidden']))

  def fit_concatenated(self, train_sequence, train_cluster_id, args):
    """Fit UISRNN model to concatenated sequence and cluster_id.

    Args:
      train_sequence: the training observation sequence, which is a
        2-dim numpy array of real numbers, of size `N * D`.

        - `N`: summation of lengths of all utterances.
        - `D`: observation dimension.

        For example,
        ```
        train_sequence =
        [[1.2 3.0 -4.1 6.0]    --> an entry of speaker #0 from utterance 'iaaa'
         [0.8 -1.1 0.4 0.5]    --> an entry of speaker #1 from utterance 'iaaa'
         [-0.2 1.0 3.8 5.7]    --> an entry of speaker #0 from utterance 'iaaa'
         [3.8 -0.1 1.5 2.3]    --> an entry of speaker #0 from utterance 'ibbb'
         [1.2 1.4 3.6 -2.7]]   --> an entry of speaker #0 from utterance 'ibbb'
        ```
        Here `N=5`, `D=4`.

        We concatenate all training utterances into this single sequence.
      train_cluster_id: the speaker id sequence, which is 1-dim list or
        numpy array of strings, of size `N`.
        For example,
        ```
        train_cluster_id =
          ['iaaa_0', 'iaaa_1', 'iaaa_0', 'ibbb_0', 'ibbb_0']
        ```
        'iaaa_0' means the entry belongs to speaker #0 in utterance 'iaaa'.

        Note that the order of entries within an utterance are preserved,
        and all utterances are simply concatenated together.
      args: Training configurations. See `arguments.py` for details.
        If `args.batch_size` is None, every iteration trains on the entire
        dataset instead of a sampled batch.

    Raises:
      TypeError: If train_sequence or train_cluster_id is of wrong type.
      ValueError: If train_sequence or train_cluster_id has wrong dimension.
    """
    # check type
    if (not isinstance(train_sequence, np.ndarray) or
        train_sequence.dtype != float):
      raise TypeError('train_sequence should be a numpy array of float type.')
    if isinstance(train_cluster_id, list):
      train_cluster_id = np.array(train_cluster_id)
    if (not isinstance(train_cluster_id, np.ndarray) or
        not train_cluster_id.dtype.name.startswith(('str', 'unicode'))):
      raise TypeError(
          'train_cluster_id should be a numpy array of strings.')
    # check dimension
    if train_sequence.ndim != 2:
      raise ValueError('train_sequence must be 2-dim array.')
    if train_cluster_id.ndim != 1:
      raise ValueError('train_cluster_id must be 1-dim array.')
    # check length and size
    train_total_length, observation_dim = train_sequence.shape
    if observation_dim != self.observation_dim:
      raise ValueError('train_sequence does not match the dimension specified '
                       'by args.observation_dim.')
    if train_total_length != len(train_cluster_id):
      raise ValueError('train_sequence length is not equal to '
                       'train_cluster_id length.')

    self.rnn_model.train()
    optimizer = self._get_optimizer(optimizer=args.optimizer,
                                    learning_rate=args.learning_rate)

    sub_sequences, seq_lengths = utils.resize_sequence(
        sequence=train_sequence,
        cluster_id=train_cluster_id,
        num_permutations=args.num_permutations)

    # For batch learning, pack the entire dataset.
    if args.batch_size is None:
      packed_train_sequence, rnn_truth = utils.pack_sequence(
          sub_sequences,
          seq_lengths,
          args.batch_size,
          self.observation_dim,
          self.device)
    for num_iter in range(args.train_iteration):
      optimizer.zero_grad()
      # For online learning, pack a subset in each iteration.
      if args.batch_size is not None:
        packed_train_sequence, rnn_truth = utils.pack_sequence(
            sub_sequences,
            seq_lengths,
            args.batch_size,
            self.observation_dim,
            self.device)
      # In batch-learning mode args.batch_size is None, so the number of
      # packed sequences is read from the ground truth instead: rnn_truth is
      # multiplied element-wise with mean[:-1] below, whose dim 1 is the batch.
      batch_size = (
          rnn_truth.size(1) if args.batch_size is None else args.batch_size)
      hidden = self.rnn_init_hidden.repeat(1, batch_size, 1)
      mean, _ = self.rnn_model(packed_train_sequence, hidden)
      # use mean to predict: entry t becomes the average of outputs 0..t,
      # i.e. the cumulative sum along dim 0 divided by (t + 1)
      mean = torch.cumsum(mean, dim=0)
      mean_size = mean.size()
      mean = torch.mm(
          torch.diag(
              1.0 / torch.arange(1, mean_size[0] + 1).float().to(self.device)),
          mean.view(mean_size[0], -1))
      mean = mean.view(mean_size)

      # Likelihood part. Entries where rnn_truth is zero are masked out of
      # the prediction as well, so they do not contribute to the loss.
      loss1 = loss_func.weighted_mse_loss(
          input_tensor=(rnn_truth != 0).float() * mean[:-1, :, :],
          target_tensor=rnn_truth,
          weight=1 / (2 * self.sigma2))

      # Sigma2 prior part.
      weight = (((rnn_truth != 0).float() * mean[:-1, :, :] - rnn_truth)
                ** 2).view(-1, observation_dim)
      num_non_zero = torch.sum((weight != 0).float(), dim=0).squeeze()
      loss2 = loss_func.sigma2_prior_loss(
          num_non_zero, args.sigma_alpha, args.sigma_beta, self.sigma2)

      # Regularization part.
      loss3 = loss_func.regularization_loss(
          self.rnn_model.parameters(), args.regularization_weight)

      loss = loss1 + loss2 + loss3
      loss.backward()
      nn.utils.clip_grad_norm_(self.rnn_model.parameters(), args.grad_max_norm)
      optimizer.step()
      # avoid numerical issues
      self.sigma2.data.clamp_(min=1e-6)

      if (np.remainder(num_iter, 10) == 0 or
          num_iter == args.train_iteration - 1):
        # NOTE(review): the reviewed copy of this file had its whitespace
        # collapsed; confirm the padding inside this log message.
        self.logger.print(
            2,
            'Iter: {:d} \t'
            'Training Loss: {:.4f} \n'
            ' Negative Log Likelihood: {:.4f}\t'
            'Sigma2 Prior: {:.4f}\t'
            'Regularization: {:.4f}'.format(
                num_iter,
                float(loss.data),
                float(loss1.data),
                float(loss2.data),
                float(loss3.data)))
    self.logger.print(
        1, 'Done training with {} iterations'.format(args.train_iteration))

  def fit(self, train_sequences, train_cluster_ids, args):
    """Fit UISRNN model.

    Args:
      train_sequences: Either a list of training sequences, or a single
        concatenated training sequence:

        1. train_sequences is list, and each element is a 2-dim numpy array
           of real numbers, of size: `length * D`.
           The length varies among different sequences, but the D is the same.
           In speaker diarization, each sequence is the sequence of speaker
           embeddings of one utterance.
        2. train_sequences is a single concatenated sequence, which is a
           2-dim numpy array of real numbers. See `fit_concatenated()`
           for more details.
      train_cluster_ids: Ground truth labels for train_sequences:

        1. if train_sequences is a list, this must also be a list of the same
           size, each element being a 1-dim list or numpy array of strings.
        2. if train_sequences is a single concatenated sequence, this
           must also be the concatenated 1-dim list or numpy array of strings
      args: Training configurations. See `arguments.py` for details.

    Raises:
      TypeError: If train_sequences or train_cluster_ids is of wrong type.
    """
    if isinstance(train_sequences, np.ndarray):
      # train_sequences is already the concatenated sequence
      if self.estimate_transition_bias:
        # see issue #55: https://github.com/google/uis-rnn/issues/55
        self.logger.print(
            2,
            'Warning: transition_bias cannot be correctly estimated from a '
            'concatenated sequence; train_sequences will be treated as a '
            'single sequence. This can lead to inaccurate estimation of '
            'transition_bias. Please, consider estimating transition_bias '
            'before concatenating the sequences and passing it as argument.')
      train_sequences = [train_sequences]
      train_cluster_ids = [train_cluster_ids]
    elif not isinstance(train_sequences, list):
      raise TypeError('train_sequences must be a list or numpy.ndarray')
    # otherwise train_sequences is a list of un-concatenated sequences; it is
    # concatenated below, after estimating transition_bias

    # estimate transition_bias
    if self.estimate_transition_bias:
      (transition_bias,
       transition_bias_denominator) = utils.estimate_transition_bias(
           train_cluster_ids)
      # set or update transition_bias
      if self.transition_bias is None:
        self.transition_bias = transition_bias
        self.transition_bias_denominator = transition_bias_denominator
      else:
        # merge with the estimate of earlier fit() calls: average of the two
        # values, each weighted by its own denominator
        self.transition_bias = (
            self.transition_bias * self.transition_bias_denominator +
            transition_bias * transition_bias_denominator) / (
                self.transition_bias_denominator + transition_bias_denominator)
        self.transition_bias_denominator += transition_bias_denominator

    # concatenate train_sequences
    (concatenated_train_sequence,
     concatenated_train_cluster_id) = utils.concatenate_training_data(
         train_sequences,
         train_cluster_ids,
         args.enforce_cluster_id_uniqueness,
         True)

    self.fit_concatenated(
        concatenated_train_sequence, concatenated_train_cluster_id, args)

  def _update_beam_state(self, beam_state, look_ahead_seq, cluster_seq):
    """Update a beam state given a look ahead sequence and known cluster
    assignments.

    Args:
      beam_state: A BeamState object.
      look_ahead_seq: Look ahead sequence, size: look_ahead*D.

        - look_ahead: number of step to look ahead in the beam search.
        - D: observation dimension
      cluster_seq: Cluster assignment sequence for look_ahead_seq.

    Returns:
      new_beam_state: An updated BeamState object. Its `neg_likelihood` is
        infinite when cluster_seq is not a valid assignment.
    """

    loss = 0
    new_beam_state = BeamState(beam_state)
    for sub_idx, cluster in enumerate(cluster_seq):
      if cluster > len(new_beam_state.mean_set):  # invalid trace
        # a new cluster must take the next unused index; skipping one is
        # not a valid assignment
        new_beam_state.neg_likelihood = float('inf')
        break
      elif cluster < len(new_beam_state.mean_set):  # existing cluster
        last_cluster = new_beam_state.trace[-1]
        loss = loss_func.weighted_mse_loss(
            input_tensor=torch.squeeze(new_beam_state.mean_set[cluster]),
            target_tensor=look_ahead_seq[sub_idx, :],
            weight=1 / (2 * self.sigma2)).cpu().detach().numpy()
        if cluster == last_cluster:
          loss -= np.log(1 - self.transition_bias)
        else:
          loss -= np.log(self.transition_bias) + np.log(
              new_beam_state.block_counts[cluster]) - np.log(
                  sum(new_beam_state.block_counts) + self.crp_alpha)
        # update new mean and new hidden
        mean, hidden = self.rnn_model(
            look_ahead_seq[sub_idx, :].unsqueeze(0).unsqueeze(0),
            new_beam_state.hidden_set[cluster])
        # occurrences of this cluster in the trace so far (the current step
        # has not been appended yet)
        cluster_count = (
            np.array(new_beam_state.trace) == cluster).sum().astype(float)
        new_beam_state.mean_set[cluster] = (
            new_beam_state.mean_set[cluster] * (cluster_count - 1) +
            mean.clone()) / cluster_count  # use mean to predict
        new_beam_state.hidden_set[cluster] = hidden.clone()
        if cluster != last_cluster:
          new_beam_state.block_counts[cluster] += 1
        new_beam_state.trace.append(cluster)
      else:  # new cluster
        init_input = torch.zeros(1, 1, self.observation_dim).to(self.device)
        mean, hidden = self.rnn_model(init_input,
                                      self.rnn_init_hidden)
        loss = loss_func.weighted_mse_loss(
            input_tensor=torch.squeeze(mean),
            target_tensor=look_ahead_seq[sub_idx, :],
            weight=1 / (2 * self.sigma2)).cpu().detach().numpy()
        loss -= np.log(self.transition_bias) + np.log(
            self.crp_alpha) - np.log(
                sum(new_beam_state.block_counts) + self.crp_alpha)
        # update new mean and new hidden
        mean, hidden = self.rnn_model(
            look_ahead_seq[sub_idx, :].unsqueeze(0).unsqueeze(0),
            hidden)
        new_beam_state.append(mean, hidden, cluster)
      new_beam_state.neg_likelihood += loss
    return new_beam_state

  def _calculate_score(self, beam_state, look_ahead_seq):
    """Calculate negative log likelihoods for all possible state allocations
    of a look ahead sequence, according to the current beam state.

    Args:
      beam_state: A BeamState object.
      look_ahead_seq: Look ahead sequence, size: look_ahead*D.

        - look_ahead: number of step to look ahead in the beam search.
        - D: observation dimension

    Returns:
      beam_score_set: a set of scores for each possible state allocation.
    """

    look_ahead, _ = look_ahead_seq.shape
    beam_num_clusters = len(beam_state.mean_set)
    # one axis per look-ahead step; step i may pick any of the existing
    # clusters or one of the (i + 1) clusters that may have been created so far
    beam_score_set = float('inf') * np.ones(
        beam_num_clusters + 1 + np.arange(look_ahead))
    for cluster_seq, _ in np.ndenumerate(beam_score_set):
      updated_beam_state = self._update_beam_state(beam_state,
                                                   look_ahead_seq, cluster_seq)
      beam_score_set[cluster_seq] = updated_beam_state.neg_likelihood
    return beam_score_set

  def predict_single(self, test_sequence, args):
    """Predict labels for a single test sequence using UISRNN model.

    Args:
      test_sequence: the test observation sequence, which is 2-dim numpy array
        of real numbers, of size `N * D`.

        - `N`: length of one test utterance.
        - `D` : observation dimension.

        For example:
        ```
        test_sequence =
        [[2.2 -1.0 3.0 5.6]    --> 1st entry of utterance 'iccc'
         [0.5 1.8 -3.2 0.4]    --> 2nd entry of utterance 'iccc'
         [-2.2 5.0 1.8 3.7]    --> 3rd entry of utterance 'iccc'
         [-3.8 0.1 1.4 3.3]    --> 4th entry of utterance 'iccc'
         [0.1 2.7 3.5 -1.7]]   --> 5th entry of utterance 'iccc'
        ```
        Here `N=5`, `D=4`.
      args: Inference configurations. See `arguments.py` for details.

    Returns:
      predicted_cluster_id: predicted speaker id sequence, which is
        an array of integers, of size `N`.
        For example, `predicted_cluster_id = [0, 1, 0, 0, 1]`

    Raises:
      TypeError: If test_sequence is of wrong type.
      ValueError: If test_sequence has wrong dimension.
    """
    # check type
    if (not isinstance(test_sequence, np.ndarray) or
        test_sequence.dtype != float):
      raise TypeError('test_sequence should be a numpy array of float type.')
    # check dimension
    if test_sequence.ndim != 2:
      raise ValueError('test_sequence must be 2-dim array.')
    # check size
    test_sequence_length, observation_dim = test_sequence.shape
    if observation_dim != self.observation_dim:
      raise ValueError('test_sequence does not match the dimension specified '
                       'by args.observation_dim.')

    self.rnn_model.eval()
    test_sequence = np.tile(test_sequence, (args.test_iteration, 1))
    test_sequence = torch.from_numpy(test_sequence).float().to(self.device)
    # bookkeeping for beam search
    beam_set = [BeamState()]
    # inference never back-propagates, so do not record autograd graphs for
    # the means and hidden states kept in the beam states
    with torch.no_grad():
      for num_iter in np.arange(0, args.test_iteration * test_sequence_length,
                                args.look_ahead):
        max_clusters = max(
            [len(beam_state.mean_set) for beam_state in beam_set])
        look_ahead_seq = test_sequence[num_iter: num_iter + args.look_ahead, :]
        look_ahead_seq_length = look_ahead_seq.shape[0]
        # axis 0: beam rank; remaining axes: cluster choice per look-ahead step
        score_set = float('inf') * np.ones(
            np.append(
                args.beam_size, max_clusters + 1 + np.arange(
                    look_ahead_seq_length)))
        for beam_rank, beam_state in enumerate(beam_set):
          beam_score_set = self._calculate_score(beam_state, look_ahead_seq)
          # beams with fewer clusters have a smaller score table; pad it with
          # inf so that every beam fills the same shape
          score_set[beam_rank, :] = np.pad(
              beam_score_set,
              np.tile([[0, max_clusters - len(beam_state.mean_set)]],
                      (look_ahead_seq_length, 1)), 'constant',
              constant_values=float('inf'))
        # find top scores; candidates with an infinite score are excluded
        num_candidates = int(np.count_nonzero(score_set != float('inf')))
        idx_ranked = np.argsort(score_set, axis=None)
        updated_beam_set = []
        for new_beam_rank in range(min(num_candidates, args.beam_size)):
          total_idx = np.unravel_index(idx_ranked[new_beam_rank],
                                       score_set.shape)
          prev_beam_rank = total_idx[0].item()
          cluster_seq = total_idx[1:]
          updated_beam_state = self._update_beam_state(
              beam_set[prev_beam_rank], look_ahead_seq, cluster_seq)
          updated_beam_set.append(updated_beam_state)
        beam_set = updated_beam_set
    # keep the labels of the last duplicate of the test sequence only
    predicted_cluster_id = beam_set[0].trace[-test_sequence_length:]
    return predicted_cluster_id

  def predict(self, test_sequences, args):
    """Predict labels for a single or many test sequences using UISRNN model.

    Args:
      test_sequences: Either a list of test sequences, or a single test
        sequence. Each test sequence is a 2-dim numpy array
        of real numbers. See `predict_single()` for details.
      args: Inference configurations. See `arguments.py` for details.

    Returns:
      predicted_cluster_ids: Predicted labels for test_sequences.

        1. if test_sequences is a list, predicted_cluster_ids will be a list
           of the same size, where each element being a 1-dim list of
           integers.
        2. if test_sequences is a single sequence, predicted_cluster_ids will
           be a 1-dim list of integers

    Raises:
      TypeError: If test_sequences is of wrong type.
    """
    # check type
    if isinstance(test_sequences, np.ndarray):
      return self.predict_single(test_sequences, args)
    if isinstance(test_sequences, list):
      return [self.predict_single(test_sequence, args)
              for test_sequence in test_sequences]
    raise TypeError('test_sequences should be either a list or numpy array.')
Unbounded Interleaved-State Recurrent Neural Networks.
def __init__(self, args):
  """Build the model and its variables from the model configuration.

  Args:
    args: Model configurations. See `arguments.py` for details.
  """
  self.observation_dim = args.observation_dim

  # run on the first GPU only when CUDA is both available and enabled
  if torch.cuda.is_available() and args.enable_cuda:
    device_name = 'cuda:0'
  else:
    device_name = 'cpu'
  self.device = torch.device(device_name)

  # the network maps observations to predictions of the same dimension
  core_rnn = CoreRNN(
      self.observation_dim,
      args.rnn_hidden_size,
      args.rnn_depth,
      self.observation_dim,
      args.rnn_dropout)
  self.rnn_model = core_rnn.to(self.device)
  zero_hidden = torch.zeros(args.rnn_depth, 1, args.rnn_hidden_size)
  self.rnn_init_hidden = nn.Parameter(zero_hidden.to(self.device))

  # a variable is estimated from training data exactly when no value for it
  # is given in the configuration
  self.estimate_sigma2 = args.sigma2 is None
  self.estimate_transition_bias = args.transition_bias is None

  # initial values of variables
  if self.estimate_sigma2:
    initial_sigma2 = _INITIAL_SIGMA2_VALUE
  else:
    initial_sigma2 = args.sigma2
  ones = torch.ones(self.observation_dim).to(self.device)
  self.sigma2 = nn.Parameter(initial_sigma2 * ones)
  self.transition_bias = args.transition_bias
  self.transition_bias_denominator = 0.0
  self.crp_alpha = args.crp_alpha
  self.logger = utils.Logger(args.verbosity)
Construct the UISRNN object.
Args:
args: Model configurations. See arguments.py
for details.
def save(self, filepath):
  """Write the model to a checkpoint file.

  The checkpoint is a dictionary holding the RNN state dict, the initial
  hidden state and sigma2 (both converted to numpy arrays), and the values of
  transition_bias, transition_bias_denominator and crp_alpha.

  Args:
    filepath: the path of the file.
  """
  # parameters are detached and moved to CPU before the numpy conversion
  init_hidden_array = self.rnn_init_hidden.detach().cpu().numpy()
  sigma2_array = self.sigma2.detach().cpu().numpy()
  checkpoint = {
      'rnn_state_dict': self.rnn_model.state_dict(),
      'rnn_init_hidden': init_hidden_array,
      'transition_bias': self.transition_bias,
      'transition_bias_denominator': self.transition_bias_denominator,
      'crp_alpha': self.crp_alpha,
      'sigma2': sigma2_array,
  }
  torch.save(checkpoint, filepath)
Save the model to a file.
Args: filepath: the path of the file.
def load(self, filepath):
  """Load the model from a file written by `save()`.

  Only load files from a trusted source: the checkpoint holds numpy arrays
  and is therefore read with full unpickling, which can run arbitrary code.

  Args:
    filepath: the path of the file.
  """
  try:
    # map_location: a checkpoint saved on GPU must load on a CPU-only host.
    # weights_only=False: the checkpoint stores numpy arrays, which the
    # restricted unpickler (the default since PyTorch 2.6) rejects.
    var_dict = torch.load(
        filepath, map_location=self.device, weights_only=False)
  except TypeError:
    # PyTorch versions that predate the `weights_only` argument
    var_dict = torch.load(filepath, map_location=self.device)
  self.rnn_model.load_state_dict(var_dict['rnn_state_dict'])
  self.rnn_init_hidden = nn.Parameter(
      torch.from_numpy(var_dict['rnn_init_hidden']).to(self.device))
  # transition_bias is None in a checkpoint of a model that was saved before
  # the bias was given or estimated; float(None) would raise
  transition_bias = var_dict['transition_bias']
  self.transition_bias = (
      None if transition_bias is None else float(transition_bias))
  self.transition_bias_denominator = float(
      var_dict['transition_bias_denominator'])
  self.crp_alpha = float(var_dict['crp_alpha'])
  self.sigma2 = nn.Parameter(
      torch.from_numpy(var_dict['sigma2']).to(self.device))

  self.logger.print(
      3, 'Loaded model with transition_bias={}, crp_alpha={}, sigma2={}, '
      'rnn_init_hidden={}'.format(
          self.transition_bias, self.crp_alpha, var_dict['sigma2'],
          var_dict['rnn_init_hidden']))
Load the model from a file.
Args: filepath: the path of the file.
def fit_concatenated(self, train_sequence, train_cluster_id, args):
    """Fit UISRNN model to concatenated sequence and cluster_id.

    Args:
        train_sequence: the training observation sequence, which is a
            2-dim numpy array of real numbers, of size `N * D`.

            - `N`: summation of lengths of all utterances.
            - `D`: observation dimension.

            For example,
            ```
            train_sequence =
            [[1.2 3.0 -4.1 6.0]    --> an entry of speaker #0 from utterance 'iaaa'
             [0.8 -1.1 0.4 0.5]    --> an entry of speaker #1 from utterance 'iaaa'
             [-0.2 1.0 3.8 5.7]    --> an entry of speaker #0 from utterance 'iaaa'
             [3.8 -0.1 1.5 2.3]    --> an entry of speaker #0 from utterance 'ibbb'
             [1.2 1.4 3.6 -2.7]]   --> an entry of speaker #0 from utterance 'ibbb'
            ```
            Here `N=5`, `D=4`.

            We concatenate all training utterances into this single sequence.
        train_cluster_id: the speaker id sequence, which is 1-dim list or
            numpy array of strings, of size `N`.
            For example,
            ```
            train_cluster_id =
              ['iaaa_0', 'iaaa_1', 'iaaa_0', 'ibbb_0', 'ibbb_0']
            ```
            'iaaa_0' means the entry belongs to speaker #0 in utterance 'iaaa'.

            Note that the order of entries within an utterance is preserved,
            and all utterances are simply concatenated together.
        args: Training configurations. See `arguments.py` for details.
            When `args.batch_size` is None the whole dataset is packed once
            and reused in every iteration (batch learning); otherwise a
            fresh batch is packed in each iteration (online learning).

    Raises:
        TypeError: If train_sequence or train_cluster_id is of wrong type.
        ValueError: If train_sequence or train_cluster_id has wrong dimension.
    """
    # check type
    if (not isinstance(train_sequence, np.ndarray) or
            train_sequence.dtype != float):
        raise TypeError('train_sequence should be a numpy array of float type.')
    if isinstance(train_cluster_id, list):
        train_cluster_id = np.array(train_cluster_id)
    if (not isinstance(train_cluster_id, np.ndarray) or
            not train_cluster_id.dtype.name.startswith(('str', 'unicode'))):
        raise TypeError('train_cluster_id should be a numpy array of strings.')
    # check dimension
    if train_sequence.ndim != 2:
        raise ValueError('train_sequence must be 2-dim array.')
    if train_cluster_id.ndim != 1:
        raise ValueError('train_cluster_id must be 1-dim array.')
    # check length and size
    train_total_length, observation_dim = train_sequence.shape
    if observation_dim != self.observation_dim:
        raise ValueError('train_sequence does not match the dimension specified '
                         'by args.observation_dim.')
    if train_total_length != len(train_cluster_id):
        raise ValueError('train_sequence length is not equal to '
                         'train_cluster_id length.')

    self.rnn_model.train()
    optimizer = self._get_optimizer(optimizer=args.optimizer,
                                    learning_rate=args.learning_rate)

    sub_sequences, seq_lengths = utils.resize_sequence(
        sequence=train_sequence,
        cluster_id=train_cluster_id,
        num_permutations=args.num_permutations)

    # For batch learning, pack the entire dataset once up front.
    full_batch = args.batch_size is None
    if full_batch:
        packed_train_sequence, rnn_truth = utils.pack_sequence(
            sub_sequences,
            seq_lengths,
            args.batch_size,
            self.observation_dim,
            self.device)

    for num_iter in range(args.train_iteration):
        optimizer.zero_grad()
        # For online learning, pack a subset in each iteration.
        if not full_batch:
            packed_train_sequence, rnn_truth = utils.pack_sequence(
                sub_sequences,
                seq_lengths,
                args.batch_size,
                self.observation_dim,
                self.device)

        # In batch-learning mode args.batch_size is None and cannot be used
        # as a repeat count, so take the batch size from the packed data.
        # NOTE(review): assumes rnn_truth is laid out (time, batch, dim),
        # which is how it is combined with mean[:-1, :, :] below -- confirm
        # against utils.pack_sequence.
        batch_size = rnn_truth.size(1) if full_batch else args.batch_size
        hidden = self.rnn_init_hidden.repeat(1, batch_size, 1)
        mean, _ = self.rnn_model(packed_train_sequence, hidden)

        # Use the running mean of the RNN outputs along the time axis as the
        # prediction: cumulative sum scaled by 1/t. Broadcasting the scale
        # avoids materialising a (T x T) diagonal matrix.
        steps = torch.arange(1, mean.size(0) + 1).float().to(self.device)
        mean = torch.cumsum(mean, dim=0) * (1.0 / steps).view(-1, 1, 1)

        # Entries of rnn_truth that are exactly zero are masked out of the
        # prediction, so they contribute no error.
        prediction = (rnn_truth != 0).float() * mean[:-1, :, :]

        # Likelihood part.
        loss1 = loss_func.weighted_mse_loss(
            input_tensor=prediction,
            target_tensor=rnn_truth,
            weight=1 / (2 * self.sigma2))

        # Sigma2 prior part.
        weight = ((prediction - rnn_truth) ** 2).view(-1, observation_dim)
        num_non_zero = torch.sum((weight != 0).float(), dim=0).squeeze()
        loss2 = loss_func.sigma2_prior_loss(
            num_non_zero, args.sigma_alpha, args.sigma_beta, self.sigma2)

        # Regularization part.
        loss3 = loss_func.regularization_loss(
            self.rnn_model.parameters(), args.regularization_weight)

        loss = loss1 + loss2 + loss3
        loss.backward()
        nn.utils.clip_grad_norm_(self.rnn_model.parameters(), args.grad_max_norm)
        optimizer.step()
        # avoid numerical issues
        self.sigma2.data.clamp_(min=1e-6)

        # Log every 10th iteration and always the last one.
        if num_iter % 10 == 0 or num_iter == args.train_iteration - 1:
            self.logger.print(
                2,
                'Iter: {:d} \t'
                'Training Loss: {:.4f} \n'
                ' Negative Log Likelihood: {:.4f}\t'
                'Sigma2 Prior: {:.4f}\t'
                'Regularization: {:.4f}'.format(
                    num_iter,
                    float(loss.data),
                    float(loss1.data),
                    float(loss2.data),
                    float(loss3.data)))
    self.logger.print(
        1, 'Done training with {} iterations'.format(args.train_iteration))
Fit UISRNN model to concatenated sequence and cluster_id.
Args:
train_sequence: the training observation sequence, which is a
2-dim numpy array of real numbers, of size `N * D`.
- `N`: summation of lengths of all utterances.
- `D`: observation dimension.
For example,
train_sequence =
[[1.2 3.0 -4.1 6.0] --> an entry of speaker #0 from utterance 'iaaa'
[0.8 -1.1 0.4 0.5] --> an entry of speaker #1 from utterance 'iaaa'
[-0.2 1.0 3.8 5.7] --> an entry of speaker #0 from utterance 'iaaa'
[3.8 -0.1 1.5 2.3] --> an entry of speaker #0 from utterance 'ibbb'
[1.2 1.4 3.6 -2.7]] --> an entry of speaker #0 from utterance 'ibbb'
Here `N=5`, `D=4`.
We concatenate all training utterances into this single sequence.
train_cluster_id: the speaker id sequence, which is 1-dim list or
numpy array of strings, of size `N`.
For example,
train_cluster_id =
['iaaa_0', 'iaaa_1', 'iaaa_0', 'ibbb_0', 'ibbb_0']
'iaaa_0' means the entry belongs to speaker #0 in utterance 'iaaa'.
Note that the order of entries within an utterance is preserved,
and all utterances are simply concatenated together.
args: Training configurations. See `arguments.py` for details.
Raises: TypeError: If train_sequence or train_cluster_id is of wrong type. ValueError: If train_sequence or train_cluster_id has wrong dimension.
def fit(self, train_sequences, train_cluster_ids, args):
    """Fit UISRNN model.

    Args:
        train_sequences: Either a list of training sequences, or a single
            concatenated training sequence:

            1. train_sequences is list, and each element is a 2-dim numpy
               array of real numbers, of size: `length * D`.
               The length varies among different sequences, but the D is
               the same. In speaker diarization, each sequence is the
               sequence of speaker embeddings of one utterance.
            2. train_sequences is a single concatenated sequence, which is
               a 2-dim numpy array of real numbers. See
               `fit_concatenated()` for more details.
        train_cluster_ids: Ground truth labels for train_sequences:

            1. if train_sequences is a list, this must also be a list of
               the same size, each element being a 1-dim list or numpy
               array of strings.
            2. if train_sequences is a single concatenated sequence, this
               must also be the concatenated 1-dim list or numpy array of
               strings.
        args: Training configurations. See `arguments.py` for details.

    Raises:
        TypeError: If train_sequences is neither a list nor a numpy array.
    """
    if not isinstance(train_sequences, (np.ndarray, list)):
        raise TypeError('train_sequences must be a list or numpy.ndarray')

    if isinstance(train_sequences, np.ndarray):
        # Already concatenated: utterance boundaries are not available, so
        # the input is handled as one single sequence.
        if self.estimate_transition_bias:
            # see issue #55: https://github.com/google/uis-rnn/issues/55
            warning = (
                'Warning: transition_bias cannot be correctly estimated from a '
                'concatenated sequence; train_sequences will be treated as a '
                'single sequence. This can lead to inaccurate estimation of '
                'transition_bias. Please, consider estimating transition_bias '
                'before concatenating the sequences and passing it as argument.')
            self.logger.print(2, warning)
        train_sequences = [train_sequences]
        train_cluster_ids = [train_cluster_ids]
    # Otherwise it is a list of separate sequences; they are concatenated
    # below, after transition_bias has been estimated.

    if self.estimate_transition_bias:
        new_bias, new_weight = utils.estimate_transition_bias(
            train_cluster_ids)
        if self.transition_bias is None:
            # First estimate: adopt it directly.
            self.transition_bias = new_bias
            self.transition_bias_denominator = new_weight
        else:
            # Later call: combine the stored and the new estimate, each
            # weighted by its own denominator.
            old_weight = self.transition_bias_denominator
            weighted_sum = (
                self.transition_bias * old_weight + new_bias * new_weight)
            self.transition_bias = weighted_sum / (old_weight + new_weight)
            self.transition_bias_denominator += new_weight

    concatenated = utils.concatenate_training_data(
        train_sequences,
        train_cluster_ids,
        args.enforce_cluster_id_uniqueness,
        True)
    concatenated_train_sequence, concatenated_train_cluster_id = concatenated

    self.fit_concatenated(
        concatenated_train_sequence, concatenated_train_cluster_id, args)
Fit UISRNN model.
Args: train_sequences: Either a list of training sequences, or a single concatenated training sequence:
1. train_sequences is list, and each element is a 2-dim numpy array
of real numbers, of size: `length * D`.
The length varies among different sequences, but the D is the same.
In speaker diarization, each sequence is the sequence of speaker
embeddings of one utterance.
2. train_sequences is a single concatenated sequence, which is a
2-dim numpy array of real numbers. See `fit_concatenated()`
for more details.
train_cluster_ids: Ground truth labels for train_sequences:
1. if train_sequences is a list, this must also be a list of the same
size, each element being a 1-dim list or numpy array of strings.
2. if train_sequences is a single concatenated sequence, this
must also be the concatenated 1-dim list or numpy array of strings
args: Training configurations. See `arguments.py` for details.
Raises: TypeError: If train_sequences or train_cluster_ids is of wrong type.
def predict_single(self, test_sequence, args):
    """Predict labels for a single test sequence using UISRNN model.

    Decoding is a beam search: observations are consumed `args.look_ahead`
    at a time, every surviving beam is scored against the next chunk, and
    the `args.beam_size` lowest-scoring candidates are kept.

    Args:
        test_sequence: the test observation sequence, which is 2-dim numpy
            array of real numbers, of size `N * D`.

            - `N`: length of one test utterance.
            - `D` : observation dimension.

            For example:
            ```
            test_sequence =
            [[2.2 -1.0 3.0 5.6]    --> 1st entry of utterance 'iccc'
             [0.5 1.8 -3.2 0.4]    --> 2nd entry of utterance 'iccc'
             [-2.2 5.0 1.8 3.7]    --> 3rd entry of utterance 'iccc'
             [-3.8 0.1 1.4 3.3]    --> 4th entry of utterance 'iccc'
             [0.1 2.7 3.5 -1.7]]   --> 5th entry of utterance 'iccc'
            ```
            Here `N=5`, `D=4`.
        args: Inference configurations. See `arguments.py` for details.
            The fields read here are `test_iteration`, `look_ahead` and
            `beam_size`.

    Returns:
        predicted_cluster_id: predicted speaker id sequence, which is
            an array of integers, of size `N`.
            For example, `predicted_cluster_id = [0, 1, 0, 0, 1]`

    Raises:
        TypeError: If test_sequence is of wrong type.
        ValueError: If test_sequence has wrong dimension.
    """
    # check type
    if (not isinstance(test_sequence, np.ndarray) or
            test_sequence.dtype != float):
        raise TypeError('test_sequence should be a numpy array of float type.')
    # check dimension
    if test_sequence.ndim != 2:
        raise ValueError('test_sequence must be 2-dim array.')
    # check size
    test_sequence_length, observation_dim = test_sequence.shape
    if observation_dim != self.observation_dim:
        raise ValueError('test_sequence does not match the dimension specified '
                         'by args.observation_dim.')

    self.rnn_model.eval()
    # Repeat the whole sequence `test_iteration` times along the time axis;
    # only the labels of the final repetition are returned (see the slice
    # at the end of this method).
    test_sequence = np.tile(test_sequence, (args.test_iteration, 1))
    test_sequence = autograd.Variable(
        torch.from_numpy(test_sequence).float()).to(self.device)
    # bookkeeping for beam search
    beam_set = [BeamState()]
    for num_iter in np.arange(0, args.test_iteration * test_sequence_length,
                              args.look_ahead):
        max_clusters = max([len(beam_state.mean_set) for beam_state in beam_set])
        # The last chunk may be shorter than `look_ahead`, so its actual
        # length is read back from the slice.
        look_ahead_seq = test_sequence[num_iter: num_iter + args.look_ahead, :]
        look_ahead_seq_length = look_ahead_seq.shape[0]
        # score_set has shape
        #   (beam_size, max_clusters + 1, max_clusters + 2, ...)
        # with one trailing axis per look-ahead step, and starts out as
        # all-inf so that unfilled slots never win.
        score_set = float('inf') * np.ones(
            np.append(
                args.beam_size, max_clusters + 1 + np.arange(
                    look_ahead_seq_length)))
        for beam_rank, beam_state in enumerate(beam_set):
            beam_score_set = self._calculate_score(beam_state, look_ahead_seq)
            # A beam with fewer clusters than `max_clusters` yields a
            # smaller score array; pad the end of every axis with inf so it
            # fits the shared layout.
            # NOTE(review): presumably beam_score_set has one axis per
            # look-ahead step -- confirm in _calculate_score.
            score_set[beam_rank, :] = np.pad(
                beam_score_set,
                np.tile([[0, max_clusters - len(beam_state.mean_set)]],
                        (look_ahead_seq_length, 1)), 'constant',
                constant_values=float('inf'))
        # find top scores
        # Scores are sorted ascending over the flattened array, so a lower
        # score is a better candidate. The inf entries are zeroed and then
        # trimmed, which leaves len(score_ranked) as the number of finite
        # candidates.
        # NOTE(review): np.trim_zeros also drops genuine scores that are
        # exactly 0 at either end of the sorted array.
        score_ranked = np.sort(score_set, axis=None)
        score_ranked[score_ranked == float('inf')] = 0
        score_ranked = np.trim_zeros(score_ranked)
        idx_ranked = np.argsort(score_set, axis=None)
        updated_beam_set = []
        # Keep at most `beam_size` candidates, or fewer if fewer are finite.
        for new_beam_rank in range(
                np.min((len(score_ranked), args.beam_size))):
            # Turn the flat rank back into (source beam, cluster choice for
            # each look-ahead step).
            total_idx = np.unravel_index(idx_ranked[new_beam_rank],
                                         score_set.shape)
            prev_beam_rank = total_idx[0].item()
            cluster_seq = total_idx[1:]
            updated_beam_state = self._update_beam_state(
                beam_set[prev_beam_rank], look_ahead_seq, cluster_seq)
            updated_beam_set.append(updated_beam_state)
        beam_set = updated_beam_set
    # The best beam is first; keep only the labels of the last repetition
    # of the tiled sequence.
    predicted_cluster_id = beam_set[0].trace[-test_sequence_length:]
    return predicted_cluster_id
Predict labels for a single test sequence using UISRNN model.
Args:
test_sequence: the test observation sequence, which is 2-dim numpy array
of real numbers, of size `N * D`.
- `N`: length of one test utterance.
- `D` : observation dimension.
For example:
test_sequence =
[[2.2 -1.0 3.0 5.6] --> 1st entry of utterance 'iccc'
[0.5 1.8 -3.2 0.4] --> 2nd entry of utterance 'iccc'
[-2.2 5.0 1.8 3.7] --> 3rd entry of utterance 'iccc'
[-3.8 0.1 1.4 3.3] --> 4th entry of utterance 'iccc'
[0.1 2.7 3.5 -1.7]] --> 5th entry of utterance 'iccc'
Here `N=5`, `D=4`.
args: Inference configurations. See `arguments.py` for details.
Returns:
predicted_cluster_id: predicted speaker id sequence, which is
an array of integers, of size `N`.
For example, `predicted_cluster_id = [0, 1, 0, 0, 1]`.
Raises: TypeError: If test_sequence is of wrong type. ValueError: If test_sequence has wrong dimension.
def predict(self, test_sequences, args):
    """Predict labels for a single or many test sequences using UISRNN model.

    Args:
        test_sequences: Either a list of test sequences, or a single test
            sequence. Each test sequence is a 2-dim numpy array
            of real numbers. See `predict_single()` for details.
        args: Inference configurations. See `arguments.py` for details.

    Returns:
        predicted_cluster_ids: Predicted labels for test_sequences.

        1. if test_sequences is a list, predicted_cluster_ids is a list of
           the same size, where each element is the result of
           `predict_single()` for the corresponding sequence.
        2. if test_sequences is a single sequence, predicted_cluster_ids is
           the result of `predict_single()` for that sequence.

    Raises:
        TypeError: If test_sequences is of wrong type.
    """
    # Reject anything that is neither a single array nor a list of arrays.
    if not isinstance(test_sequences, (np.ndarray, list)):
        raise TypeError('test_sequences should be either a list or numpy array.')

    if isinstance(test_sequences, list):
        # Many sequences: decode them one by one, keeping the input order.
        predicted_cluster_ids = []
        for sequence in test_sequences:
            predicted_cluster_ids.append(self.predict_single(sequence, args))
        return predicted_cluster_ids

    # A single sequence.
    return self.predict_single(test_sequences, args)
Predict labels for a single or many test sequences using UISRNN model.
Args:
test_sequences: Either a list of test sequences, or a single test
sequence. Each test sequence is a 2-dim numpy array
of real numbers. See `predict_single()` for details.
args: Inference configurations. See `arguments.py` for details.
Returns: predicted_cluster_ids: Predicted labels for test_sequences.
1. if test_sequences is a list, predicted_cluster_ids will be a list
of the same size, where each element is the 1-dim sequence of cluster ids
returned by `predict_single()` for the corresponding test sequence.
2. if test_sequences is a single sequence, predicted_cluster_ids will
be the 1-dim sequence of cluster ids returned by `predict_single()`.
Raises: TypeError: If test_sequences is of wrong type.
def parallel_predict(model, test_sequences, args, num_processes=4):
    """Run prediction in parallel using torch.multiprocessing.

    This is a beta feature. It makes prediction slower on CPU. But it's
    reported that it makes prediction faster on GPU.

    Args:
        model: instance of UISRNN model
        test_sequences: a list of test sequences. Each test sequence is a
            2-dim numpy array of real numbers. See `predict_single()` for
            details. Unlike `predict()`, a single bare sequence is not
            accepted here.
        args: Inference configurations. See `arguments.py` for details.
        num_processes: number of parallel processes.

    Returns:
        a list of the same size as test_sequences, where each element is
        the result of `model.predict_single()` for the corresponding
        sequence.

    Raises:
        TypeError: If test_sequences is not a list.
    """
    if not isinstance(test_sequences, list):
        raise TypeError('test_sequences must be a list.')
    if not test_sequences:
        # Nothing to predict: do not pay for spawning worker processes.
        return []
    ctx = multiprocessing.get_context('forkserver')
    # Put the RNN weights in shared memory before handing the model to the
    # worker processes.
    model.rnn_model.share_memory()
    pool = ctx.Pool(num_processes)
    try:
        results = pool.map(
            functools.partial(model.predict_single, args=args),
            test_sequences)
    finally:
        # Always release the workers, even if a prediction raised, and wait
        # for them to exit so that no processes are left behind.
        pool.close()
        pool.join()
    return results
Run prediction in parallel using torch.multiprocessing.
This is a beta feature. It makes prediction slower on CPU. But it's reported that it makes prediction faster on GPU.
Args:
model: instance of UISRNN model
test_sequences: a list of test sequences (a single bare sequence is
rejected with a TypeError). Each test sequence is a 2-dim numpy array
of real numbers. See `predict_single()` for details.
args: Inference configurations. See `arguments.py` for details.
num_processes: number of parallel processes.
Returns: a list of the same size as test_sequences, where each element is the 1-dim sequence of cluster ids returned by `predict_single()` for the corresponding test sequence.
Raises: TypeError: If test_sequences is of wrong type.