uisrnn

The module for Unbounded Interleaved-State Recurrent Neural Network.

An introduction is available at [README.md](https://github.com/google/uis-rnn/blob/master/README.md).

# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The module for Unbounded Interleaved-State Recurrent Neural Network.

An introduction is available at [README.md].

[README.md]: https://github.com/google/uis-rnn/blob/master/README.md
"""

from . import arguments
from . import evals
from . import loss_func
from . import uisrnn
from . import utils

#pylint: disable=C0103
parse_arguments = arguments.parse_arguments
compute_sequence_match_accuracy = evals.compute_sequence_match_accuracy
output_result = utils.output_result
UISRNN = uisrnn.UISRNN
parallel_predict = uisrnn.parallel_predict
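
These aliases form the package's public API. As a quick orientation, below is a minimal usage sketch under stated assumptions: the toy data (random observations with made-up labels) and the reduced iteration count are purely illustrative, and `parse_arguments()` reads `sys.argv`, so library defaults apply when no flags are passed.

```
import numpy as np
import uisrnn

model_args, training_args, inference_args = uisrnn.parse_arguments()
model_args.observation_dim = 4      # match the toy data below
training_args.train_iteration = 50  # tiny, just to keep this sketch fast

model = uisrnn.UISRNN(model_args)

# Toy training data: one utterance of 6 frames, 4-dim observations.
train_sequence = np.random.rand(6, 4)
train_cluster_id = np.array(
    ['iaaa_0', 'iaaa_0', 'iaaa_1', 'iaaa_1', 'iaaa_0', 'iaaa_0'])
model.fit(train_sequence, train_cluster_id, training_args)

# Predict speaker labels for a new 5-frame sequence.
predicted_cluster_ids = model.predict(np.random.rand(5, 4), inference_args)
```
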
def parse_arguments():
def parse_arguments():
  """Parse arguments.

  Returns:
    A tuple of:

      - `model_args`: model arguments
      - `training_args`: training arguments
      - `inference_args`: inference arguments
  """
  # model configurations
  model_parser = argparse.ArgumentParser(
      description='Model configurations.', add_help=False)

  model_parser.add_argument(
      '--observation_dim',
      default=_DEFAULT_OBSERVATION_DIM,
      type=int,
      help='The dimension of the embeddings (e.g. d-vectors).')

  model_parser.add_argument(
      '--rnn_hidden_size',
      default=512,
      type=int,
      help='The number of nodes for each RNN layer.')
  model_parser.add_argument(
      '--rnn_depth',
      default=1,
      type=int,
      help='The number of RNN layers.')
  model_parser.add_argument(
      '--rnn_dropout',
      default=0.2,
      type=float,
      help='The dropout rate for all RNN layers.')
  model_parser.add_argument(
      '--transition_bias',
      default=None,
      type=float,
      help='The value of p0, corresponding to Eq. (6) in the '
           'paper. If the value is given, we will fix to this value. If the '
           'value is None, we will estimate it from training data '
           'using Eq. (13) in the paper.')
  model_parser.add_argument(
      '--crp_alpha',
      default=1.0,
      type=float,
      help='The value of alpha for the Chinese restaurant process (CRP), '
           'corresponding to Eq. (7) in the paper. In this open source '
           'implementation, currently we only support using a given value '
           'of crp_alpha.')
  model_parser.add_argument(
      '--sigma2',
      default=None,
      type=float,
      help='The value of sigma squared, corresponding to Eq. (11) in the '
           'paper. If the value is given, we will fix to this value. If the '
           'value is None, we will estimate it from training data.')
  model_parser.add_argument(
      '--verbosity',
      default=2,
      type=int,
      help='How verbose the logging information will be. Higher value '
      'represents more verbose information. A general guideline: '
      '0 for errors; 1 for finishing important steps; '
      '2 for finishing less important steps; 3 or above for debugging '
      'information.')
  model_parser.add_argument(
      '--enable_cuda',
      default=True,
      type=str2bool,
      help='Whether we should use CUDA if it is available. If False, we will '
      'always use CPU.')

  # training configurations
  training_parser = argparse.ArgumentParser(
      description='Training configurations.', add_help=False)

  training_parser.add_argument(
      '--optimizer',
      '-o',
      default='adam',
      choices=['adam'],
      help='The optimizer for training.')
  training_parser.add_argument(
      '--learning_rate',
      '-l',
      default=1e-3,
      type=float,
      help='The learning rate for training.')
  training_parser.add_argument(
      '--train_iteration',
      '-t',
      default=20000,
      type=int,
      help='The total number of training iterations.')
  training_parser.add_argument(
      '--batch_size',
      '-b',
      default=10,
      type=int,
      help='The batch size for training.')
  training_parser.add_argument(
      '--num_permutations',
      default=10,
      type=int,
      help='The number of permutations per utterance sampled in the training '
           'data.')
  training_parser.add_argument(
      '--sigma_alpha',
      default=1.0,
      type=float,
      help='The inverse gamma shape for estimating sigma2. This value is only '
           'meaningful when sigma2 is not given, and estimated from data.')
  training_parser.add_argument(
      '--sigma_beta',
      default=1.0,
      type=float,
      help='The inverse gamma scale for estimating sigma2. This value is only '
           'meaningful when sigma2 is not given, and estimated from data.')
  training_parser.add_argument(
      '--regularization_weight',
      '-r',
      default=1e-5,
      type=float,
      help='The multiplicative weight of the network regularization term.')
  training_parser.add_argument(
      '--grad_max_norm',
      default=5.0,
      type=float,
      help='Max norm of the gradient.')
  training_parser.add_argument(
      '--enforce_cluster_id_uniqueness',
      default=True,
      type=str2bool,
      help='Whether to enforce cluster ID uniqueness across different '
           'training sequences. Only effective when the first input to fit() '
           'is a list of sequences. In general, assume the cluster IDs for two '
           'sequences are [a, b] and [a, c]. If the `a` from the two sequences '
           'is not the same label, then this arg should be True.')

  # inference configurations
  inference_parser = argparse.ArgumentParser(
      description='Inference configurations.', add_help=False)

  inference_parser.add_argument(
      '--beam_size',
      '-s',
      default=10,
      type=int,
      help='The beam search size for inference.')
  inference_parser.add_argument(
      '--look_ahead',
      default=1,
      type=int,
      help='The number of look ahead steps during inference.')
  inference_parser.add_argument(
      '--test_iteration',
      default=2,
      type=int,
      help='During inference, we concatenate test_iteration duplicates of '
           'the test sequence, and run inference on this concatenated '
           'sequence. We then return the inference results on the last '
           'duplicate as the final prediction for the test sequence.')

  # a super parser for sanity checks
  super_parser = argparse.ArgumentParser(
      parents=[model_parser, training_parser, inference_parser])

  # get arguments
  super_parser.parse_args()
  model_args, _ = model_parser.parse_known_args()
  training_args, _ = training_parser.parse_known_args()
  inference_args, _ = inference_parser.parse_known_args()

  return (model_args, training_args, inference_args)

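Since all three sub-parsers use `parse_known_args()` against the same command line, a single invocation can mix model, training, and inference flags, while the super parser rejects unknown flags. A hedged sketch (the script name is hypothetical):

```
# Hypothetical script demo.py, run as:
#   python demo.py --rnn_hidden_size 256 --learning_rate 1e-4 --beam_size 20
import uisrnn

model_args, training_args, inference_args = uisrnn.parse_arguments()

# Each flag lands in the namespace owned by its sub-parser:
print(model_args.rnn_hidden_size)   # 256
print(training_args.learning_rate)  # 0.0001
print(inference_args.beam_size)     # 20
```
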
def compute_sequence_match_accuracy(sequence1, sequence2):
def compute_sequence_match_accuracy(sequence1, sequence2):
  """Compute the accuracy between two sequences by finding optimal matching.

  Args:
    sequence1: A list of integers or strings.
    sequence2: A list of integers or strings.

  Returns:
    accuracy: sequence matching accuracy as a number in [0.0, 1.0]

  Raises:
    TypeError: If sequence1 or sequence2 is not a list.
    ValueError: If sequence1 and sequence2 are not of the same size.
  """
  if not isinstance(sequence1, list) or not isinstance(sequence2, list):
    raise TypeError('sequence1 and sequence2 must be lists')
  if not sequence1 or len(sequence1) != len(sequence2):
    raise ValueError(
        'sequence1 and sequence2 must have the same non-zero length')
  # get unique ids from sequences
  unique_ids1 = sorted(set(sequence1))
  unique_ids2 = sorted(set(sequence2))
  inverse_index1 = get_list_inverse_index(unique_ids1)
  inverse_index2 = get_list_inverse_index(unique_ids2)
  # get the count matrix
  count_matrix = np.zeros((len(unique_ids1), len(unique_ids2)))
  for item1, item2 in zip(sequence1, sequence2):
    index1 = inverse_index1[item1]
    index2 = inverse_index2[item2]
    count_matrix[index1, index2] += 1.0
  row_index, col_index = optimize.linear_sum_assignment(-count_matrix)
  optimal_match_count = count_matrix[row_index, col_index].sum()
  accuracy = optimal_match_count / len(sequence1)
  return accuracy

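Because the Hungarian algorithm (`linear_sum_assignment`) picks the label matching that maximizes agreement, the accuracy is invariant to how clusters are named. Two small worked examples:

```
from uisrnn import compute_sequence_match_accuracy

# Labels differing only by a permutation match perfectly: the matching
# {0 -> 1, 1 -> 0} covers all 4 positions, so accuracy is 4/4 = 1.0.
print(compute_sequence_match_accuracy([0, 0, 1, 1], [1, 1, 0, 0]))  # 1.0

# An extra cluster costs accuracy: the best matching {0 -> 0, 1 -> 1}
# covers only 3 of 4 positions, so accuracy is 3/4 = 0.75.
print(compute_sequence_match_accuracy([0, 0, 1, 2], [0, 0, 1, 1]))  # 0.75
```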

def output_result(model_args, training_args, test_record):
def output_result(model_args, training_args, test_record):
  """Produce a string to summarize the experiment."""
  accuracy_array, _ = zip(*test_record)
  total_accuracy = np.mean(accuracy_array)
  output_string = """
Config:
  sigma_alpha: {}
  sigma_beta: {}
  crp_alpha: {}
  learning rate: {}
  regularization: {}
  batch size: {}

Performance:
  averaged accuracy: {:.6f}
  accuracy numbers for all testing sequences:
  """.strip().format(
      training_args.sigma_alpha,
      training_args.sigma_beta,
      model_args.crp_alpha,
      training_args.learning_rate,
      training_args.regularization_weight,
      training_args.batch_size,
      total_accuracy)
  for accuracy in accuracy_array:
    output_string += '\n    {:.6f}'.format(accuracy)
  output_string += '\n' + '=' * 80 + '\n'
  filename = 'layer_{}_{}_{:.1f}_result.txt'.format(
      model_args.rnn_hidden_size,
      model_args.rnn_depth, model_args.rnn_dropout)
  with open(filename, 'a') as file_object:
    file_object.write(output_string)
  return output_string

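From the unpacking `zip(*test_record)`, each record is a tuple whose first element is an accuracy; a minimal sketch, assuming `(accuracy, sequence_length)` pairs and using plain namespaces in place of parsed arguments:

```
import argparse
import uisrnn

model_args = argparse.Namespace(
    crp_alpha=1.0, rnn_hidden_size=512, rnn_depth=1, rnn_dropout=0.2)
training_args = argparse.Namespace(
    sigma_alpha=1.0, sigma_beta=1.0, learning_rate=1e-3,
    regularization_weight=1e-5, batch_size=10)

test_record = [(0.95, 500), (0.88, 300)]  # assumed (accuracy, length) pairs
summary = uisrnn.output_result(model_args, training_args, test_record)
print(summary)  # the same text is appended to 'layer_512_1_0.2_result.txt'
```
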
class UISRNN:
class UISRNN:
  """Unbounded Interleaved-State Recurrent Neural Networks."""

  def __init__(self, args):
    """Construct the UISRNN object.

    Args:
      args: Model configurations. See `arguments.py` for details.
    """
    self.observation_dim = args.observation_dim
    self.device = torch.device(
        'cuda:0' if (torch.cuda.is_available() and args.enable_cuda) else 'cpu')
    self.rnn_model = CoreRNN(self.observation_dim, args.rnn_hidden_size,
                             args.rnn_depth, self.observation_dim,
                             args.rnn_dropout).to(self.device)
    self.rnn_init_hidden = nn.Parameter(
        torch.zeros(args.rnn_depth, 1, args.rnn_hidden_size).to(self.device))
    # booleans indicating which variables are trainable
    self.estimate_sigma2 = (args.sigma2 is None)
    self.estimate_transition_bias = (args.transition_bias is None)
    # initial values of variables
    sigma2 = _INITIAL_SIGMA2_VALUE if self.estimate_sigma2 else args.sigma2
    self.sigma2 = nn.Parameter(
        sigma2 * torch.ones(self.observation_dim).to(self.device))
    self.transition_bias = args.transition_bias
    self.transition_bias_denominator = 0.0
    self.crp_alpha = args.crp_alpha
    self.logger = utils.Logger(args.verbosity)

  def _get_optimizer(self, optimizer, learning_rate):
    """Get optimizer for UISRNN.

    Args:
      optimizer: string - name of the optimizer.
      learning_rate: learning rate for the entire model.
        We do not customize learning rate for separate parts.

    Returns:
      a pytorch "optim" object
    """
    params = [
        {
            'params': self.rnn_model.parameters()
        },  # rnn parameters
        {
            'params': self.rnn_init_hidden
        }  # rnn initial hidden state
    ]
    if self.estimate_sigma2:  # train sigma2
      params.append({
          'params': self.sigma2
      })  # variance parameters
    assert optimizer == 'adam', 'Only adam optimizer is supported.'
    return optim.Adam(params, lr=learning_rate)

  def save(self, filepath):
    """Save the model to a file.

    Args:
      filepath: the path of the file.
    """
    torch.save({
        'rnn_state_dict': self.rnn_model.state_dict(),
        'rnn_init_hidden': self.rnn_init_hidden.detach().cpu().numpy(),
        'transition_bias': self.transition_bias,
        'transition_bias_denominator': self.transition_bias_denominator,
        'crp_alpha': self.crp_alpha,
        'sigma2': self.sigma2.detach().cpu().numpy()}, filepath)

  def load(self, filepath):
    """Load the model from a file.

    Args:
      filepath: the path of the file.
    """
    var_dict = torch.load(filepath)
    self.rnn_model.load_state_dict(var_dict['rnn_state_dict'])
    self.rnn_init_hidden = nn.Parameter(
        torch.from_numpy(var_dict['rnn_init_hidden']).to(self.device))
    self.transition_bias = float(var_dict['transition_bias'])
    self.transition_bias_denominator = float(
        var_dict['transition_bias_denominator'])
    self.crp_alpha = float(var_dict['crp_alpha'])
    self.sigma2 = nn.Parameter(
        torch.from_numpy(var_dict['sigma2']).to(self.device))

    self.logger.print(
        3, 'Loaded model with transition_bias={}, crp_alpha={}, sigma2={}, '
        'rnn_init_hidden={}'.format(
            self.transition_bias, self.crp_alpha, var_dict['sigma2'],
            var_dict['rnn_init_hidden']))

  def fit_concatenated(self, train_sequence, train_cluster_id, args):
    """Fit UISRNN model to concatenated sequence and cluster_id.

    Args:
      train_sequence: the training observation sequence, which is a
        2-dim numpy array of real numbers, of size `N * D`.

        - `N`: summation of lengths of all utterances.
        - `D`: observation dimension.

        For example,
      ```
      train_sequence =
      [[1.2 3.0 -4.1 6.0]    --> an entry of speaker #0 from utterance 'iaaa'
       [0.8 -1.1 0.4 0.5]    --> an entry of speaker #1 from utterance 'iaaa'
       [-0.2 1.0 3.8 5.7]    --> an entry of speaker #0 from utterance 'iaaa'
       [3.8 -0.1 1.5 2.3]    --> an entry of speaker #0 from utterance 'ibbb'
       [1.2 1.4 3.6 -2.7]]   --> an entry of speaker #0 from utterance 'ibbb'
      ```
        Here `N=5`, `D=4`.

        We concatenate all training utterances into this single sequence.
      train_cluster_id: the speaker id sequence, which is a 1-dim list or
        numpy array of strings, of size `N`.
        For example,
      ```
      train_cluster_id =
        ['iaaa_0', 'iaaa_1', 'iaaa_0', 'ibbb_0', 'ibbb_0']
      ```
        'iaaa_0' means the entry belongs to speaker #0 in utterance 'iaaa'.

        Note that the order of entries within an utterance is preserved,
        and all utterances are simply concatenated together.
      args: Training configurations. See `arguments.py` for details.

    Raises:
      TypeError: If train_sequence or train_cluster_id is of wrong type.
      ValueError: If train_sequence or train_cluster_id has wrong dimension.
    """
    # check type
    if (not isinstance(train_sequence, np.ndarray) or
        train_sequence.dtype != float):
      raise TypeError('train_sequence should be a numpy array of float type.')
    if isinstance(train_cluster_id, list):
      train_cluster_id = np.array(train_cluster_id)
    if (not isinstance(train_cluster_id, np.ndarray) or
        not train_cluster_id.dtype.name.startswith(('str', 'unicode'))):
      raise TypeError('train_cluster_id must be a numpy array of strings.')
    # check dimension
    if train_sequence.ndim != 2:
      raise ValueError('train_sequence must be 2-dim array.')
    if train_cluster_id.ndim != 1:
      raise ValueError('train_cluster_id must be 1-dim array.')
    # check length and size
    train_total_length, observation_dim = train_sequence.shape
    if observation_dim != self.observation_dim:
      raise ValueError('train_sequence does not match the dimension specified '
                       'by args.observation_dim.')
    if train_total_length != len(train_cluster_id):
      raise ValueError('train_sequence length is not equal to '
                       'train_cluster_id length.')

    self.rnn_model.train()
    optimizer = self._get_optimizer(optimizer=args.optimizer,
                                    learning_rate=args.learning_rate)

    sub_sequences, seq_lengths = utils.resize_sequence(
        sequence=train_sequence,
        cluster_id=train_cluster_id,
        num_permutations=args.num_permutations)

    # For batch learning, pack the entire dataset.
    if args.batch_size is None:
      packed_train_sequence, rnn_truth = utils.pack_sequence(
          sub_sequences,
          seq_lengths,
          args.batch_size,
          self.observation_dim,
          self.device)
    train_loss = []
    for num_iter in range(args.train_iteration):
      optimizer.zero_grad()
      # For online learning, pack a subset in each iteration.
      if args.batch_size is not None:
        packed_train_sequence, rnn_truth = utils.pack_sequence(
            sub_sequences,
            seq_lengths,
            args.batch_size,
            self.observation_dim,
            self.device)
      hidden = self.rnn_init_hidden.repeat(1, args.batch_size, 1)
      mean, _ = self.rnn_model(packed_train_sequence, hidden)
      # use mean to predict
      mean = torch.cumsum(mean, dim=0)
      mean_size = mean.size()
      mean = torch.mm(
          torch.diag(
              1.0 / torch.arange(1, mean_size[0] + 1).float().to(self.device)),
          mean.view(mean_size[0], -1))
      mean = mean.view(mean_size)

      # Likelihood part.
      loss1 = loss_func.weighted_mse_loss(
          input_tensor=(rnn_truth != 0).float() * mean[:-1, :, :],
          target_tensor=rnn_truth,
          weight=1 / (2 * self.sigma2))

      # Sigma2 prior part.
      weight = (((rnn_truth != 0).float() * mean[:-1, :, :] - rnn_truth)
                ** 2).view(-1, observation_dim)
      num_non_zero = torch.sum((weight != 0).float(), dim=0).squeeze()
      loss2 = loss_func.sigma2_prior_loss(
          num_non_zero, args.sigma_alpha, args.sigma_beta, self.sigma2)

      # Regularization part.
      loss3 = loss_func.regularization_loss(
          self.rnn_model.parameters(), args.regularization_weight)

      loss = loss1 + loss2 + loss3
      loss.backward()
      nn.utils.clip_grad_norm_(self.rnn_model.parameters(), args.grad_max_norm)
      optimizer.step()
      # avoid numerical issues
      self.sigma2.data.clamp_(min=1e-6)

      if (np.remainder(num_iter, 10) == 0 or
          num_iter == args.train_iteration - 1):
        self.logger.print(
            2,
            'Iter: {:d}  \t'
            'Training Loss: {:.4f}    \n'
            '    Negative Log Likelihood: {:.4f}\t'
            'Sigma2 Prior: {:.4f}\t'
            'Regularization: {:.4f}'.format(
                num_iter,
                float(loss.data),
                float(loss1.data),
                float(loss2.data),
                float(loss3.data)))
      train_loss.append(float(loss1.data))  # only save the likelihood part
    self.logger.print(
        1, 'Done training with {} iterations'.format(args.train_iteration))

  def fit(self, train_sequences, train_cluster_ids, args):
    """Fit UISRNN model.

    Args:
      train_sequences: Either a list of training sequences, or a single
        concatenated training sequence:

        1. train_sequences is a list, and each element is a 2-dim numpy array
           of real numbers, of size: `length * D`.
           The length varies among different sequences, but `D` is the same.
           In speaker diarization, each sequence is the sequence of speaker
           embeddings of one utterance.
        2. train_sequences is a single concatenated sequence, which is a
           2-dim numpy array of real numbers. See `fit_concatenated()`
           for more details.
      train_cluster_ids: Ground truth labels for train_sequences:

        1. if train_sequences is a list, this must also be a list of the same
           size, each element being a 1-dim list or numpy array of strings.
        2. if train_sequences is a single concatenated sequence, this
           must also be the concatenated 1-dim list or numpy array of strings.
      args: Training configurations. See `arguments.py` for details.

    Raises:
      TypeError: If train_sequences or train_cluster_ids is of wrong type.
    """
    if isinstance(train_sequences, np.ndarray):
      # train_sequences is already the concatenated sequence
      if self.estimate_transition_bias:
        # see issue #55: https://github.com/google/uis-rnn/issues/55
        self.logger.print(
            2,
            'Warning: transition_bias cannot be correctly estimated from a '
            'concatenated sequence; train_sequences will be treated as a '
            'single sequence. This can lead to inaccurate estimation of '
            'transition_bias. Please consider estimating transition_bias '
            'before concatenating the sequences and passing it as an '
            'argument.')
      train_sequences = [train_sequences]
      train_cluster_ids = [train_cluster_ids]
    elif isinstance(train_sequences, list):
      # train_sequences is a list of un-concatenated sequences;
      # we will concatenate it later, after estimating transition_bias
      pass
    else:
      raise TypeError('train_sequences must be a list or numpy.ndarray')

    # estimate transition_bias
    if self.estimate_transition_bias:
      (transition_bias,
       transition_bias_denominator) = utils.estimate_transition_bias(
           train_cluster_ids)
      # set or update transition_bias
      if self.transition_bias is None:
        self.transition_bias = transition_bias
        self.transition_bias_denominator = transition_bias_denominator
      else:
        self.transition_bias = (
            self.transition_bias * self.transition_bias_denominator +
            transition_bias * transition_bias_denominator) / (
                self.transition_bias_denominator + transition_bias_denominator)
        self.transition_bias_denominator += transition_bias_denominator

    # concatenate train_sequences
    (concatenated_train_sequence,
     concatenated_train_cluster_id) = utils.concatenate_training_data(
         train_sequences,
         train_cluster_ids,
         args.enforce_cluster_id_uniqueness,
         True)

    self.fit_concatenated(
        concatenated_train_sequence, concatenated_train_cluster_id, args)

  def _update_beam_state(self, beam_state, look_ahead_seq, cluster_seq):
    """Update a beam state given a look ahead sequence and known cluster
    assignments.

    Args:
      beam_state: A BeamState object.
      look_ahead_seq: Look ahead sequence, size: look_ahead*D.
        look_ahead: number of steps to look ahead in the beam search.
        D: observation dimension
      cluster_seq: Cluster assignment sequence for look_ahead_seq.

    Returns:
      new_beam_state: An updated BeamState object.
    """

    loss = 0
    new_beam_state = BeamState(beam_state)
    for sub_idx, cluster in enumerate(cluster_seq):
      if cluster > len(new_beam_state.mean_set):  # invalid trace
        new_beam_state.neg_likelihood = float('inf')
        break
      elif cluster < len(new_beam_state.mean_set):  # existing cluster
        last_cluster = new_beam_state.trace[-1]
        loss = loss_func.weighted_mse_loss(
            input_tensor=torch.squeeze(new_beam_state.mean_set[cluster]),
            target_tensor=look_ahead_seq[sub_idx, :],
            weight=1 / (2 * self.sigma2)).cpu().detach().numpy()
        if cluster == last_cluster:
          loss -= np.log(1 - self.transition_bias)
        else:
          loss -= np.log(self.transition_bias) + np.log(
              new_beam_state.block_counts[cluster]) - np.log(
                  sum(new_beam_state.block_counts) + self.crp_alpha)
        # update new mean and new hidden
        mean, hidden = self.rnn_model(
            look_ahead_seq[sub_idx, :].unsqueeze(0).unsqueeze(0),
            new_beam_state.hidden_set[cluster])
        new_beam_state.mean_set[cluster] = (new_beam_state.mean_set[cluster]*(
            (np.array(new_beam_state.trace) == cluster).sum() -
            1).astype(float) + mean.clone()) / (
                np.array(new_beam_state.trace) == cluster).sum().astype(
                    float)  # use mean to predict
        new_beam_state.hidden_set[cluster] = hidden.clone()
        if cluster != last_cluster:
          new_beam_state.block_counts[cluster] += 1
        new_beam_state.trace.append(cluster)
      else:  # new cluster
        init_input = autograd.Variable(
            torch.zeros(self.observation_dim)
        ).unsqueeze(0).unsqueeze(0).to(self.device)
        mean, hidden = self.rnn_model(init_input,
                                      self.rnn_init_hidden)
        loss = loss_func.weighted_mse_loss(
            input_tensor=torch.squeeze(mean),
            target_tensor=look_ahead_seq[sub_idx, :],
            weight=1 / (2 * self.sigma2)).cpu().detach().numpy()
        loss -= np.log(self.transition_bias) + np.log(
            self.crp_alpha) - np.log(
                sum(new_beam_state.block_counts) + self.crp_alpha)
        # update new mean and new hidden
        mean, hidden = self.rnn_model(
            look_ahead_seq[sub_idx, :].unsqueeze(0).unsqueeze(0),
            hidden)
        new_beam_state.append(mean, hidden, cluster)
      new_beam_state.neg_likelihood += loss
    return new_beam_state

  def _calculate_score(self, beam_state, look_ahead_seq):
    """Calculate negative log likelihoods for all possible state allocations
       of a look ahead sequence, according to the current beam state.

    Args:
      beam_state: A BeamState object.
      look_ahead_seq: Look ahead sequence, size: look_ahead*D.
        look_ahead: number of steps to look ahead in the beam search.
        D: observation dimension

    Returns:
      beam_score_set: a set of scores for each possible state allocation.
    """

    look_ahead, _ = look_ahead_seq.shape
    beam_num_clusters = len(beam_state.mean_set)
    beam_score_set = float('inf') * np.ones(
        beam_num_clusters + 1 + np.arange(look_ahead))
    for cluster_seq, _ in np.ndenumerate(beam_score_set):
      updated_beam_state = self._update_beam_state(beam_state,
                                                   look_ahead_seq, cluster_seq)
      beam_score_set[cluster_seq] = updated_beam_state.neg_likelihood
    return beam_score_set

  def predict_single(self, test_sequence, args):
    """Predict labels for a single test sequence using UISRNN model.

    Args:
      test_sequence: the test observation sequence, which is a 2-dim numpy
        array of real numbers, of size `N * D`.

        - `N`: length of one test utterance.
        - `D` : observation dimension.

        For example:
      ```
      test_sequence =
      [[2.2 -1.0 3.0 5.6]    --> 1st entry of utterance 'iccc'
       [0.5 1.8 -3.2 0.4]    --> 2nd entry of utterance 'iccc'
       [-2.2 5.0 1.8 3.7]    --> 3rd entry of utterance 'iccc'
       [-3.8 0.1 1.4 3.3]    --> 4th entry of utterance 'iccc'
       [0.1 2.7 3.5 -1.7]]   --> 5th entry of utterance 'iccc'
      ```
        Here `N=5`, `D=4`.
      args: Inference configurations. See `arguments.py` for details.

    Returns:
      predicted_cluster_id: predicted speaker id sequence, which is
        an array of integers, of size `N`.
        For example, `predicted_cluster_id = [0, 1, 0, 0, 1]`

    Raises:
      TypeError: If test_sequence is of wrong type.
      ValueError: If test_sequence has wrong dimension.
    """
    # check type
    if (not isinstance(test_sequence, np.ndarray) or
        test_sequence.dtype != float):
      raise TypeError('test_sequence should be a numpy array of float type.')
    # check dimension
    if test_sequence.ndim != 2:
      raise ValueError('test_sequence must be 2-dim array.')
    # check size
    test_sequence_length, observation_dim = test_sequence.shape
    if observation_dim != self.observation_dim:
      raise ValueError('test_sequence does not match the dimension specified '
                       'by args.observation_dim.')

    self.rnn_model.eval()
    test_sequence = np.tile(test_sequence, (args.test_iteration, 1))
    test_sequence = autograd.Variable(
        torch.from_numpy(test_sequence).float()).to(self.device)
    # bookkeeping for beam search
    beam_set = [BeamState()]
    for num_iter in np.arange(0, args.test_iteration * test_sequence_length,
                              args.look_ahead):
      max_clusters = max([len(beam_state.mean_set) for beam_state in beam_set])
      look_ahead_seq = test_sequence[num_iter:  num_iter + args.look_ahead, :]
      look_ahead_seq_length = look_ahead_seq.shape[0]
      score_set = float('inf') * np.ones(
          np.append(
              args.beam_size, max_clusters + 1 + np.arange(
                  look_ahead_seq_length)))
      for beam_rank, beam_state in enumerate(beam_set):
        beam_score_set = self._calculate_score(beam_state, look_ahead_seq)
        score_set[beam_rank, :] = np.pad(
            beam_score_set,
            np.tile([[0, max_clusters - len(beam_state.mean_set)]],
                    (look_ahead_seq_length, 1)), 'constant',
            constant_values=float('inf'))
      # find top scores
      score_ranked = np.sort(score_set, axis=None)
      score_ranked[score_ranked == float('inf')] = 0
      score_ranked = np.trim_zeros(score_ranked)
      idx_ranked = np.argsort(score_set, axis=None)
      updated_beam_set = []
      for new_beam_rank in range(
          np.min((len(score_ranked), args.beam_size))):
        total_idx = np.unravel_index(idx_ranked[new_beam_rank],
                                     score_set.shape)
        prev_beam_rank = total_idx[0].item()
        cluster_seq = total_idx[1:]
        updated_beam_state = self._update_beam_state(
            beam_set[prev_beam_rank], look_ahead_seq, cluster_seq)
        updated_beam_set.append(updated_beam_state)
      beam_set = updated_beam_set
    predicted_cluster_id = beam_set[0].trace[-test_sequence_length:]
    return predicted_cluster_id

  def predict(self, test_sequences, args):
    """Predict labels for a single or many test sequences using UISRNN model.

    Args:
      test_sequences: Either a list of test sequences, or a single test
        sequence. Each test sequence is a 2-dim numpy array
        of real numbers. See `predict_single()` for details.
      args: Inference configurations. See `arguments.py` for details.

    Returns:
      predicted_cluster_ids: Predicted labels for test_sequences.

        1. if test_sequences is a list, predicted_cluster_ids will be a list
           of the same size, where each element is a 1-dim list of strings.
        2. if test_sequences is a single sequence, predicted_cluster_ids will
           be a 1-dim list of strings.

    Raises:
      TypeError: If test_sequences is of wrong type.
    """
    # check type
    if isinstance(test_sequences, np.ndarray):
      return self.predict_single(test_sequences, args)
    if isinstance(test_sequences, list):
      return [self.predict_single(test_sequence, args)
              for test_sequence in test_sequences]
    raise TypeError('test_sequences should be either a list or numpy array.')
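
`fit()` may be called repeatedly; when `transition_bias` is estimated, each call folds its new estimate into a running weighted average via `transition_bias_denominator`, and `save()`/`load()` round-trip all learned state (RNN weights, `transition_bias` with its denominator, `crp_alpha`, `sigma2`). A minimal checkpointing sketch, assuming toy shapes, labels, and file name, with the iteration count reduced just to keep it fast:

```
import numpy as np
import uisrnn

model_args, training_args, inference_args = uisrnn.parse_arguments()
model_args.observation_dim = 4
training_args.train_iteration = 50  # tiny, illustrative only
model = uisrnn.UISRNN(model_args)

# A list of per-utterance sequences with matching 1-dim label lists.
sequences = [np.random.rand(8, 4), np.random.rand(6, 4)]
cluster_ids = [['a', 'a', 'b', 'b', 'a', 'a', 'b', 'b'],
               ['x', 'x', 'x', 'y', 'y', 'y']]
model.fit(sequences, cluster_ids, training_args)

model.save('uisrnn_checkpoint.pt')  # hypothetical file name

restored = uisrnn.UISRNN(model_args)
restored.load('uisrnn_checkpoint.pt')
labels = restored.predict(np.random.rand(5, 4), inference_args)
```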

def parallel_predict(model, test_sequences, args, num_processes=4):
def parallel_predict(model, test_sequences, args, num_processes=4):
  """Run prediction in parallel using torch.multiprocessing.

  This is a beta feature. It makes prediction slower on CPU, but it's
  reported to make prediction faster on GPU.

  Args:
    model: instance of UISRNN model
    test_sequences: a list of test sequences. Each test sequence is a
      2-dim numpy array of real numbers. See `predict_single()` for details.
    args: Inference configurations. See `arguments.py` for details.
    num_processes: number of parallel processes.

  Returns:
    a list of the same size as test_sequences, where each element
    is a 1-dim list of strings.

  Raises:
    TypeError: If test_sequences is of wrong type.
  """
  if not isinstance(test_sequences, list):
    raise TypeError('test_sequences must be a list.')
  ctx = multiprocessing.get_context('forkserver')
  model.rnn_model.share_memory()
  pool = ctx.Pool(num_processes)
  results = pool.map(
      functools.partial(model.predict_single, args=args),
      test_sequences)
  pool.close()
  return results

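The `forkserver` start method spawns fresh interpreter processes, so any script calling `parallel_predict` should guard its entry point with `if __name__ == '__main__':`. A hedged usage sketch with toy sequences and a hypothetical checkpoint:

```
import numpy as np
import uisrnn

def main():
  model_args, _, inference_args = uisrnn.parse_arguments()
  model_args.observation_dim = 4
  model = uisrnn.UISRNN(model_args)
  model.load('uisrnn_checkpoint.pt')  # hypothetical checkpoint name

  test_sequences = [np.random.rand(50, 4) for _ in range(8)]
  predicted = uisrnn.parallel_predict(
      model, test_sequences, inference_args, num_processes=4)
  print(len(predicted))  # 8: one predicted label list per test sequence

if __name__ == '__main__':
  main()
```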