uisrnn.contrib.range_search_crp_alpha

This module implements a method to search for the best crp_alpha within a given range for a given data set. For example:

  train_cluster_id = np.array(
    ['0_0', '0_0', '0_1', '0_1', '0_1', '0_0', '0_0', '1_0', '1_0', '1_0',
    '1_1', '1_1', '1_1', '1_0', '1_0','1_0', '1_2', '1_2', '1_2'])
  print(estimate_crp_alpha(train_cluster_id))
  0.5

Function for user: estimate_crp_alpha: see docstring for details. Internal functions: _get_cdf: see docstring for details. _get_cdf_single: see docstring for details. _get_k_t: see docstring for details. _get_n_kt: see docstring for details. _get_cluster_id_single: see docstring for details. _get_normalized_id: see docstring for details.

  1# Contributor information:
  2#   Name: Xiang Lyu
  3#   GitHub account: aluminumbox
  4#   Email: aluminumbox@alumni.sjtu.edu.cn
  5#   Organization: Ping An Technology (Shanghai) Co., Ltd.
  6"""This module implements method to search for best crp_alpha within a range for
  7 a given data set.
  8  For example
  9  ```
 10    train_cluster_id = np.array(
 11      ['0_0', '0_0', '0_1', '0_1', '0_1', '0_0', '0_0', '1_0', '1_0', '1_0',
 12      '1_1', '1_1', '1_1', '1_0', '1_0','1_0', '1_2', '1_2', '1_2'])
 13    print(estimate_crp_alpha(train_cluster_id))
 14    0.5
 15  ```
 16  Function for user:
 17    estimate_crp_alpha: see docstring for details.
 18  Internal functions:
 19    _get_cdf: see docstring for details.
 20    _get_cdf_single: see docstring for details.
 21    _get_k_t: see docstring for details.
 22    _get_n_kt: see docstring for details.
 23    _get_cluster_id_single: see docstring for details.
 24    _get_normalized_id: see docstring for details.
 25"""
 26import numpy as np
 27
 28
 29def estimate_crp_alpha(train_cluster_id, search_range=1, search_step=0.01):
 30  """Iterate through a range of alpha, return alpha with maximum cdf P{Y|Z}.
 31
 32  Args:
 33    train_cluster_id: same as train_cluster_id in demo.py. See `demo.py` for
 34      details.
 35    search_range: the range to search for crp_alpha.
 36    search_step: the step to search for crp_alpha.
 37  Returns:
 38    cur_alpha: a float variable.
 39  """
 40  cur_alpha, cur_cdf = np.nan, -np.inf
 41  for alpha in range(1, int(np.ceil(search_range / search_step))):
 42    cdf = _get_cdf(train_cluster_id, alpha * search_step)
 43    if cdf > cur_cdf:
 44      cur_alpha, cur_cdf = alpha * search_step, cdf
 45  return cur_alpha
 46
 47
 48def _get_cdf(train_cluster_id, alpha):
 49  """For a given alpha, calculate the cdf of the entire observation sequence.
 50
 51  Args:
 52    train_cluster_id: same as train_cluster_id in demo.py. See `demo.py` for
 53      details.
 54    alpha: a float variable.
 55  Returns:
 56    cdf: cdf of the entire observation sequence.
 57  """
 58  cdf = 0
 59  for cluster_id_single in _get_cluster_id_single(train_cluster_id):
 60    cdf_single = np.log(_get_cdf_single(cluster_id_single, alpha))
 61    cdf += cdf_single
 62  return cdf
 63
 64
 65def _get_cdf_single(cluster_id_single, alpha):
 66  """For a given alpha, calculate the cdf of a single observation sequence.
 67
 68  Args:
 69    cluster_id_single: train_cluster_id of a single observation sequence.
 70    alpha: a float variable.
 71  Returns:
 72    cdf_single: cdf of a single observation sequence.
 73  """
 74  k_t = _get_k_t(cluster_id_single)
 75  n_kt = _get_n_kt(cluster_id_single)
 76  numerator = alpha ** (len(set(cluster_id_single)) - 1)
 77  denominator = 1
 78  for i in range(1, len(cluster_id_single)):
 79    if cluster_id_single[i] != cluster_id_single[i - 1]:
 80      denominator_i = sum([n_kt[i - 1, j] for j in range(k_t[i - 1])
 81                           if j != cluster_id_single[i - 1]]) + alpha
 82      denominator *= denominator_i
 83  cdf_single = numerator / denominator
 84  return cdf_single
 85
 86
 87def _get_k_t(cluster_id_single):
 88  """For a single observation sequence, calculate K_t. See Eq.8 in paper.
 89
 90  Args:
 91    cluster_id_single: train_cluster_id of a single observation sequence.
 92  Returns:
 93    k_t: a numpy array.
 94  """
 95  k_t = np.array([len(set(cluster_id_single[:i + 1])) for i in
 96                  range(len(cluster_id_single))])
 97  return k_t
 98
 99
100def _get_n_kt(cluster_id_single):
101  """For a given observation sequence, calculate N_{k,t}. See Eq.8 in paper.
102
103  Args:
104    cluster_id_single: train_cluster_id of a single observation sequence.
105  Returns:
106    n_kt: a numpy array.
107  """
108  num_spk = len(set(cluster_id_single))
109  n_kt = np.zeros((len(cluster_id_single), num_spk))
110  cur_n_kt = np.zeros((num_spk))
111  for i, j in enumerate(cluster_id_single):
112    if i == 0:
113      cur_spk = j
114      cur_n_kt[j] += 1
115      continue
116    if j != cur_spk:
117      cur_spk = j
118      cur_n_kt[j] += 1
119    n_kt[i] = cur_n_kt
120  return n_kt
121
122
def _get_cluster_id_single(train_cluster_id):
  """Given the entire observation sequence, yields normalized id for a single
  observation sequence each time

  Consecutive labels sharing the same prefix (the text before the first
  '_') form one observation sequence. Every sequence is yielded in full,
  including the final element of the input and a trailing sequence that
  consists of a single label.

  Args:
    train_cluster_id: same as train_cluster_id in demo.py. See `demo.py` for
      details.
  Yields:
    cluster_id_single: normalized id for a single observation sequence.
      Nothing is yielded for empty input.
  For example:
  ```
    train_cluster_id = [0_0, 0_0, 0_2, 0_2, 0_1, 0_1, 1_0, 1_1, 1_1, 1_2]
    yields [0, 0, 1, 1, 2, 2], [0, 1, 1, 2]
  ```
  """
  # len() rather than truthiness: the input may be a numpy array.
  if len(train_cluster_id) == 0:
    return
  start = 0
  cur_prefix = train_cluster_id[0].split('_')[0]
  for i, label in enumerate(train_cluster_id):
    prefix = label.split('_')[0]
    if prefix != cur_prefix:
      # A new prefix closes the sequence occupying [start, i).
      yield _get_normalized_id(train_cluster_id[start:i])
      start, cur_prefix = i, prefix
  # Flush the last sequence after the loop so its final element is kept;
  # slicing up to the last index inside the loop used to drop it.
  yield _get_normalized_id(train_cluster_id[start:])
145
146
147def _get_normalized_id(cluster_id_single):
148  """For a single observation sequence, returns its normalized form.
149
150  Args:
151    cluster_id_single: train_cluster_id for a single observation sequence.
152  Returns:
153    normalized_id: normalized id for a single observation sequence.
154  For example:
155  ```
156    train_cluster_id = [0_0, 0_0, 0_2, 0_2, 0_1, 0_1]
157    normalized_id = [0, 0, 1, 1, 2, 2]
158  ```
159  """
160  normalized_id = [int(i.split('_')[1]) for i in cluster_id_single]
161  index_order = [np.nan] * len(set(cluster_id_single))
162  count = 0
163  for i in normalized_id:
164    if i not in index_order:
165      index_order[count] = i
166      count += 1
167    if count == len(index_order):
168      break
169  normalized_id = np.array([index_order.index(i) for i in normalized_id])
170  return normalized_id
def estimate_crp_alpha(train_cluster_id, search_range=1, search_step=0.01):
def estimate_crp_alpha(train_cluster_id, search_range=1, search_step=0.01):
  """Grid-search crp_alpha and return the candidate with the highest score.

  Candidates are `k * search_step` for every integer k from 1 up to, but not
  including, `ceil(search_range / search_step)`. Each candidate is scored
  with `_get_cdf`; on ties the earliest (smallest) candidate is kept because
  only a strictly larger score replaces the current best.

  Args:
    train_cluster_id: same as train_cluster_id in demo.py. See `demo.py` for
      details.
    search_range: upper limit of the interval searched for crp_alpha.
    search_step: spacing between consecutive candidate values.
  Returns:
    The best-scoring candidate as a float, or `np.nan` when no candidate
    was evaluated or none scored above `-np.inf`.
  """
  best_alpha = np.nan
  best_score = -np.inf
  num_steps = int(np.ceil(search_range / search_step))
  for step_index in range(1, num_steps):
    candidate = step_index * search_step
    score = _get_cdf(train_cluster_id, candidate)
    # Strict comparison: the first candidate reaching a score wins ties.
    if score > best_score:
      best_alpha, best_score = candidate, score
  return best_alpha

Iterate through a range of alpha values and return the alpha with the maximum cdf P{Y|Z}.

Args: train_cluster_id: same as train_cluster_id in demo.py. See demo.py for details. search_range: the range to search for crp_alpha. search_step: the step to search for crp_alpha. Returns: cur_alpha: a float variable.