uisrnn.evals

Utils for model evaluation.

# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utils for model evaluation."""
15
16from scipy import optimize
17import numpy as np
18
19
def get_list_inverse_index(unique_ids):
  """Get value to position index from a list of unique ids.

  Args:
    unique_ids: A list of unique integers or strings.

  Returns:
    result: a dict from value to position

  Raises:
    TypeError: If unique_ids is not a list.
  """
  if not isinstance(unique_ids, list):
    raise TypeError('unique_ids must be a list')
  # If a value appears more than once, its last position is the one kept.
  return {unique_id: i for i, unique_id in enumerate(unique_ids)}
38
39
def compute_sequence_match_accuracy(sequence1, sequence2):
  """Compute the accuracy between two sequences by finding optimal matching.

  The two sequences need not share a label vocabulary: co-occurrence counts
  are collected for every (label1, label2) pair, and the one-to-one assignment
  between the two label sets that maximizes the number of agreeing positions
  is used to score the match.

  Args:
    sequence1: A list of integers or strings.
    sequence2: A list of integers or strings.

  Returns:
    accuracy: sequence matching accuracy as a number in [0.0, 1.0]

  Raises:
    TypeError: If sequence1 or sequence2 is not a list.
    ValueError: If sequence1 and sequence2 are empty or differ in length.
  """
  if not isinstance(sequence1, list) or not isinstance(sequence2, list):
    raise TypeError('sequence1 and sequence2 must be lists')
  if not sequence1 or len(sequence1) != len(sequence2):
    raise ValueError(
        'sequence1 and sequence2 must have the same non-zero length')
  # Index labels in order of first appearance. The assignment below does not
  # depend on row/column order, so no sorting is needed; this also lets labels
  # that cannot be ordered against each other (e.g. int and str) coexist.
  inverse_index1 = {
      label: i for i, label in enumerate(dict.fromkeys(sequence1))}
  inverse_index2 = {
      label: i for i, label in enumerate(dict.fromkeys(sequence2))}
  # count_matrix[i, j] is the number of positions where label i of sequence1
  # co-occurs with label j of sequence2.
  count_matrix = np.zeros((len(inverse_index1), len(inverse_index2)))
  for item1, item2 in zip(sequence1, sequence2):
    count_matrix[inverse_index1[item1], inverse_index2[item2]] += 1.0
  # linear_sum_assignment minimizes total cost, so negate to maximize matches.
  row_index, col_index = optimize.linear_sum_assignment(-count_matrix)
  optimal_match_count = count_matrix[row_index, col_index].sum()
  return optimal_match_count / len(sequence1)
def get_list_inverse_index(unique_ids):
def get_list_inverse_index(unique_ids):
  """Get value to position index from a list of unique ids.

  Args:
    unique_ids: A list of unique integers or strings.

  Returns:
    result: a dict from value to position

  Raises:
    TypeError: If unique_ids is not a list.
  """
  if not isinstance(unique_ids, list):
    raise TypeError('unique_ids must be a list')
  # If a value appears more than once, its last position is the one kept.
  return {unique_id: i for i, unique_id in enumerate(unique_ids)}

Get value to position index from a list of unique ids.

Args: unique_ids: A list of unique integers or strings.

Returns: result: a dict from value to position

Raises: TypeError: If unique_ids is not a list.

def compute_sequence_match_accuracy(sequence1, sequence2):
def compute_sequence_match_accuracy(sequence1, sequence2):
  """Compute the accuracy between two sequences by finding optimal matching.

  The two sequences need not share a label vocabulary: co-occurrence counts
  are collected for every (label1, label2) pair, and the one-to-one assignment
  between the two label sets that maximizes the number of agreeing positions
  is used to score the match.

  Args:
    sequence1: A list of integers or strings.
    sequence2: A list of integers or strings.

  Returns:
    accuracy: sequence matching accuracy as a number in [0.0, 1.0]

  Raises:
    TypeError: If sequence1 or sequence2 is not a list.
    ValueError: If sequence1 and sequence2 are empty or differ in length.
  """
  if not isinstance(sequence1, list) or not isinstance(sequence2, list):
    raise TypeError('sequence1 and sequence2 must be lists')
  if not sequence1 or len(sequence1) != len(sequence2):
    raise ValueError(
        'sequence1 and sequence2 must have the same non-zero length')
  # Index labels in order of first appearance. The assignment below does not
  # depend on row/column order, so no sorting is needed; this also lets labels
  # that cannot be ordered against each other (e.g. int and str) coexist.
  inverse_index1 = {
      label: i for i, label in enumerate(dict.fromkeys(sequence1))}
  inverse_index2 = {
      label: i for i, label in enumerate(dict.fromkeys(sequence2))}
  # count_matrix[i, j] is the number of positions where label i of sequence1
  # co-occurs with label j of sequence2.
  count_matrix = np.zeros((len(inverse_index1), len(inverse_index2)))
  for item1, item2 in zip(sequence1, sequence2):
    count_matrix[inverse_index1[item1], inverse_index2[item2]] += 1.0
  # linear_sum_assignment minimizes total cost, so negate to maximize matches.
  row_index, col_index = optimize.linear_sum_assignment(-count_matrix)
  optimal_match_count = count_matrix[row_index, col_index].sum()
  return optimal_match_count / len(sequence1)

Compute the accuracy between two sequences by finding optimal matching.

Args: sequence1: A list of integers or strings. sequence2: A list of integers or strings.

Returns: accuracy: sequence matching accuracy as a number in [0.0, 1.0]

Raises: TypeError: If sequence1 or sequence2 is not a list. ValueError: If sequence1 and sequence2 are empty or are not the same size.