ipcoder
diff --git a/‎research/object_detection/builders/post_processing_builder.py
Lines changed: 19 additions & 7 deletions b/‎research/object_detection/builders/post_processing_builder.py
Lines changed: 19 additions & 7 deletions
diff --git a/‎research/object_detection/builders/post_processing_builder_test.py
Lines changed: 37 additions & 3 deletions b/‎research/object_detection/builders/post_processing_builder_test.py
Lines changed: 37 additions & 3 deletions
diff --git a/‎research/object_detection/core/post_processing.py
Lines changed: 106 additions & 23 deletions b/‎research/object_detection/core/post_processing.py
Lines changed: 106 additions & 23 deletions
@@ -28,8 +28,8 @@ def build(post_processing_config):
   configuration.
 
   Non-max suppression callable takes `boxes`, `scores`, and optionally
-  `clip_window`, `parallel_iterations` and `scope` as inputs. It returns
-  `nms_boxes`, `nms_scores`, `nms_nms_classes` and `num_detections`. See
+  `clip_window`, `parallel_iterations`, `masks`, and `scope` as inputs. Returns
+  `nms_boxes`, `nms_scores`, `nms_classes`, `nms_masks`, `num_detections`. See
   post_processing.batch_multiclass_non_max_suppression for the type and shape
   of these tensors.
 
@@ -55,7 +55,8 @@ def build(post_processing_config):
   non_max_suppressor_fn = _build_non_max_suppressor(
       post_processing_config.batch_non_max_suppression)
   score_converter_fn = _build_score_converter(
-      post_processing_config.score_converter)
+      post_processing_config.score_converter,
+      post_processing_config.logit_scale)
   return non_max_suppressor_fn, score_converter_fn
 
 
@@ -87,14 +88,25 @@ def _build_non_max_suppressor(nms_config):
   return non_max_suppressor_fn
 
 
-def _build_score_converter(score_converter_config):
+def _score_converter_fn_with_logit_scale(tf_score_converter_fn, logit_scale):
+  """Create a function to scale logits then apply a Tensorflow function."""
+  def score_converter_fn(logits):
+    scaled_logits = tf.divide(logits, logit_scale, name='scale_logits')
+    return tf_score_converter_fn(scaled_logits, name='convert_scores')
+  score_converter_fn.__name__ = '%s_with_logit_scale' % (
+      tf_score_converter_fn.__name__)
+  return score_converter_fn
+
+
+def _build_score_converter(score_converter_config, logit_scale):
   """Builds score converter based on the config.
 
   Builds one of [tf.identity, tf.sigmoid, tf.softmax] score converters based on
   the config.
 
   Args:
     score_converter_config: post_processing_pb2.PostProcessing.score_converter.
+    logit_scale: temperature; logits are divided by it before conversion.
 
   Returns:
     Callable score converter op.
@@ -103,9 +115,9 @@ def _build_score_converter(score_converter_config):
     ValueError: On unknown score converter.
   """
   if score_converter_config == post_processing_pb2.PostProcessing.IDENTITY:
-    return tf.identity
+    return _score_converter_fn_with_logit_scale(tf.identity, logit_scale)
   if score_converter_config == post_processing_pb2.PostProcessing.SIGMOID:
-    return tf.sigmoid
+    return _score_converter_fn_with_logit_scale(tf.sigmoid, logit_scale)
   if score_converter_config == post_processing_pb2.PostProcessing.SOFTMAX:
-    return tf.nn.softmax
+    return _score_converter_fn_with_logit_scale(tf.nn.softmax, logit_scale)
   raise ValueError('Unknown score converter.')
@@ -48,7 +48,31 @@ def test_build_identity_score_converter(self):
     post_processing_config = post_processing_pb2.PostProcessing()
     text_format.Merge(post_processing_text_proto, post_processing_config)
     _, score_converter = post_processing_builder.build(post_processing_config)
-    self.assertEqual(score_converter, tf.identity)
+    self.assertEqual(score_converter.__name__, 'identity_with_logit_scale')
+
+    inputs = tf.constant([1, 1], tf.float32)
+    outputs = score_converter(inputs)
+    with self.test_session() as sess:
+      converted_scores = sess.run(outputs)
+      expected_converted_scores = sess.run(inputs)
+      self.assertAllClose(converted_scores, expected_converted_scores)
+
+  def test_build_identity_score_converter_with_logit_scale(self):
+    post_processing_text_proto = """
+      score_converter: IDENTITY
+      logit_scale: 2.0
+    """
+    post_processing_config = post_processing_pb2.PostProcessing()
+    text_format.Merge(post_processing_text_proto, post_processing_config)
+    _, score_converter = post_processing_builder.build(post_processing_config)
+    self.assertEqual(score_converter.__name__, 'identity_with_logit_scale')
+
+    inputs = tf.constant([1, 1], tf.float32)
+    outputs = score_converter(inputs)
+    with self.test_session() as sess:
+      converted_scores = sess.run(outputs)
+      expected_converted_scores = sess.run(tf.constant([.5, .5], tf.float32))
+      self.assertAllClose(converted_scores, expected_converted_scores)
 
   def test_build_sigmoid_score_converter(self):
     post_processing_text_proto = """
@@ -57,7 +81,7 @@ def test_build_sigmoid_score_converter(self):
     post_processing_config = post_processing_pb2.PostProcessing()
     text_format.Merge(post_processing_text_proto, post_processing_config)
     _, score_converter = post_processing_builder.build(post_processing_config)
-    self.assertEqual(score_converter, tf.sigmoid)
+    self.assertEqual(score_converter.__name__, 'sigmoid_with_logit_scale')
 
   def test_build_softmax_score_converter(self):
     post_processing_text_proto = """
@@ -66,7 +90,17 @@ def test_build_softmax_score_converter(self):
     post_processing_config = post_processing_pb2.PostProcessing()
     text_format.Merge(post_processing_text_proto, post_processing_config)
     _, score_converter = post_processing_builder.build(post_processing_config)
-    self.assertEqual(score_converter, tf.nn.softmax)
+    self.assertEqual(score_converter.__name__, 'softmax_with_logit_scale')
+
+  def test_build_softmax_score_converter_with_temperature(self):
+    post_processing_text_proto = """
+      score_converter: SOFTMAX
+      logit_scale: 2.0
+    """
+    post_processing_config = post_processing_pb2.PostProcessing()
+    text_format.Merge(post_processing_text_proto, post_processing_config)
+    _, score_converter = post_processing_builder.build(post_processing_config)
+    self.assertEqual(score_converter.__name__, 'softmax_with_logit_scale')
 
 
 if __name__ == '__main__':
 
@@ -76,8 +76,6 @@ def multiclass_non_max_suppression(boxes,
     a BoxList holding M boxes with a rank-1 scores field representing
       corresponding scores for each box with scores sorted in decreasing order
       and a rank-1 classes field representing a class label for each box.
-      If masks, keypoints, keypoint_heatmaps is not None, the boxlist will
-      contain masks, keypoints, keypoint_heatmaps corresponding to boxes.
 
   Raises:
     ValueError: if iou_thresh is not in [0, 1] or if input boxlist does not have
@@ -174,6 +172,7 @@ def batch_multiclass_non_max_suppression(boxes,
                                          change_coordinate_frame=False,
                                          num_valid_boxes=None,
                                          masks=None,
+                                         additional_fields=None,
                                          scope=None,
                                          parallel_iterations=32):
   """Multi-class version of non maximum suppression that operates on a batch.
@@ -203,11 +202,13 @@ def batch_multiclass_non_max_suppression(boxes,
       is provided)
     num_valid_boxes: (optional) a Tensor of type `int32`. A 1-D tensor of shape
       [batch_size] representing the number of valid boxes to be considered
-        for each image in the batch.  This parameter allows for ignoring zero
-        paddings.
+      for each image in the batch.  This parameter allows for ignoring zero
+      paddings.
     masks: (optional) a [batch_size, num_anchors, q, mask_height, mask_width]
       float32 tensor containing box masks. `q` can be either number of classes
       or 1 depending on whether a separate mask is predicted per class.
+    additional_fields: (optional) If not None, a dictionary that maps keys to
+      tensors whose dimensions are [batch_size, num_anchors, ...].
     scope: tf scope name.
     parallel_iterations: (optional) number of batch items to process in
       parallel.
@@ -223,9 +224,13 @@ def batch_multiclass_non_max_suppression(boxes,
       [batch_size, max_detections, mask_height, mask_width] float32 tensor
       containing masks for each selected box. This is set to None if input
       `masks` is None.
+    'nmsed_additional_fields': (optional) a dictionary of
+      [batch_size, max_detections, ...] float32 tensors corresponding to the
+      tensors specified in the input `additional_fields`. This is set to None
+      if input `additional_fields` is None.
     'num_detections': A [batch_size] int32 tensor indicating the number of
       valid detections per batch item. Only the top num_detections[i] entries in
-      nms_boxes[i], nms_scores[i] and nms_class[i] are valid. the rest of the
+      nms_boxes[i], nms_scores[i] and nms_class[i] are valid. The rest of the
       entries are zero paddings.
 
   Raises:
@@ -239,6 +244,7 @@ def batch_multiclass_non_max_suppression(boxes,
                      'to the third dimension of scores')
 
   original_masks = masks
+  original_additional_fields = additional_fields
   with tf.name_scope(scope, 'BatchMultiClassNonMaxSuppression'):
     boxes_shape = boxes.shape
     batch_size = boxes_shape[0].value
@@ -255,58 +261,135 @@ def batch_multiclass_non_max_suppression(boxes,
       num_valid_boxes = tf.ones([batch_size], dtype=tf.int32) * num_anchors
 
     # If masks aren't provided, create dummy masks so we can only have one copy
-    # of single_image_nms_fn and discard the dummy masks after map_fn.
+    # of _single_image_nms_fn and discard the dummy masks after map_fn.
     if masks is None:
       masks_shape = tf.stack([batch_size, num_anchors, 1, 0, 0])
       masks = tf.zeros(masks_shape)
 
-    def single_image_nms_fn(args):
-      """Runs NMS on a single image and returns padded output."""
-      (per_image_boxes, per_image_scores, per_image_masks,
-       per_image_num_valid_boxes) = args
+    if additional_fields is None:
+      additional_fields = {}
+
+    def _single_image_nms_fn(args):
+      """Runs NMS on a single image and returns padded output.
+
+      Args:
+        args: A list of tensors consisting of the following:
+          per_image_boxes - A [num_anchors, q, 4] float32 tensor containing
+            detections. If `q` is 1 then same boxes are used for all classes
+            otherwise, if `q` is equal to number of classes, class-specific
+            boxes are used.
+          per_image_scores - A [num_anchors, num_classes] float32 tensor
+            containing the scores for each of the `num_anchors` detections.
+          per_image_masks - A [num_anchors, q, mask_height, mask_width] float32
+            tensor containing box masks. `q` can be either number of classes
+            or 1 depending on whether a separate mask is predicted per class.
+          per_image_additional_fields - (optional) A variable number of float32
+            tensors each with size [num_anchors, ...].
+          per_image_num_valid_boxes - A scalar tensor of type `int32`
+            representing the number of valid boxes to be considered for this
+            image (one entry of the batch-level `num_valid_boxes`).  This
+            parameter allows for ignoring zero paddings.
+
+      Returns:
+        'nmsed_boxes': A [max_detections, 4] float32 tensor containing the
+          non-max suppressed boxes.
+        'nmsed_scores': A [max_detections] float32 tensor containing the scores
+          for the boxes.
+        'nmsed_classes': A [max_detections] float32 tensor containing the class
+          for boxes.
+        'nmsed_masks': a [max_detections, mask_height, mask_width] float32
+          tensor containing masks for each selected box. These are the
+          (empty) dummy masks when the caller supplied no `masks`.
+        'nmsed_additional_fields':  (optional) A variable number of float32
+          tensors each with size [max_detections, ...] corresponding to the
+          input `per_image_additional_fields`.
+        'num_detections': A scalar int32 tensor indicating the number of
+          valid detections for this image. Only the top num_detections
+          entries in nmsed_boxes, nmsed_scores and nmsed_classes are valid.
+          The rest of the entries are zero paddings.
+      """
+      per_image_boxes = args[0]
+      per_image_scores = args[1]
+      per_image_masks = args[2]
+      per_image_additional_fields = {
+          key: value
+          for key, value in zip(additional_fields, args[3:-1])
+      }
+      per_image_num_valid_boxes = args[-1]
       per_image_boxes = tf.reshape(
           tf.slice(per_image_boxes, 3 * [0],
                    tf.stack([per_image_num_valid_boxes, -1, -1])), [-1, q, 4])
       per_image_scores = tf.reshape(
           tf.slice(per_image_scores, [0, 0],
                    tf.stack([per_image_num_valid_boxes, -1])),
           [-1, num_classes])
-
       per_image_masks = tf.reshape(
           tf.slice(per_image_masks, 4 * [0],
                    tf.stack([per_image_num_valid_boxes, -1, -1, -1])),
           [-1, q, per_image_masks.shape[2].value,
            per_image_masks.shape[3].value])
+      if per_image_additional_fields is not None:
+        for key, tensor in per_image_additional_fields.items():
+          additional_field_shape = tensor.get_shape()
+          additional_field_dim = len(additional_field_shape)
+          per_image_additional_fields[key] = tf.reshape(
+              tf.slice(per_image_additional_fields[key],
+                       additional_field_dim * [0],
+                       tf.stack([per_image_num_valid_boxes] +
+                                (additional_field_dim - 1) * [-1])),
+              [-1] + [dim.value for dim in additional_field_shape[1:]])
       nmsed_boxlist = multiclass_non_max_suppression(
           per_image_boxes,
           per_image_scores,
           score_thresh,
           iou_thresh,
           max_size_per_class,
           max_total_size,
-          masks=per_image_masks,
           clip_window=clip_window,
-          change_coordinate_frame=change_coordinate_frame)
+          change_coordinate_frame=change_coordinate_frame,
+          masks=per_image_masks,
+          additional_fields=per_image_additional_fields)
       padded_boxlist = box_list_ops.pad_or_clip_box_list(nmsed_boxlist,
                                                          max_total_size)
       num_detections = nmsed_boxlist.num_boxes()
       nmsed_boxes = padded_boxlist.get()
       nmsed_scores = padded_boxlist.get_field(fields.BoxListFields.scores)
       nmsed_classes = padded_boxlist.get_field(fields.BoxListFields.classes)
       nmsed_masks = padded_boxlist.get_field(fields.BoxListFields.masks)
-      return [nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks,
-              num_detections]
+      nmsed_additional_fields = [
+          padded_boxlist.get_field(key) for key in per_image_additional_fields
+      ]
+      return ([nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks] +
+              nmsed_additional_fields + [num_detections])
+
+    num_additional_fields = 0
+    if additional_fields is not None:
+      num_additional_fields = len(additional_fields)
+    num_nmsed_outputs = 4 + num_additional_fields
 
-    (batch_nmsed_boxes, batch_nmsed_scores,
-     batch_nmsed_classes, batch_nmsed_masks,
-     batch_num_detections) = tf.map_fn(
-         single_image_nms_fn,
-         elems=[boxes, scores, masks, num_valid_boxes],
-         dtype=[tf.float32, tf.float32, tf.float32, tf.float32, tf.int32],
-         parallel_iterations=parallel_iterations)
+    batch_outputs = tf.map_fn(
+        _single_image_nms_fn,
+        elems=([boxes, scores, masks] + list(additional_fields.values()) +
+               [num_valid_boxes]),
+        dtype=(num_nmsed_outputs * [tf.float32] + [tf.int32]),
+        parallel_iterations=parallel_iterations)
+
+    batch_nmsed_boxes = batch_outputs[0]
+    batch_nmsed_scores = batch_outputs[1]
+    batch_nmsed_classes = batch_outputs[2]
+    batch_nmsed_masks = batch_outputs[3]
+    batch_nmsed_additional_fields = {
+        key: value
+        for key, value in zip(additional_fields, batch_outputs[4:-1])
+    }
+    batch_num_detections = batch_outputs[-1]
 
     if original_masks is None:
       batch_nmsed_masks = None
 
+    if original_additional_fields is None:
+      batch_nmsed_additional_fields = None
+
     return (batch_nmsed_boxes, batch_nmsed_scores, batch_nmsed_classes,
-            batch_nmsed_masks, batch_num_detections)
+            batch_nmsed_masks, batch_nmsed_additional_fields,
+            batch_num_detections)