Add dataset info and hyper parameter logging for benchmark. (tensorflow#4152)

qlzh727 · web-flow · commit eb0c0dfd5261 · 2018-05-03T14:19:50.000-07:00
* Add dataset info and hyper parameter logging for benchmark.

* Address review comments.

* Address the view comment for data schema name.

* Fix test cases.

* Lint fix.
diff --git a/official/benchmark/datastore/schema/benchmark_run.json b/official/benchmark/datastore/schema/benchmark_run.json
@@ -98,41 +98,41 @@
     "type": "RECORD"
   },
   {
-    "description": "The list of hyperparameters of the model.",
+    "description": "The list of parameters run with the model. It could contain hyperparameters or others.",
     "fields": [
       {
-        "description": "The name of the hyperparameter.",
+        "description": "The name of the parameter.",
         "mode": "REQUIRED",
         "name": "name",
         "type": "STRING"
       },
       {
-        "description": "The string value of the hyperparameter.",
+        "description": "The string value of the parameter.",
         "mode": "NULLABLE",
         "name": "string_value",
         "type": "STRING"
       },
       {
-        "description": "The bool value of the hyperparameter.",
+        "description": "The bool value of the parameter.",
         "mode": "NULLABLE",
         "name": "bool_value",
         "type": "STRING"
       },
       {
-        "description": "The int/long value of the hyperparameter.",
+        "description": "The int/long value of the parameter.",
         "mode": "NULLABLE",
         "name": "long_value",
         "type": "INTEGER"
       },
       {
-        "description": "The double/float value of hyperparameter.",
+        "description": "The double/float value of parameter.",
         "mode": "NULLABLE",
         "name": "float_value",
         "type": "FLOAT"
       }
     ],
     "mode": "REPEATED",
-    "name": "hyperparameter",
+    "name": "run_parameters",
     "type": "RECORD"
   },
   {
diff --git a/official/resnet/cifar10_main.py b/official/resnet/cifar10_main.py
@@ -42,6 +42,8 @@
     'validation': 10000,
 }
 
+DATASET_NAME = 'CIFAR-10'
+
 
 ###############################################################################
 # Data processing
@@ -237,7 +239,7 @@ def run_cifar(flags_obj):
                     or input_fn)
 
   resnet_run_loop.resnet_main(
-      flags_obj, cifar10_model_fn, input_function,
+      flags_obj, cifar10_model_fn, input_function, DATASET_NAME,
       shape=[_HEIGHT, _WIDTH, _NUM_CHANNELS])
 
 
diff --git a/official/resnet/imagenet_main.py b/official/resnet/imagenet_main.py
@@ -41,6 +41,7 @@
 _NUM_TRAIN_FILES = 1024
 _SHUFFLE_BUFFER = 1500
 
+DATASET_NAME = 'ImageNet'
 
 ###############################################################################
 # Data processing
@@ -312,7 +313,7 @@ def run_imagenet(flags_obj):
                     or input_fn)
 
   resnet_run_loop.resnet_main(
-      flags_obj, imagenet_model_fn, input_function,
+      flags_obj, imagenet_model_fn, input_function, DATASET_NAME,
       shape=[_DEFAULT_IMAGE_SIZE, _DEFAULT_IMAGE_SIZE, _NUM_CHANNELS])
 
 
diff --git a/official/resnet/resnet_run_loop.py b/official/resnet/resnet_run_loop.py
@@ -331,7 +331,8 @@ def per_device_batch_size(batch_size, num_gpus):
   return int(batch_size / num_gpus)
 
 
-def resnet_main(flags_obj, model_function, input_function, shape=None):
+def resnet_main(
+    flags_obj, model_function, input_function, dataset_name, shape=None):
   """Shared main loop for ResNet Models.
 
   Args:
@@ -342,6 +343,8 @@ def resnet_main(flags_obj, model_function, input_function, shape=None):
     input_function: the function that processes the dataset and returns a
       dataset that the estimator can train on. This will be wrapped with
       all the relevant flags for running and passed to estimator.
+    dataset_name: the name of the dataset for training and evaluation. This is
+      used for logging purpose.
     shape: list of ints representing the shape of the images used for training.
       This is only used if flags_obj.export_dir is passed.
   """
@@ -381,8 +384,16 @@ def resnet_main(flags_obj, model_function, input_function, shape=None):
           'dtype': flags_core.get_tf_dtype(flags_obj)
       })
 
+  run_params = {
+      'batch_size': flags_obj.batch_size,
+      'dtype': flags_core.get_tf_dtype(flags_obj),
+      'resnet_size': flags_obj.resnet_size,
+      'resnet_version': flags_obj.version,
+      'synthetic_data': flags_obj.use_synthetic_data,
+      'train_epochs': flags_obj.train_epochs,
+  }
   benchmark_logger = logger.config_benchmark_logger(flags_obj.benchmark_log_dir)
-  benchmark_logger.log_run_info('resnet')
+  benchmark_logger.log_run_info('resnet', dataset_name, run_params)
 
   train_hooks = hooks_helper.get_train_hooks(
       flags_obj.hooks,
diff --git a/official/utils/logs/logger.py b/official/utils/logs/logger.py
@@ -109,8 +109,9 @@ def log_metric(self, name, value, unit=None, global_step=None, extras=None):
                     "Name %s, value %d, unit %s, global_step %d, extras %s",
                     name, value, unit, global_step, extras)
 
-  def log_run_info(self, model_name):
-    tf.logging.info("Benchmark run: %s", _gather_run_info(model_name))
+  def log_run_info(self, model_name, dataset_name, run_params):
+    tf.logging.info("Benchmark run: %s",
+                    _gather_run_info(model_name, dataset_name, run_params))
 
 
 class BenchmarkFileLogger(BaseBenchmarkLogger):
@@ -159,15 +160,18 @@ def log_metric(self, name, value, unit=None, global_step=None, extras=None):
         tf.logging.warning("Failed to dump metric to log file: "
                            "name %s, value %s, error %s", name, value, e)
 
-  def log_run_info(self, model_name):
+  def log_run_info(self, model_name, dataset_name, run_params):
     """Collect most of the TF runtime information for the local env.
 
     The schema of the run info follows official/benchmark/datastore/schema.
 
     Args:
       model_name: string, the name of the model.
+      dataset_name: string, the name of dataset for training and evaluation.
+      run_params: dict, the dictionary of parameters for the run, it could
+        include hyperparameters or other params that are important for the run.
     """
-    run_info = _gather_run_info(model_name)
+    run_info = _gather_run_info(model_name, dataset_name, run_params)
 
     with tf.gfile.GFile(os.path.join(
         self._logging_dir, BENCHMARK_RUN_LOG_FILE_NAME), "w") as f:
@@ -179,15 +183,17 @@ def log_run_info(self, model_name):
                            e)
 
 
-def _gather_run_info(model_name):
+def _gather_run_info(model_name, dataset_name, run_params):
   """Collect the benchmark run information for the local environment."""
   run_info = {
       "model_name": model_name,
+      "dataset": {"name": dataset_name},
       "machine_config": {},
       "run_date": datetime.datetime.utcnow().strftime(
           _DATE_TIME_FORMAT_PATTERN)}
   _collect_tensorflow_info(run_info)
   _collect_tensorflow_environment_variables(run_info)
+  _collect_run_params(run_info, run_params)
   _collect_cpu_info(run_info)
   _collect_gpu_info(run_info)
   _collect_memory_info(run_info)
@@ -199,6 +205,21 @@ def _collect_tensorflow_info(run_info):
       "version": tf.VERSION, "git_hash": tf.GIT_VERSION}
 
 
+def _collect_run_params(run_info, run_params):
+  """Log the parameter information for the benchmark run."""
+  def process_param(name, value):
+    type_check = {
+        str: {"name": name, "string_value": value},
+        int: {"name": name, "long_value": value},
+        bool: {"name": name, "bool_value": str(value)},
+        float: {"name": name, "float_value": value},
+    }
+    return type_check.get(type(value),
+                          {"name": name, "string_value": str(value)})
+  if run_params:
+    run_info["run_parameters"] = [
+        process_param(k, v) for k, v in sorted(run_params.items())]
+
 def _collect_tensorflow_environment_variables(run_info):
   run_info["tensorflow_environment_variables"] = [
       {"name": k, "value": v}
diff --git a/official/utils/logs/logger_test.py b/official/utils/logs/logger_test.py
@@ -180,6 +180,32 @@ def test_collect_tensorflow_info(self):
     self.assertEqual(run_info["tensorflow_version"]["version"], tf.VERSION)
     self.assertEqual(run_info["tensorflow_version"]["git_hash"], tf.GIT_VERSION)
 
+  def test_collect_run_params(self):
+    run_info = {}
+    run_parameters = {
+        "batch_size": 32,
+        "synthetic_data": True,
+        "train_epochs": 100.00,
+        "dtype": "fp16",
+        "resnet_size": 50,
+        "random_tensor": tf.constant(2.0)
+    }
+    logger._collect_run_params(run_info, run_parameters)
+    self.assertEqual(len(run_info["run_parameters"]), 6)
+    self.assertEqual(run_info["run_parameters"][0],
+                     {"name": "batch_size", "long_value": 32})
+    self.assertEqual(run_info["run_parameters"][1],
+                     {"name": "dtype", "string_value": "fp16"})
+    self.assertEqual(run_info["run_parameters"][2],
+                     {"name": "random_tensor", "string_value":
+                          "Tensor(\"Const:0\", shape=(), dtype=float32)"})
+    self.assertEqual(run_info["run_parameters"][3],
+                     {"name": "resnet_size", "long_value": 50})
+    self.assertEqual(run_info["run_parameters"][4],
+                     {"name": "synthetic_data", "bool_value": "True"})
+    self.assertEqual(run_info["run_parameters"][5],
+                     {"name": "train_epochs", "float_value": 100.00})
+
   def test_collect_tensorflow_environment_variables(self):
     os.environ["TF_ENABLE_WINOGRAD_NONFUSED"] = "1"
     os.environ["TF_OTHER"] = "2"