
Commit 21ec0e1

Benchmark update (tensorflow#4034)
* Update the benchmark logger to have default logging:
  1. Create a global instance of the benchmark logger, which logs to tf.logging.info by default.
  2. Allow the user to configure the logging location.
  3. Fix nits in code and comments.
* Fix lint and test errors.
* Address review comments.
* Remove the duplicated print statement.
1 parent 823da31 commit 21ec0e1
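
The headline change is that benchmark logging now always has a usable default. A minimal sketch of the new behavior (not part of the diff itself; assumes the official.utils.logs package is importable):

    from official.utils.logs import logger

    # With no prior configuration, a BaseBenchmarkLogger is installed
    # lazily and metrics go to tf.logging.info instead of raising.
    logger.get_benchmark_logger().log_metric("accuracy", 0.999, global_step=1000)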

File tree

6 files changed (+160, −66 lines)


official/resnet/resnet_run_loop.py

Lines changed: 3 additions & 8 deletions
@@ -398,11 +398,8 @@ def resnet_main(flags, model_function, input_function, shape=None):
       'dtype': flags.dtype
   })

-  if flags.benchmark_log_dir is not None:
-    benchmark_logger = logger.BenchmarkLogger(flags.benchmark_log_dir)
-    benchmark_logger.log_run_info('resnet')
-  else:
-    benchmark_logger = None
+  benchmark_logger = logger.config_benchmark_logger(flags.benchmark_log_dir)
+  benchmark_logger.log_run_info('resnet')

   for _ in range(flags.train_epochs // flags.epochs_between_evals):
     train_hooks = hooks_helper.get_train_hooks(

@@ -434,10 +431,8 @@ def input_fn_eval():
     # global_step count.
     eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                        steps=flags.max_train_steps)
-    print(eval_results)

-    if benchmark_logger:
-      benchmark_logger.log_estimator_evaluation_result(eval_results)
+    benchmark_logger.log_evaluation_result(eval_results)

     if model_helpers.past_stop_threshold(
         flags.stop_threshold, eval_results['accuracy']):
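
The run loop no longer branches on whether a log directory was supplied. A hedged sketch of the resulting call pattern (the None argument and the eval_results dict are illustrative; eval_results normally comes from classifier.evaluate):

    from official.utils.logs import logger

    # config_benchmark_logger always returns a usable logger: a
    # BenchmarkFileLogger when a directory is given, otherwise the
    # STDOUT-backed BaseBenchmarkLogger, so no None checks remain.
    benchmark_logger = logger.config_benchmark_logger(None)
    benchmark_logger.log_run_info('resnet')

    eval_results = {'global_step': 1000, 'accuracy': 0.76}  # illustrative
    benchmark_logger.log_evaluation_result(eval_results)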

official/utils/logs/hooks_helper.py

Lines changed: 3 additions & 3 deletions
@@ -27,6 +27,7 @@
 import tensorflow as tf  # pylint: disable=g-bad-import-order

 from official.utils.logs import hooks
+from official.utils.logs import logger
 from official.utils.logs import metric_hook

 _TENSORS_TO_LOG = dict((x, x) for x in ['learning_rate',

@@ -140,13 +141,12 @@ def get_logging_metric_hook(benchmark_log_dir=None,
   Returns a ProfilerHook that writes out timelines that can be loaded into
   profiling tools like chrome://tracing.
   """
-  if benchmark_log_dir is None:
-    raise ValueError("metric_log_dir should be provided to use metric logger")
+  logger.config_benchmark_logger(benchmark_log_dir)
   if tensors_to_log is None:
     tensors_to_log = _TENSORS_TO_LOG
   return metric_hook.LoggingMetricHook(
       tensors=tensors_to_log,
-      log_dir=benchmark_log_dir,
+      metric_logger=logger.get_benchmark_logger(),
       every_n_secs=every_n_secs)
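
With this change, get_logging_metric_hook no longer rejects a missing log directory. A sketch of the new usage (the every_n_secs value is illustrative):

    from official.utils.logs import hooks_helper

    # benchmark_log_dir=None now configures the default STDOUT logger
    # rather than raising ValueError; the hook receives the shared
    # metric_logger instead of a raw directory path.
    hook = hooks_helper.get_logging_metric_hook(benchmark_log_dir=None,
                                                every_n_secs=600)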

official/utils/logs/logger.py

Lines changed: 95 additions & 25 deletions
@@ -27,6 +27,7 @@
 import multiprocessing
 import numbers
 import os
+import threading

 import tensorflow as tf
 from tensorflow.python.client import device_lib

@@ -36,27 +37,48 @@
 _DATE_TIME_FORMAT_PATTERN = "%Y-%m-%dT%H:%M:%S.%fZ"


-class BenchmarkLogger(object):
-  """Class to log the benchmark information to local disk."""
+# Don't use it directly. Use get_benchmark_logger to access a logger.
+_benchmark_logger = None
+_logger_lock = threading.Lock()

-  def __init__(self, logging_dir):
-    self._logging_dir = logging_dir
-    if not tf.gfile.IsDirectory(self._logging_dir):
-      tf.gfile.MakeDirs(self._logging_dir)

-  def log_estimator_evaluation_result(self, eval_results):
-    """Log the evaluation result for a estimator.
+def config_benchmark_logger(logging_dir):
+  """Config the global benchmark logger"""
+  _logger_lock.acquire()
+  try:
+    global _benchmark_logger
+    if logging_dir:
+      _benchmark_logger = BenchmarkFileLogger(logging_dir)
+    else:
+      _benchmark_logger = BaseBenchmarkLogger()
+  finally:
+    _logger_lock.release()
+  return _benchmark_logger
+
+
+def get_benchmark_logger():
+  if not _benchmark_logger:
+    config_benchmark_logger(None)
+
+  return _benchmark_logger
+
+
+class BaseBenchmarkLogger(object):
+  """Class to log the benchmark information to STDOUT."""
+
+  def log_evaluation_result(self, eval_results):
+    """Log the evaluation result.

-    The evaluate result is a directory that contains metrics defined in
+    The evaluate result is a dictionary that contains metrics defined in
     model_fn. It also contains a entry for global_step which contains the value
     of the global step when evaluation was performed.

     Args:
-      eval_results: dict, the result of evaluate() from a estimator.
+      eval_results: dict, the result of evaluate.
     """
     if not isinstance(eval_results, dict):
-      tf.logging.warning("eval_results should be directory for logging. Got %s",
-                         type(eval_results))
+      tf.logging.warning("eval_results should be dictionary for logging. "
+                         "Got %s", type(eval_results))
       return
     global_step = eval_results[tf.GraphKeys.GLOBAL_STEP]
     for key in sorted(eval_results):

@@ -81,10 +103,45 @@ def log_metric(self, name, value, unit=None, global_step=None, extras=None):
       tf.logging.warning(
           "Metric value to log should be a number. Got %s", type(value))
       return
-    if extras:
-      extras = [{"name": k, "value": v} for k, v in sorted(extras.items())]
-    else:
-      extras = []
+    extras = _convert_to_json_dict(extras)
+
+    tf.logging.info("Benchmark metric: "
+                    "Name %s, value %d, unit %s, global_step %d, extras %s",
+                    name, value, unit, global_step, extras)
+
+  def log_run_info(self, model_name):
+    tf.logging.info("Benchmark run: %s", _gather_run_info(model_name))
+
+
+class BenchmarkFileLogger(BaseBenchmarkLogger):
+  """Class to log the benchmark information to local disk."""
+
+  def __init__(self, logging_dir):
+    super(BenchmarkFileLogger, self).__init__()
+    self._logging_dir = logging_dir
+    if not tf.gfile.IsDirectory(self._logging_dir):
+      tf.gfile.MakeDirs(self._logging_dir)
+
+  def log_metric(self, name, value, unit=None, global_step=None, extras=None):
+    """Log the benchmark metric information to local file.
+
+    Currently the logging is done in a synchronized way. This should be updated
+    to log asynchronously.
+
+    Args:
+      name: string, the name of the metric to log.
+      value: number, the value of the metric. The value will not be logged if it
+        is not a number type.
+      unit: string, the unit of the metric, E.g "image per second".
+      global_step: int, the global_step when the metric is logged.
+      extras: map of string:string, the extra information about the metric.
+    """
+    if not isinstance(value, numbers.Number):
+      tf.logging.warning(
+          "Metric value to log should be a number. Got %s", type(value))
+      return
+    extras = _convert_to_json_dict(extras)
+
     with tf.gfile.GFile(
         os.path.join(self._logging_dir, METRIC_LOG_FILE_NAME), "a") as f:
       metric = {

@@ -110,15 +167,7 @@ def log_run_info(self, model_name):
     Args:
       model_name: string, the name of the model.
     """
-    run_info = {
-        "model_name": model_name,
-        "machine_config": {},
-        "run_date": datetime.datetime.now().strftime(_DATE_TIME_FORMAT_PATTERN)}
-    _collect_tensorflow_info(run_info)
-    _collect_tensorflow_environment_variables(run_info)
-    _collect_cpu_info(run_info)
-    _collect_gpu_info(run_info)
-    _collect_memory_info(run_info)
+    run_info = _gather_run_info(model_name)

     with tf.gfile.GFile(os.path.join(
         self._logging_dir, BENCHMARK_RUN_LOG_FILE_NAME), "w") as f:

@@ -130,6 +179,20 @@ def log_run_info(self, model_name):
           e)


+def _gather_run_info(model_name):
+  """Collect the benchmark run information for the local environment."""
+  run_info = {
+      "model_name": model_name,
+      "machine_config": {},
+      "run_date": datetime.datetime.now().strftime(_DATE_TIME_FORMAT_PATTERN)}
+  _collect_tensorflow_info(run_info)
+  _collect_tensorflow_environment_variables(run_info)
+  _collect_cpu_info(run_info)
+  _collect_gpu_info(run_info)
+  _collect_memory_info(run_info)
+  return run_info
+
+
 def _collect_tensorflow_info(run_info):
   run_info["tensorflow_version"] = {
       "version": tf.VERSION, "git_hash": tf.GIT_VERSION}

@@ -194,3 +257,10 @@ def _parse_gpu_model(physical_device_desc):
     if k.strip() == "name":
       return v.strip()
   return None
+
+
+def _convert_to_json_dict(input_dict):
+  if input_dict:
+    return [{"name": k, "value": v} for k, v in sorted(input_dict.items())]
+  else:
+    return []
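
Taken together, the module now behaves as a thread-safe singleton. A minimal sketch of the two entry points (the "/tmp/benchmark" path, metric name, and extras are hypothetical):

    from official.utils.logs import logger

    # Configuring with a directory installs a BenchmarkFileLogger that
    # appends metric records to metric.log under that directory.
    file_logger = logger.config_benchmark_logger("/tmp/benchmark")
    file_logger.log_metric("accuracy", 0.999, global_step=10000,
                           extras={"dataset": "imagenet"})

    # Every later call site observes the same instance; reconfiguration
    # is guarded by _logger_lock.
    assert logger.get_benchmark_logger() is file_logger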

official/utils/logs/logger_test.py

Lines changed: 53 additions & 11 deletions
@@ -31,16 +31,58 @@

 class BenchmarkLoggerTest(tf.test.TestCase):

+  def test_get_default_benchmark_logger(self):
+    self.assertIsInstance(logger.get_benchmark_logger(),
+                          logger.BaseBenchmarkLogger)
+
+  def test_config_base_benchmark_logger(self):
+    logger.config_benchmark_logger("")
+    self.assertIsInstance(logger.get_benchmark_logger(),
+                          logger.BaseBenchmarkLogger)
+
+  def test_config_benchmark_file_logger(self):
+    logger.config_benchmark_logger("/tmp/abc")
+    self.assertIsInstance(logger.get_benchmark_logger(),
+                          logger.BenchmarkFileLogger)
+
+
+class BaseBenchmarkLoggerTest(tf.test.TestCase):
+
+  def setUp(self):
+    super(BaseBenchmarkLoggerTest, self).setUp()
+    self._actual_log = tf.logging.info
+    self.logged_message = None
+
+    def mock_log(*args, **kwargs):
+      self.logged_message = args
+      self._actual_log(*args, **kwargs)
+
+    tf.logging.info = mock_log
+
+  def tearDown(self):
+    super(BaseBenchmarkLoggerTest, self).tearDown()
+    tf.logging.info = self._actual_log
+
+  def test_log_metric(self):
+    log = logger.BaseBenchmarkLogger()
+    log.log_metric("accuracy", 0.999, global_step=1e4, extras={"name": "value"})
+
+    expected_log_prefix = "Benchmark metric:"
+    self.assertRegexpMatches(str(self.logged_message), expected_log_prefix)
+
+
+class BenchmarkFileLoggerTest(tf.test.TestCase):
+
   def setUp(self):
-    super(BenchmarkLoggerTest, self).setUp()
+    super(BenchmarkFileLoggerTest, self).setUp()
     # Avoid pulling extra env vars from test environment which affects the test
     # result, eg. Kokoro test has a TF_PKG env which affect the test case
     # test_collect_tensorflow_environment_variables()
     self.original_environ = dict(os.environ)
     os.environ.clear()

   def tearDown(self):
-    super(BenchmarkLoggerTest, self).tearDown()
+    super(BenchmarkFileLoggerTest, self).tearDown()
     tf.gfile.DeleteRecursively(self.get_temp_dir())
     os.environ.clear()
     os.environ.update(self.original_environ)

@@ -49,12 +91,12 @@ def test_create_logging_dir(self):
     non_exist_temp_dir = os.path.join(self.get_temp_dir(), "unknown_dir")
     self.assertFalse(tf.gfile.IsDirectory(non_exist_temp_dir))

-    logger.BenchmarkLogger(non_exist_temp_dir)
+    logger.BenchmarkFileLogger(non_exist_temp_dir)
     self.assertTrue(tf.gfile.IsDirectory(non_exist_temp_dir))

   def test_log_metric(self):
     log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
-    log = logger.BenchmarkLogger(log_dir)
+    log = logger.BenchmarkFileLogger(log_dir)
     log.log_metric("accuracy", 0.999, global_step=1e4, extras={"name": "value"})

     metric_log = os.path.join(log_dir, "metric.log")

@@ -69,7 +111,7 @@ def test_log_metric(self):

   def test_log_multiple_metrics(self):
     log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
-    log = logger.BenchmarkLogger(log_dir)
+    log = logger.BenchmarkFileLogger(log_dir)
     log.log_metric("accuracy", 0.999, global_step=1e4, extras={"name": "value"})
     log.log_metric("loss", 0.02, global_step=1e4)

@@ -90,9 +132,9 @@ def test_log_multiple_metrics(self):
     self.assertEqual(loss["global_step"], 1e4)
     self.assertEqual(loss["extras"], [])

-  def test_log_non_nubmer_value(self):
+  def test_log_non_number_value(self):
     log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
-    log = logger.BenchmarkLogger(log_dir)
+    log = logger.BenchmarkFileLogger(log_dir)
     const = tf.constant(1)
     log.log_metric("accuracy", const)

@@ -104,8 +146,8 @@ def test_log_evaluation_result(self):
         "global_step": 207082,
         "accuracy": 0.9285}
     log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
-    log = logger.BenchmarkLogger(log_dir)
-    log.log_estimator_evaluation_result(eval_result)
+    log = logger.BenchmarkFileLogger(log_dir)
+    log.log_evaluation_result(eval_result)

     metric_log = os.path.join(log_dir, "metric.log")
     self.assertTrue(tf.gfile.Exists(metric_log))

@@ -125,8 +167,8 @@ def test_log_evaluation_result(self):
   def test_log_evaluation_result_with_invalid_type(self):
     eval_result = "{'loss': 0.46237424, 'global_step': 207082}"
     log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
-    log = logger.BenchmarkLogger(log_dir)
-    log.log_estimator_evaluation_result(eval_result)
+    log = logger.BenchmarkFileLogger(log_dir)
+    log.log_evaluation_result(eval_result)

     metric_log = os.path.join(log_dir, "metric.log")
     self.assertFalse(tf.gfile.Exists(metric_log))

official/utils/logs/metric_hook.py

Lines changed: 5 additions & 14 deletions
@@ -20,8 +20,6 @@

 import tensorflow as tf  # pylint: disable=g-bad-import-order

-from official.utils.logs import logger
-

 class LoggingMetricHook(tf.train.LoggingTensorHook):
   """Hook to log benchmark metric information.

@@ -35,17 +33,15 @@ class LoggingMetricHook(tf.train.LoggingTensorHook):
   whose evaluation produces a side effect such as consuming additional inputs.
   """

-  def __init__(self, tensors, log_dir=None, metric_logger=None,
+  def __init__(self, tensors, metric_logger=None,
                every_n_iter=None, every_n_secs=None, at_end=False):
     """Initializer for LoggingMetricHook.

     Args:
       tensors: `dict` that maps string-valued tags to tensors/tensor names,
         or `iterable` of tensors/tensor names.
-      log_dir: `string`, directory path that metric hook should write log to.
       metric_logger: instance of `BenchmarkLogger`, the benchmark logger that
-        hook should use to write the log. Exactly one of the `log_dir` and
-        `metric_logger` should be provided.
+        hook should use to write the log.
       every_n_iter: `int`, print the values of `tensors` once every N local
         steps taken on the current worker.
       every_n_secs: `int` or `float`, print the values of `tensors` once every N

@@ -66,14 +62,9 @@ def __init__(self, tensors, log_dir=None, metric_logger=None,
         every_n_secs=every_n_secs,
         at_end=at_end)

-    if (log_dir is None) == (metric_logger is None):
-      raise ValueError(
-          "exactly one of log_dir and metric_logger should be provided.")
-
-    if log_dir is not None:
-      self._logger = logger.BenchmarkLogger(log_dir)
-    else:
-      self._logger = metric_logger
+    if metric_logger is None:
+      raise ValueError("metric_logger should be provided.")
+    self._logger = metric_logger

   def begin(self):
     super(LoggingMetricHook, self).begin()
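
The hook now takes its logger by injection only. A hedged construction sketch (the tensor tag 'loss' and the interval are illustrative):

    from official.utils.logs import logger, metric_hook

    # Passing no metric_logger raises ValueError; the shared global
    # logger is the natural thing to inject.
    hook = metric_hook.LoggingMetricHook(
        tensors={'loss': 'loss'},
        metric_logger=logger.get_benchmark_logger(),
        every_n_iter=100)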

official/utils/logs/metric_hook_test.py

Lines changed: 1 addition & 5 deletions
@@ -64,12 +64,8 @@ def test_illegal_args(self):
           tensors=['t'], every_n_iter=5, every_n_secs=5)
     with self.assertRaisesRegexp(ValueError, 'xactly one of'):
       metric_hook.LoggingMetricHook(tensors=['t'])
-    with self.assertRaisesRegexp(ValueError, 'log_dir and metric_logger'):
+    with self.assertRaisesRegexp(ValueError, 'metric_logger'):
       metric_hook.LoggingMetricHook(tensors=['t'], every_n_iter=5)
-    with self.assertRaisesRegexp(ValueError, 'log_dir and metric_logger'):
-      metric_hook.LoggingMetricHook(
-          tensors=['t'], every_n_iter=5, log_dir=self._log_dir,
-          metric_logger=self._logger)

   def test_print_at_end_only(self):
     with tf.Graph().as_default(), tf.Session() as sess:
