#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Python worker logging."""
# pytype: skip-file
# mypy: disallow-untyped-defs
import contextlib
import json
import logging
import threading
import traceback
from typing import Any
from typing import Dict
from typing import Iterator
from typing import List

from apache_beam.runners.worker import statesampler


# Per-thread worker information. This is used only for logging, to set
# context information that changes as work items are executed:
# work_item_id, step_name, stage_name.
class _PerThreadWorkerData(threading.local):
def __init__(self) -> None:
super().__init__()
    # TODO(robertwb): Consider starting with an initial (ignored) ~20 elements
    # in the list, as going up and down all the way to zero incurs several
    # reallocations.
self.stack: List[Dict[str, Any]] = []

  def get_data(self) -> Dict[str, Any]:
all_data = {}
for datum in self.stack:
all_data.update(datum)
return all_data

per_thread_worker_data = _PerThreadWorkerData()

@contextlib.contextmanager
def PerThreadLoggingContext(**kwargs: Any) -> Iterator[None]:
"""A context manager to add per thread attributes."""
stack = per_thread_worker_data.stack
stack.append(kwargs)
try:
yield
finally:
stack.pop()
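

# A minimal usage sketch (the attribute values below are illustrative).
# Contexts nest: get_data() merges every frame on the stack, and inner
# values win for duplicate keys while the inner block is active.
#
#   with PerThreadLoggingContext(work_item_id='work-1'):
#     with PerThreadLoggingContext(stage_name='stage-1'):
#       logging.info('record carries both work_item_id and stage_name')
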
class JsonLogFormatter(logging.Formatter):
"""A JSON formatter class as expected by the logging standard module."""
def __init__(self, job_id: str, worker_id: str) -> None:
super().__init__()
self.job_id = job_id
self.worker_id = worker_id

  def format(self, record: logging.LogRecord) -> str:
    """Returns a JSON string based on a LogRecord instance.

    Args:
      record: A LogRecord instance. See below for details.

    Returns:
      A JSON string representing the record.

    A LogRecord instance has the following attributes and is used for
    formatting the final message.

    Attributes:
      created: A double representing the timestamp of record creation
        (e.g., 1438365207.624597). Note that the number also carries the
        msecs and microsecs. Part of this is also available in the 'msecs'
        attribute.
      msecs: A double representing the msecs part of the record creation
        (e.g., 624.5970726013184).
      msg: Logging message containing formatting instructions or an arbitrary
        object. This is the first argument of a log call.
      args: A tuple containing the positional arguments for the logging call.
      levelname: A string. Possible values are: INFO, WARNING, ERROR, etc.
      exc_info: None or a 3-tuple with exception information as it is
        returned by a call to sys.exc_info().
      name: Logger's name. Most logging is done using the default root logger
        and therefore the name will be 'root'.
      filename: Basename of the file where logging occurred.
      funcName: Name of the function where logging occurred.
      process: The PID of the process running the worker.
      thread: An id for the thread where the record was logged. This is not a
        real TID (the one provided by the OS) but rather the id (address) of
        the Python thread object. Nevertheless, having this value makes it
        possible to filter log statements down to a single thread.
    """
output: Dict[str, Any] = {}
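    # Split the creation time into whole seconds and nanoseconds. For example,
    # with the docstring values created=1438365207.624597 and
    # msecs=624.5970726013184, this yields
    # {'seconds': 1438365207, 'nanos': 624597072}.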
output['timestamp'] = {
'seconds': int(record.created), 'nanos': int(record.msecs * 1000000)
}
    # The ERROR, INFO, and DEBUG log levels translate unchanged into the
    # severity property. WARNING becomes WARN.
output['severity'] = (
record.levelname if record.levelname != 'WARNING' else 'WARN')
    # msg could be an arbitrary object; convert it to a string first.
record_msg = str(record.msg)
# Prepare the actual message using the message formatting string and the
# positional arguments as they have been used in the log call.
if record.args:
try:
output['message'] = record_msg % record.args
except (TypeError, ValueError):
output['message'] = '%s with args (%s)' % (record_msg, record.args)
else:
output['message'] = record_msg
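    # For example (illustrative call), logging.info('leased %d items', 5)
    # arrives here as msg='leased %d items' and args=(5,) and formats to
    # 'leased 5 items'; mismatched args take the '%s with args (%s)' fallback
    # above instead of raising.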
# The thread ID is logged as a combination of the process ID and thread ID
# since workers can run in multiple processes.
output['thread'] = '%s:%s' % (record.process, record.thread)
    # Job ID and worker ID. These do not change during the lifetime of
    # a worker.
output['job'] = self.job_id
output['worker'] = self.worker_id
    # Stage, step, and work item ID come from thread-local storage, since they
    # change with every new work item leased for execution. If there is no
    # work item ID, we make sure the step is undefined too.
data = per_thread_worker_data.get_data()
if 'work_item_id' in data:
output['work'] = data['work_item_id']
tracker = statesampler.get_current_tracker()
if tracker:
output['stage'] = tracker.stage_name
if tracker.current_state() and tracker.current_state().name_context:
output['step'] = tracker.current_state().name_context.logging_name()
    # All logging happens using the root logger. We add the basename of the
    # file and the function name where the logging happened to make it easier
    # to identify which code generated the record.
output['logger'] = '%s:%s:%s' % (
record.name, record.filename, record.funcName)
# Add exception information if any is available.
if record.exc_info:
output['exception'] = ''.join(
traceback.format_exception(*record.exc_info))
return json.dumps(output)
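

# Sketch of the JSON emitted for a simple INFO record (all field values are
# illustrative, not captured from a real worker). The 'work', 'stage' and
# 'step' fields appear only when per-thread data and a state sampler tracker
# are available.
#
#   {"timestamp": {"seconds": 1438365207, "nanos": 624597072},
#    "severity": "INFO",
#    "message": "leased 5 items",
#    "thread": "4321:140001234567890",
#    "job": "job-1",
#    "worker": "worker-1",
#    "logger": "root:sdk_worker.py:process_bundle"}
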
def initialize(
job_id: str,
worker_id: str,
log_path: str,
log_level: int = logging.INFO) -> None:
"""Initialize root logger so that we log JSON to a file and text to stdout."""
file_handler = logging.FileHandler(log_path)
file_handler.setFormatter(JsonLogFormatter(job_id, worker_id))
logging.getLogger().addHandler(file_handler)
# Default level is set to INFO to avoid logging various DEBUG level log calls
# sprinkled throughout the code.
logging.getLogger().setLevel(log_level)
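

# A minimal wiring sketch (the job ID, worker ID and log path below are
# hypothetical):
#
#   initialize('job-1', 'worker-1', '/tmp/worker.log')
#   logging.info('worker started')  # appended to /tmp/worker.log as JSON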