-
Notifications
You must be signed in to change notification settings - Fork 4.5k
Expand file tree
/
Copy pathcloud_dlp.py
More file actions
227 lines (199 loc) · 8.2 KB
/
cloud_dlp.py
File metadata and controls
227 lines (199 loc) · 8.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""``PTransforms`` that implement Google Cloud Data Loss Prevention
functionality.
"""
import logging
from typing import List
from google.cloud import dlp_v2
from apache_beam import typehints
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.transforms import DoFn
from apache_beam.transforms import ParDo
from apache_beam.transforms import PTransform
__all__ = ['MaskDetectedDetails', 'InspectForDetails']
_LOGGER = logging.getLogger(__name__)
@typehints.with_input_types(str)
@typehints.with_output_types(str)
class MaskDetectedDetails(PTransform):
  """Scrubs sensitive information detected in text.

  The ``PTransform`` returns a ``PCollection`` of ``str``.

  Example usage::

    pipeline | MaskDetectedDetails(project='example-gcp-project',
      deidentification_config={
          'info_type_transformations': {
              'transformations': [{
                  'primitive_transformation': {
                      'character_mask_config': {
                          'masking_character': '#'
                      }
                  }
              }]
          }
      }, inspection_config={'info_types': [{'name': 'EMAIL_ADDRESS'}]})

  """
  def __init__(
      self,
      project=None,
      deidentification_template_name=None,
      deidentification_config=None,
      inspection_template_name=None,
      inspection_config=None,
      timeout=None):
    """Initializes a :class:`MaskDetectedDetails` transform.

    Args:
      project: Optional. GCP project name in which inspection will be
        performed. When omitted, the ``project`` pipeline option is used.
      deidentification_template_name (str): Either this or
        `deidentification_config` required, but not both. Name of
        deidentification template to be used on detected sensitive
        information instances in text.
      deidentification_config
        (``Union[dict, google.cloud.dlp_v2.types.DeidentifyConfig]``):
        Configuration for the de-identification of the content item.
        Mutually exclusive with `deidentification_template_name`.
      inspection_template_name (str): This or `inspection_config` required.
        Name of inspection template to be used
        to detect sensitive data in text.
      inspection_config
        (``Union[dict, google.cloud.dlp_v2.types.InspectConfig]``):
        Configuration for the inspector used to detect sensitive data in
        text. If both template name and config are supplied, both are sent
        with the request and config takes precedence.
      timeout (float): Optional. The amount of time, in seconds, to wait for
        the request to complete.

    Raises:
      ValueError: If both or neither of `deidentification_template_name`
        and `deidentification_config` are given, or if neither
        `inspection_template_name` nor `inspection_config` is given.
    """
    self.config = {}
    self.project = project
    self.timeout = timeout

    has_deid_template = deidentification_template_name is not None
    has_deid_config = deidentification_config is not None
    if has_deid_template and has_deid_config:
      raise ValueError(
          'Both deidentification_template_name and '
          'deidentification_config were specified.'
          ' Please specify only one of these.')
    if not has_deid_template and not has_deid_config:
      raise ValueError(
          'deidentification_template_name or '
          'deidentification_config must be specified.')
    if has_deid_template:
      self.config['deidentify_template_name'] = deidentification_template_name
    else:
      self.config['deidentify_config'] = deidentification_config

    if inspection_config is None and inspection_template_name is None:
      raise ValueError(
          'inspection_template_name or inspection_config must be specified')
    # Unlike the de-identification settings, both inspection settings may be
    # supplied together; each one given is forwarded in the request.
    if inspection_template_name is not None:
      self.config['inspect_template_name'] = inspection_template_name
    if inspection_config is not None:
      self.config['inspect_config'] = inspection_config

  def expand(self, pcoll):
    """Applies the de-identification ``DoFn`` to every element of `pcoll`.

    Raises:
      ValueError: If no project was passed to the constructor and the
        ``project`` pipeline option is not set either.
    """
    # Resolve into a local so the transform instance is not mutated; a
    # reused instance then picks up each pipeline's own project option.
    project = self.project
    if project is None:
      project = pcoll.pipeline.options.view_as(GoogleCloudOptions).project
    if project is None:
      raise ValueError(
          'GCP project name needs to be specified in "project" pipeline option')
    return pcoll | ParDo(_DeidentifyFn(self.config, self.timeout, project))
@typehints.with_input_types(str)
@typehints.with_output_types(List[dlp_v2.types.dlp.Finding])
class InspectForDetails(PTransform):
  """Inspects input text for sensitive information.

  The ``PTransform`` returns a ``PCollection`` of
  ``List[google.cloud.dlp_v2.types.dlp.Finding]``.

  Example usage::

    pipeline | InspectForDetails(project='example-gcp-project',
      inspection_config={'info_types': [{'name': 'EMAIL_ADDRESS'}]})

  """
  def __init__(
      self,
      project=None,
      inspection_template_name=None,
      inspection_config=None,
      timeout=None):
    """Initializes a :class:`InspectForDetails` transform.

    Args:
      project: Optional. GCP project name in which inspection will be
        performed. When omitted, the ``project`` pipeline option is used.
      inspection_template_name (str): This or `inspection_config` required.
        Name of inspection template to be used
        to detect sensitive data in text.
      inspection_config
        (``Union[dict, google.cloud.dlp_v2.types.InspectConfig]``):
        Configuration for the inspector used to detect sensitive data in
        text. If both template name and config are supplied, both are sent
        with the request and config takes precedence.
      timeout (float): Optional. The amount of time, in seconds, to wait for
        the request to complete.

    Raises:
      ValueError: If neither `inspection_template_name` nor
        `inspection_config` is given.
    """
    self.timeout = timeout
    self.config = {}
    self.project = project
    if inspection_config is None and inspection_template_name is None:
      raise ValueError(
          'inspection_template_name or inspection_config must be specified')
    # Both settings may be supplied together; each one given is forwarded.
    if inspection_template_name is not None:
      self.config['inspect_template_name'] = inspection_template_name
    if inspection_config is not None:
      self.config['inspect_config'] = inspection_config

  def expand(self, pcoll):
    """Applies the inspection ``DoFn`` to every element of `pcoll`.

    Raises:
      ValueError: If no project was passed to the constructor and the
        ``project`` pipeline option is not set either.
    """
    # Resolve into a local so the transform instance is not mutated; a
    # reused instance then picks up each pipeline's own project option.
    project = self.project
    if project is None:
      project = pcoll.pipeline.options.view_as(GoogleCloudOptions).project
    if project is None:
      raise ValueError(
          'GCP project name needs to be specified in "project" pipeline option')
    return pcoll | ParDo(_InspectFn(self.config, self.timeout, project))
class _DeidentifyFn(DoFn):
  """``DoFn`` that sends each element to DLP ``deidentify_content``.

  Yields the ``item.value`` of each response.
  """
  def __init__(self, config=None, timeout=None, project=None, client=None):
    """Stores the request settings.

    Args:
      config: Optional. Mapping of extra request fields merged into every
        request. ``None`` is treated as an empty mapping.
      timeout: Optional. Passed as ``timeout`` to every client call.
      project: Project used to build the request's ``parent`` path.
      client: Optional. Client to use instead of creating a
        ``dlp_v2.DlpServiceClient`` in :meth:`setup`.
    """
    super().__init__()
    # Guard against the advertised ``None`` default: ``dict.update(None)``
    # in process() would otherwise raise TypeError.
    self.config = config if config is not None else {}
    self.timeout = timeout
    self.client = client
    self.project = project
    self.params = {}
    self.parent = None  # Resolved in setup().

  def setup(self):
    """Creates the client if none was supplied and resolves the parent."""
    # NOTE(review): presumably the client is built here rather than in
    # __init__ so it is not serialized with the DoFn - confirm.
    if self.client is None:
      self.client = dlp_v2.DlpServiceClient()
    self.params = {
        'timeout': self.timeout,
    }
    self.parent = self.client.common_project_path(self.project)

  def process(self, element, **kwargs):
    """De-identifies `element` and yields the resulting item value."""
    request = {'item': {'value': element}, 'parent': self.parent}
    request.update(self.config)
    response = self.client.deidentify_content(request=request, **self.params)
    yield response.item.value
class _InspectFn(DoFn):
  """``DoFn`` that sends each element to DLP ``inspect_content``.

  Yields, per element, a list of the findings in the response.
  """
  def __init__(self, config=None, timeout=None, project=None, client=None):
    """Stores the request settings.

    Args:
      config: Optional. Mapping of extra request fields merged into every
        request. ``None`` is treated as an empty mapping.
      timeout: Optional. Passed as ``timeout`` to every client call.
      project: Project used to build the request's ``parent`` path.
      client: Optional. Client to use instead of creating a
        ``dlp_v2.DlpServiceClient`` in :meth:`setup`. Mirrors the
        parameter accepted by ``_DeidentifyFn``.
    """
    super().__init__()
    # Guard against the advertised ``None`` default: ``dict.update(None)``
    # in process() would otherwise raise TypeError.
    self.config = config if config is not None else {}
    self.timeout = timeout
    self.client = client
    self.project = project
    self.params = {}
    self.parent = None  # Resolved in setup().

  def setup(self):
    """Creates the client if none was supplied and resolves the parent."""
    # NOTE(review): presumably the client is built here rather than in
    # __init__ so it is not serialized with the DoFn - confirm.
    if self.client is None:
      self.client = dlp_v2.DlpServiceClient()
    self.params = {
        'timeout': self.timeout,
    }
    self.parent = self.client.common_project_path(self.project)

  def process(self, element, **kwargs):
    """Inspects `element` and yields the list of findings in the response."""
    request = {'item': {'value': element}, 'parent': self.parent}
    request.update(self.config)
    response = self.client.inspect_content(request=request, **self.params)
    yield list(response.result.findings)