#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""BigTable connector
This module implements writing to BigTable tables.
The default mode is to set row data to write to BigTable tables.
The syntax supported is described here:
https://cloud.google.com/bigtable/docs/quickstart-cbt
BigTable connector can be used as main outputs. A main output
(common case) is expected to be massive and will be split into
manageable chunks and processed in parallel. In the example below
we created a list of rows then passed to the GeneratedDirectRows
DoFn to set the Cells and then we call the BigTableWriteFn to insert
those generated rows in the table.
main_table = (p
| beam.Create(self._generate())
| WriteToBigTable(project_id,
instance_id,
table_id))
"""
# pytype: skip-file
import logging
import struct
from typing import Dict
from typing import List
import apache_beam as beam
from apache_beam.internal.metrics.metric import ServiceCallMetric
from apache_beam.io.gcp import resource_identifiers
from apache_beam.metrics import Metrics
from apache_beam.metrics import monitoring_infos
from apache_beam.metrics.metric import Lineage
from apache_beam.transforms import PTransform
from apache_beam.transforms.display import DisplayDataItem
from apache_beam.transforms.external import BeamJarExpansionService
from apache_beam.transforms.external import SchemaAwareExternalTransform
from apache_beam.typehints.row_type import RowTypeConstraint
_LOGGER = logging.getLogger(__name__)
FLUSH_COUNT = 1000
MAX_ROW_BYTES = 5242880 # 5MB
try:
from google.cloud.bigtable import Client
from google.cloud.bigtable.batcher import MutationsBatcher
from google.cloud.bigtable.row import Cell
from google.cloud.bigtable.row import PartialRowData
except ImportError:
_LOGGER.warning(
'ImportError: from google.cloud.bigtable import Client', exc_info=True)
__all__ = ['WriteToBigTable', 'ReadFromBigtable']
class _BigTableWriteFn(beam.DoFn):
""" Creates the connector can call and add_row to the batcher using each
row in beam pipe line
Args:
project_id(str): GCP Project ID
instance_id(str): GCP Instance ID
table_id(str): GCP Table ID
flush_count(int): Max number of rows to flush
max_row_bytes(int) Max number of row mutations size to flush
"""
def __init__(
self, project_id, instance_id, table_id, flush_count, max_row_bytes):
""" Constructor of the Write connector of Bigtable
Args:
project_id(str): GCP Project of to write the Rows
instance_id(str): GCP Instance to write the Rows
table_id(str): GCP Table to write the `DirectRows`
flush_count(int): Max number of rows to flush
max_row_bytes(int) Max number of row mutations size to flush
"""
super().__init__()
self.beam_options = {
'project_id': project_id,
'instance_id': instance_id,
'table_id': table_id,
'flush_count': flush_count,
'max_row_bytes': max_row_bytes,
}
self.table = None
self.batcher = None
self.service_call_metric = None
self.written = Metrics.counter(self.__class__, 'Written Row')
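  # Only the configuration options are pickled when this DoFn is shipped to
  # workers; the table, batcher and metric objects are re-created on the
  # worker in __setstate__() and start_bundle().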
def __getstate__(self):
return self.beam_options
def __setstate__(self, options):
self.beam_options = options
self.table = None
self.batcher = None
self.service_call_metric = None
self.written = Metrics.counter(self.__class__, 'Written Row')
def write_mutate_metrics(self, status_list):
for status in status_list:
code = status.code if status else None
grpc_status_string = (
ServiceCallMetric.bigtable_error_code_to_grpc_status_string(code))
self.service_call_metric.call(grpc_status_string)
def start_service_call_metrics(self, project_id, instance_id, table_id):
resource = resource_identifiers.BigtableTable(
project_id, instance_id, table_id)
labels = {
monitoring_infos.SERVICE_LABEL: 'BigTable',
# TODO(JIRA-11985): Add Ptransform label.
monitoring_infos.METHOD_LABEL: 'google.bigtable.v2.MutateRows',
monitoring_infos.RESOURCE_LABEL: resource,
monitoring_infos.BIGTABLE_PROJECT_ID_LABEL: (
self.beam_options['project_id']),
monitoring_infos.INSTANCE_ID_LABEL: self.beam_options['instance_id'],
monitoring_infos.TABLE_ID_LABEL: self.beam_options['table_id']
}
return ServiceCallMetric(
request_count_urn=monitoring_infos.API_REQUEST_COUNT_URN,
base_labels=labels)
def start_bundle(self):
if self.table is None:
client = Client(project=self.beam_options['project_id'])
instance = client.instance(self.beam_options['instance_id'])
self.table = instance.table(self.beam_options['table_id'])
self.service_call_metric = self.start_service_call_metrics(
self.beam_options['project_id'],
self.beam_options['instance_id'],
self.beam_options['table_id'])
self.batcher = MutationsBatcher(
self.table,
batch_completed_callback=self.write_mutate_metrics,
flush_count=self.beam_options['flush_count'],
max_row_bytes=self.beam_options['max_row_bytes'])
def process(self, row):
self.written.inc()
    # Set an explicit timestamp on the cells of this row object. On a retry
    # the same object is mutated again, and the explicit timestamp ensures the
    # retried cells overwrite the previous attempt instead of being written as
    # new cell versions.
    # Example:
    # direct_row.set_cell('cf1',
    #                     'field1',
    #                     'value1',
    #                     timestamp=datetime.now())
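    # The MutationsBatcher buffers this row and flushes the buffered mutations
    # to Bigtable once flush_count rows or max_row_bytes bytes have
    # accumulated; anything still buffered is flushed when the batcher is
    # closed in finish_bundle().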
self.batcher.mutate(row)
def finish_bundle(self):
if self.batcher:
self.batcher.close()
self.batcher = None
# Report Lineage metrics on write
Lineage.sinks().add(
'bigtable',
self.beam_options['project_id'],
self.beam_options['instance_id'],
self.beam_options['table_id'])
def display_data(self):
return {
'projectId': DisplayDataItem(
self.beam_options['project_id'], label='Bigtable Project Id'),
'instanceId': DisplayDataItem(
self.beam_options['instance_id'], label='Bigtable Instance Id'),
'tableId': DisplayDataItem(
self.beam_options['table_id'], label='Bigtable Table Id')
}
class WriteToBigTable(beam.PTransform):
"""A transform that writes rows to a Bigtable table.
Takes an input PCollection of `DirectRow` objects containing un-committed
mutations. For more information about this row object, visit
https://cloud.google.com/python/docs/reference/bigtable/latest/row#class-googlecloudbigtablerowdirectrowrowkey-tablenone
If flag `use_cross_language` is set to true, this transform will use the
multi-language transforms framework to inject the Java native write transform
into the pipeline.
"""
URN = "beam:schematransform:org.apache.beam:bigtable_write:v1"
def __init__(
self,
project_id,
instance_id,
table_id,
use_cross_language=False,
expansion_service=None,
flush_count=FLUSH_COUNT,
max_row_bytes=MAX_ROW_BYTES,
):
"""Initialize an WriteToBigTable transform.
:param table_id:
The ID of the table to write to.
:param instance_id:
The ID of the instance where the table resides.
:param project_id:
The GCP project ID.
:param use_cross_language:
If set to True, will use the Java native transform via cross-language.
:param expansion_service:
The address of the expansion service in the case of using cross-language.
If no expansion service is provided, will attempt to run the default GCP
expansion service.
:type flush_count: int
:param flush_count: (Optional) Max number of rows to flush.
Default is FLUSH_COUNT (1000 rows).
:type max_row_bytes: int
:param max_row_bytes: (Optional) Max number of row mutations size to flush.
Default is MAX_ROW_BYTES (5 MB).
"""
super().__init__()
self._table_id = table_id
self._instance_id = instance_id
self._project_id = project_id
self._use_cross_language = use_cross_language
if use_cross_language:
self._expansion_service = (
expansion_service or BeamJarExpansionService(
'sdks:java:io:google-cloud-platform:expansion-service:build'))
self.schematransform_config = (
SchemaAwareExternalTransform.discover_config(
self._expansion_service, self.URN))
self._flush_count = flush_count
self._max_row_bytes = max_row_bytes
def expand(self, input):
if self._use_cross_language:
external_write = SchemaAwareExternalTransform(
identifier=self.schematransform_config.identifier,
expansion_service=self._expansion_service,
rearrange_based_on_discovery=True,
table_id=self._table_id,
instance_id=self._instance_id,
project_id=self._project_id)
return (
input
| beam.ParDo(self._DirectRowMutationsToBeamRow()).with_output_types(
RowTypeConstraint.from_fields(
[("key", bytes), ("mutations", List[Dict[str, bytes]])]))
| external_write)
else:
return (
input
| beam.ParDo(
_BigTableWriteFn(
self._project_id,
self._instance_id,
self._table_id,
flush_count=self._flush_count,
max_row_bytes=self._max_row_bytes)))
class _DirectRowMutationsToBeamRow(beam.DoFn):
def process(self, direct_row):
args = {"key": direct_row.row_key, "mutations": []}
# start accumulating mutations in a list
for mutation in direct_row._get_mutations():
        if "set_cell" in mutation:
mutation_dict = {
"type": b'SetCell',
"family_name": mutation.set_cell.family_name.encode('utf-8'),
"column_qualifier": mutation.set_cell.column_qualifier,
"value": mutation.set_cell.value,
"timestamp_micros": struct.pack(
'>q', mutation.set_cell.timestamp_micros)
}
elif mutation.__contains__("delete_from_column"):
mutation_dict = {
"type": b'DeleteFromColumn',
"family_name": mutation.delete_from_column.family_name.encode(
'utf-8'),
"column_qualifier": mutation.delete_from_column.column_qualifier
}
time_range = mutation.delete_from_column.time_range
if time_range.start_timestamp_micros:
mutation_dict['start_timestamp_micros'] = struct.pack(
'>q', time_range.start_timestamp_micros)
if time_range.end_timestamp_micros:
mutation_dict['end_timestamp_micros'] = struct.pack(
'>q', time_range.end_timestamp_micros)
elif mutation.__contains__("delete_from_family"):
mutation_dict = {
"type": b'DeleteFromFamily',
"family_name": mutation.delete_from_family.family_name.encode(
'utf-8')
}
elif mutation.__contains__("delete_from_row"):
mutation_dict = {"type": b'DeleteFromRow'}
else:
raise ValueError("Unexpected mutation")
args["mutations"].append(mutation_dict)
yield beam.Row(**args)
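      # For reference, a DirectRow carrying a single set_cell mutation is
      # converted into a Beam Row roughly equivalent to the following
      # (illustrative values):
      #
      #   beam.Row(
      #       key=b'row-key',
      #       mutations=[{
      #           'type': b'SetCell',
      #           'family_name': b'cf1',
      #           'column_qualifier': b'field1',
      #           'value': b'value1',
      #           'timestamp_micros': struct.pack('>q', 1234567890000000),
      #       }])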
class ReadFromBigtable(PTransform):
"""Reads rows from Bigtable.
Returns a PCollection of PartialRowData objects, each representing a
Bigtable row. For more information about this row object, visit
https://cloud.google.com/python/docs/reference/bigtable/latest/row#class-googlecloudbigtablerowpartialrowdatarowkey
"""
URN = "beam:schematransform:org.apache.beam:bigtable_read:v1"
def __init__(self, project_id, instance_id, table_id, expansion_service=None):
"""Initialize a ReadFromBigtable transform.
:param table_id:
The ID of the table to read from.
:param instance_id:
The ID of the instance where the table resides.
:param project_id:
The GCP project ID.
:param expansion_service:
The address of the expansion service. If no expansion service is
provided, will attempt to run the default GCP expansion service.
"""
super().__init__()
self._table_id = table_id
self._instance_id = instance_id
self._project_id = project_id
self._expansion_service = (
expansion_service or BeamJarExpansionService(
'sdks:java:io:google-cloud-platform:expansion-service:build'))
self.schematransform_config = SchemaAwareExternalTransform.discover_config(
self._expansion_service, self.URN)
def expand(self, input):
external_read = SchemaAwareExternalTransform(
identifier=self.schematransform_config.identifier,
expansion_service=self._expansion_service,
rearrange_based_on_discovery=True,
table_id=self._table_id,
instance_id=self._instance_id,
project_id=self._project_id,
flatten=False)
return (
input.pipeline
| external_read
| beam.ParDo(self._BeamRowToPartialRowData()))
# PartialRowData has some useful methods for querying data within a row.
# To make use of those methods and to give Python users a more familiar
# object, we process each Beam Row and return a PartialRowData equivalent.
class _BeamRowToPartialRowData(beam.DoFn):
def process(self, row):
key = row.key
families = row.column_families
# initialize PartialRowData object
partial_row: PartialRowData = PartialRowData(key)
for fam_name, col_fam in families.items():
if fam_name not in partial_row.cells:
partial_row.cells[fam_name] = {}
for col_qualifier, cells in col_fam.items():
# store column qualifier as bytes to follow PartialRowData behavior
col_qualifier_bytes = col_qualifier.encode()
          if col_qualifier_bytes not in partial_row.cells[fam_name]:
partial_row.cells[fam_name][col_qualifier_bytes] = []
for cell in cells:
value = cell.value
timestamp_micros = cell.timestamp_micros
partial_row.cells[fam_name][col_qualifier_bytes].append(
Cell(value, timestamp_micros))
yield partial_row
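      # Downstream DoFns can query the yielded PartialRowData with its usual
      # accessors, e.g. (the family and qualifier names are illustrative):
      #
      #   value = partial_row.cell_value('cf1', b'field1')
      #   cells = partial_row.cells['cf1'][b'field1']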