Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 67 additions & 6 deletions luigi/contrib/bigquery_avro.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,10 @@ class BigQueryLoadAvro(BigQueryLoadTask):
"""A helper for loading specifically Avro data into BigQuery from GCS.

Additional goodies - takes field documentation from the input data and propagates it
to BigQuery table description and field descriptions.
to the BigQuery table description and field descriptions. Supports the following Avro schema
types: Primitives, Enums, Records, Arrays, Unions, and Maps. For Map schemas, nested maps
and unions are not supported. For Union schemas, only nested Primitive and Record schemas
are currently supported.

Suitable for use via subclassing: override requires() to return Task(s) that output
to GCS Targets; their paths are expected to be URIs of .avro files or URI prefixes
def get_fields_with_description(bq_fields, avro_fields):
    """Recursively copy Avro ``doc`` strings onto a BigQuery schema.

    :param bq_fields: list of BigQuery schema field dicts (each with
        ``name``/``type`` and, for RECORDs, a nested ``fields`` list).
    :param avro_fields: dict mapping field name -> Avro field object
        (i.e. a record schema's ``fields_dict``).
    :return: new list of BigQuery field dicts with ``description`` (and,
        recursively, nested ``fields``) populated from the Avro docs.
    """
    new_fields = []
    for field in bq_fields:
        avro_field = avro_fields[field[u'name']]
        field_type = type(avro_field.type)

        # The schema kinds are mutually exclusive, hence the elif chain.
        if field_type is avro.schema.PrimitiveSchema:
            # Primitives carry their doc on the enclosing record field.
            field[u'description'] = avro_field.doc

        elif field_type is avro.schema.EnumSchema:
            field[u'description'] = avro_field.type.doc

        elif field_type is avro.schema.RecordSchema:
            field[u'description'] = avro_field.type.doc
            field[u'fields'] = get_fields_with_description(field[u'fields'], avro_field.type.fields_dict)

        elif field_type is avro.schema.ArraySchema:
            field[u'description'] = avro_field.type.items.doc
            field[u'fields'] = get_fields_with_description(field[u'fields'], avro_field.type.items.fields_dict)

        elif field_type is avro.schema.UnionSchema:
            for schema in avro_field.type.schemas:
                if type(schema) is avro.schema.PrimitiveSchema:
                    field[u'description'] = avro_field.doc
                elif type(schema) is avro.schema.RecordSchema:
                    field[u'description'] = schema.doc
                    field[u'fields'] = get_fields_with_description(field[u'fields'], schema.fields_dict)
                # Enums, Arrays, Maps, and Unions nested inside a union
                # are not yet supported.

        elif field_type is avro.schema.MapSchema:
            field[u'description'] = avro_field.doc

            # The BigQuery Avro loader creates artificial 'key' and 'value'
            # attributes in the BigQuery schema; we ignore the key and
            # operate directly on the value (the last sub-field).
            # https://cloud.google.com/bigquery/data-formats#avro_format
            bq_map_value_field = field[u'fields'][-1]
            avro_map_value = avro_field.type.values
            value_field_type = type(avro_map_value)

            # Primitive values: the map element has no doc attribute, so
            # primitive value attributes cannot carry documentation.

            if value_field_type is avro.schema.EnumSchema:
                # avro_map_value IS the EnumSchema, so the doc lives directly
                # on it; ``avro_map_value.type`` is just the string 'enum'
                # and has no ``doc`` attribute (previous code raised here).
                bq_map_value_field[u'description'] = avro_map_value.doc

            elif value_field_type is avro.schema.RecordSchema:
                bq_map_value_field[u'description'] = avro_map_value.doc
                # Jump into the map value record and document its fields.
                bq_map_value_field[u'fields'] = get_fields_with_description(
                    bq_map_value_field[u'fields'], avro_map_value.fields_dict)

            elif value_field_type is avro.schema.ArraySchema:
                bq_map_value_field[u'description'] = avro_map_value.items.doc
                bq_map_value_field[u'fields'] = get_fields_with_description(
                    bq_map_value_field[u'fields'], avro_map_value.items.fields_dict)

            # Unions and maps nested inside a map are not yet supported.

        new_fields.append(field)
    return new_fields

Expand All @@ -113,4 +174,4 @@ def run(self):
try:
self._set_output_doc(self._get_input_schema())
except Exception as e:
logger.info('Could not propagate Avro doc to BigQuery table field descriptions: %r', e)
logger.warning('Could not propagate Avro doc to BigQuery table field descriptions: %r', e)
247 changes: 215 additions & 32 deletions test/contrib/bigquery_gcloud_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,32 +230,188 @@ class BigQueryLoadAvroTest(unittest.TestCase):
def _produce_test_input(self):
    """Write ``tmp.avro`` exercising every supported Avro doc case —
    maps (record/primitive values), enums, records, unions, and arrays —
    then upload it to the test GCS directory.
    """
    schema = avro.schema.parse("""
    {
        "type": "record",
        "name": "TrackEntity2",
        "namespace": "com.spotify.entity.schema",
        "doc": "Track entity merged from various sources",
        "fields": [
            {
                "name": "map_record",
                "type": {
                    "type": "map",
                    "values": {
                        "type": "record",
                        "name": "MapNestedRecordObj",
                        "doc": "Nested Record in a map doc",
                        "fields": [
                            {"name": "element1", "type": "string", "doc": "element 1 doc"},
                            {"name": "element2", "type": ["null", "string"], "doc": "element 2 doc"}
                        ]
                    }
                },
                "doc": "doc for map"
            },
            {
                "name": "additional",
                "type": {"type": "map", "values": "string"},
                "doc": "doc for second map record"
            },
            {"name": "track_gid", "type": "string", "doc": "Track GID in hexadecimal string"},
            {"name": "track_uri", "type": "string", "doc": "Track URI in base62 string"},
            {
                "name": "Suit",
                "type": {
                    "type": "enum",
                    "name": "Suit",
                    "doc": "enum documentation broz",
                    "symbols": ["SPADES", "HEARTS", "DIAMONDS", "CLUBS"]
                }
            },
            {
                "name": "FakeRecord",
                "type": {
                    "type": "record",
                    "name": "FakeRecord",
                    "namespace": "com.spotify.data.types.coolType",
                    "doc": "My Fake Record doc",
                    "fields": [
                        {"name": "coolName", "type": "string", "doc": "Cool Name doc"}
                    ]
                }
            },
            {
                "name": "master_metadata",
                "type": [
                    "null",
                    {
                        "type": "record",
                        "name": "MasterMetadata",
                        "namespace": "com.spotify.data.types.metadata",
                        "doc": "metadoc",
                        "fields": [
                            {
                                "name": "track",
                                "type": [
                                    "null",
                                    {
                                        "type": "record",
                                        "name": "Track",
                                        "doc": "Sqoop import of track",
                                        "fields": [
                                            {
                                                "name": "id",
                                                "type": ["null", "int"],
                                                "doc": "id description field",
                                                "default": null,
                                                "columnName": "id",
                                                "sqlType": "4"
                                            },
                                            {
                                                "name": "name",
                                                "type": ["null", "string"],
                                                "doc": "name description field",
                                                "default": null,
                                                "columnName": "name",
                                                "sqlType": "12"
                                            }
                                        ],
                                        "tableName": "track"
                                    }
                                ],
                                "default": null
                            }
                        ]
                    }
                ]
            },
            {
                "name": "children",
                "type": {
                    "type": "array",
                    "items": {
                        "type": "record",
                        "name": "Child",
                        "doc": "array of children documentation",
                        "fields": [
                            {"name": "name", "type": "string", "doc": "my specific child's doc"}
                        ]
                    }
                }
            }
        ]
    }""")
    # Remove the local scratch file even if the upload below fails.
    self.addCleanup(os.remove, "tmp.avro")
    writer = DataFileWriter(open("tmp.avro", "wb"), DatumWriter(), schema)
    writer.append({
        u'track_gid': u'Cool guid',
        u'map_record': {
            u'Cool key': {
                u'element1': u'element 1 data',
                u'element2': u'element 2 data'
            }
        },
        u'additional': {
            u'key1': u'value1'
        },
        u'master_metadata': {
            u'track': {
                u'id': 1,
                u'name': u'Cool Track Name'
            }
        },
        u'track_uri': u'Totally a url here',
        u'FakeRecord': {
            u'coolName': u'Cool Fake Record Name'
        },
        u'Suit': u'DIAMONDS',
        u'children': [
            {u'name': u'Bob'},
            {u'name': u'Joe'}
        ]
    })
    writer.close()
    self.gcs_client.put("tmp.avro", self.gcs_dir_url + "/tmp.avro")

Expand Down Expand Up @@ -289,12 +445,39 @@ def output(_):
table = self.bq_client.client.tables().get(projectId=PROJECT_ID,
datasetId=DATASET_ID,
tableId=self.table_id).execute()
self.assertEqual(table['description'], 'The description')
self.assertEqual(table['schema']['fields'][0]['description'], 'The bold')
self.assertEqual(table['schema']['fields'][1]['description'], 'This field shall be an inner')
self.assertEqual(table['schema']['fields'][1]['fields'][0]['description'], 'A inner field')
self.assertEqual(table['schema']['fields'][1]['fields'][1]['description'], 'Same name as outer but different doc')
self.assertEqual(table['schema']['fields'][1]['fields'][2]['description'], 'Nullable primitive')
self.assertEqual(table['schema']['fields'][1]['fields'][3]['description'], 'Nullable map')
self.assertEqual(table['schema']['fields'][2]['description'], 'The beautiful')
self.assertFalse('description' in table['schema']['fields'][3])
self.assertEqual(table['description'], 'Track entity merged from various sources')
# First map
self.assertEqual(table['schema']['fields'][0]['description'], 'doc for map')
# key
self.assertFalse('description' in table['schema']['fields'][0]['fields'][0])
# Value
self.assertEqual(table['schema']['fields'][0]['fields'][1]['description'], 'Nested Record in a map doc')
# Value record data
self.assertEqual(table['schema']['fields'][0]['fields'][1]['fields'][0]['description'], 'element 1 doc')
self.assertEqual(table['schema']['fields'][0]['fields'][1]['fields'][1]['description'], 'element 2 doc')

# Second map
self.assertEqual(table['schema']['fields'][1]['description'], 'doc for second map record')
# key
self.assertFalse('description' in table['schema']['fields'][1]['fields'][0])
# Value
self.assertFalse('description' in table['schema']['fields'][1]['fields'][1])

# Several top level Primitive and Enums
self.assertEqual(table['schema']['fields'][2]['description'], 'Track GID in hexadecimal string')
self.assertEqual(table['schema']['fields'][3]['description'], 'Track URI in base62 string')
self.assertEqual(table['schema']['fields'][4]['description'], 'enum documentation broz')

# Nested Record containing primitive
self.assertEqual(table['schema']['fields'][5]['description'], 'My Fake Record doc')
self.assertEqual(table['schema']['fields'][5]['fields'][0]['description'], 'Cool Name doc')

# Union with internal Record
self.assertEqual(table['schema']['fields'][6]['description'], 'metadoc')
self.assertEqual(table['schema']['fields'][6]['fields'][0]['description'], 'Sqoop import of track')
self.assertEqual(table['schema']['fields'][6]['fields'][0]['fields'][0]['description'], 'id description field')
self.assertEqual(table['schema']['fields'][6]['fields'][0]['fields'][1]['description'], 'name description field')

# Array of Primitive
self.assertEqual(table['schema']['fields'][7]['description'], 'array of children documentation')
self.assertEqual(table['schema']['fields'][7]['fields'][0]['description'], 'my specific child\'s doc')