Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 67 additions & 6 deletions luigi/contrib/bigquery_avro.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,10 @@ class BigQueryLoadAvro(BigQueryLoadTask):
"""A helper for loading specifically Avro data into BigQuery from GCS.

Additional goodies - takes field documentation from the input data and propagates it
to BigQuery table description and field descriptions.
to the BigQuery table description and field descriptions. Supports the following Avro schema
types: Primitives, Enums, Records, Arrays, Unions, and Maps. For Map schemas, nested maps
and unions are not supported. For Union schemas, only nested Primitive and Record schemas
are currently supported.

Suitable for use via subclassing: override requires() to return Task(s) that output
to GCS Targets; their paths are expected to be URIs of .avro files or URI prefixes
def get_fields_with_description(bq_fields, avro_fields):
    """Recursively copy Avro ``doc`` strings onto a BigQuery schema.

    :param bq_fields: list of BigQuery schema field dicts (each with
        ``name``/``type`` and, for RECORDs, a nested ``fields`` list).
    :param avro_fields: dict mapping field name -> Avro field object
        (i.e. a record schema's ``fields_dict``).
    :return: new list of BigQuery field dicts with ``description`` (and,
        recursively, nested ``fields``) populated from the Avro docs.
    """
    new_fields = []
    for field in bq_fields:
        avro_field = avro_fields[field[u'name']]
        field_type = type(avro_field.type)

        # The schema kinds are mutually exclusive, hence the elif chain.
        if field_type is avro.schema.PrimitiveSchema:
            # Primitives carry their doc on the enclosing record field.
            field[u'description'] = avro_field.doc

        elif field_type is avro.schema.EnumSchema:
            field[u'description'] = avro_field.type.doc

        elif field_type is avro.schema.RecordSchema:
            field[u'description'] = avro_field.type.doc
            field[u'fields'] = get_fields_with_description(field[u'fields'], avro_field.type.fields_dict)

        elif field_type is avro.schema.ArraySchema:
            field[u'description'] = avro_field.type.items.doc
            field[u'fields'] = get_fields_with_description(field[u'fields'], avro_field.type.items.fields_dict)

        elif field_type is avro.schema.UnionSchema:
            for schema in avro_field.type.schemas:
                if type(schema) is avro.schema.PrimitiveSchema:
                    field[u'description'] = avro_field.doc
                elif type(schema) is avro.schema.RecordSchema:
                    field[u'description'] = schema.doc
                    field[u'fields'] = get_fields_with_description(field[u'fields'], schema.fields_dict)
                # Enums, Arrays, Maps, and Unions nested inside a union
                # are not yet supported.

        elif field_type is avro.schema.MapSchema:
            field[u'description'] = avro_field.doc

            # The BigQuery Avro loader creates artificial 'key' and 'value'
            # attributes in the BigQuery schema; we ignore the key and
            # operate directly on the value (the last sub-field).
            # https://cloud.google.com/bigquery/data-formats#avro_format
            bq_map_value_field = field[u'fields'][-1]
            avro_map_value = avro_field.type.values
            value_field_type = type(avro_map_value)

            # Primitive values: the map element has no doc attribute, so
            # primitive value attributes cannot carry documentation.

            if value_field_type is avro.schema.EnumSchema:
                # avro_map_value IS the EnumSchema, so the doc lives directly
                # on it; ``avro_map_value.type`` is just the string 'enum'
                # and has no ``doc`` attribute (previous code raised here).
                bq_map_value_field[u'description'] = avro_map_value.doc

            elif value_field_type is avro.schema.RecordSchema:
                bq_map_value_field[u'description'] = avro_map_value.doc
                # Jump into the map value record and document its fields.
                bq_map_value_field[u'fields'] = get_fields_with_description(
                    bq_map_value_field[u'fields'], avro_map_value.fields_dict)

            elif value_field_type is avro.schema.ArraySchema:
                bq_map_value_field[u'description'] = avro_map_value.items.doc
                bq_map_value_field[u'fields'] = get_fields_with_description(
                    bq_map_value_field[u'fields'], avro_map_value.items.fields_dict)

            # Unions and maps nested inside a map are not yet supported.

        new_fields.append(field)
    return new_fields

Expand All @@ -113,4 +174,4 @@ def run(self):
try:
self._set_output_doc(self._get_input_schema())
except Exception as e:
logger.info('Could not propagate Avro doc to BigQuery table field descriptions: %r', e)
logger.warning('Could not propagate Avro doc to BigQuery table field descriptions: %r', e)
247 changes: 215 additions & 32 deletions test/contrib/bigquery_gcloud_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,32 +230,188 @@ class BigQueryLoadAvroTest(unittest.TestCase):
def _produce_test_input(self):
    """Write ``tmp.avro`` exercising every supported Avro doc case —
    maps (record/primitive values), enums, records, unions, and arrays —
    then upload it to the test GCS directory.
    """
    schema = avro.schema.parse("""
    {
        "type": "record",
        "name": "TrackEntity2",
        "namespace": "com.spotify.entity.schema",
        "doc": "Track entity merged from various sources",
        "fields": [
            {
                "name": "map_record",
                "type": {
                    "type": "map",
                    "values": {
                        "type": "record",
                        "name": "MapNestedRecordObj",
                        "doc": "Nested Record in a map doc",
                        "fields": [
                            {"name": "element1", "type": "string", "doc": "element 1 doc"},
                            {"name": "element2", "type": ["null", "string"], "doc": "element 2 doc"}
                        ]
                    }
                },
                "doc": "doc for map"
            },
            {
                "name": "additional",
                "type": {"type": "map", "values": "string"},
                "doc": "doc for second map record"
            },
            {"name": "track_gid", "type": "string", "doc": "Track GID in hexadecimal string"},
            {"name": "track_uri", "type": "string", "doc": "Track URI in base62 string"},
            {
                "name": "Suit",
                "type": {
                    "type": "enum",
                    "name": "Suit",
                    "doc": "enum documentation broz",
                    "symbols": ["SPADES", "HEARTS", "DIAMONDS", "CLUBS"]
                }
            },
            {
                "name": "FakeRecord",
                "type": {
                    "type": "record",
                    "name": "FakeRecord",
                    "namespace": "com.spotify.data.types.coolType",
                    "doc": "My Fake Record doc",
                    "fields": [
                        {"name": "coolName", "type": "string", "doc": "Cool Name doc"}
                    ]
                }
            },
            {
                "name": "master_metadata",
                "type": [
                    "null",
                    {
                        "type": "record",
                        "name": "MasterMetadata",
                        "namespace": "com.spotify.data.types.metadata",
                        "doc": "metadoc",
                        "fields": [
                            {
                                "name": "track",
                                "type": [
                                    "null",
                                    {
                                        "type": "record",
                                        "name": "Track",
                                        "doc": "Sqoop import of track",
                                        "fields": [
                                            {
                                                "name": "id",
                                                "type": ["null", "int"],
                                                "doc": "id description field",
                                                "default": null,
                                                "columnName": "id",
                                                "sqlType": "4"
                                            },
                                            {
                                                "name": "name",
                                                "type": ["null", "string"],
                                                "doc": "name description field",
                                                "default": null,
                                                "columnName": "name",
                                                "sqlType": "12"
                                            }
                                        ],
                                        "tableName": "track"
                                    }
                                ],
                                "default": null
                            }
                        ]
                    }
                ]
            },
            {
                "name": "children",
                "type": {
                    "type": "array",
                    "items": {
                        "type": "record",
                        "name": "Child",
                        "doc": "array of children documentation",
                        "fields": [
                            {"name": "name", "type": "string", "doc": "my specific child's doc"}
                        ]
                    }
                }
            }
        ]
    }""")
    # Remove the local scratch file even if the upload below fails.
    self.addCleanup(os.remove, "tmp.avro")
    writer = DataFileWriter(open("tmp.avro", "wb"), DatumWriter(), schema)
    writer.append({
        u'track_gid': u'Cool guid',
        u'map_record': {
            u'Cool key': {
                u'element1': u'element 1 data',
                u'element2': u'element 2 data'
            }
        },
        u'additional': {
            u'key1': u'value1'
        },
        u'master_metadata': {
            u'track': {
                u'id': 1,
                u'name': u'Cool Track Name'
            }
        },
        u'track_uri': u'Totally a url here',
        u'FakeRecord': {
            u'coolName': u'Cool Fake Record Name'
        },
        u'Suit': u'DIAMONDS',
        u'children': [
            {u'name': u'Bob'},
            {u'name': u'Joe'}
        ]
    })
    writer.close()
    self.gcs_client.put("tmp.avro", self.gcs_dir_url + "/tmp.avro")

Expand Down Expand Up @@ -289,12 +445,39 @@ def output(_):
table = self.bq_client.client.tables().get(projectId=PROJECT_ID,
datasetId=DATASET_ID,
tableId=self.table_id).execute()
self.assertEqual(table['description'], 'The description')
self.assertEqual(table['schema']['fields'][0]['description'], 'The bold')
self.assertEqual(table['schema']['fields'][1]['description'], 'This field shall be an inner')
self.assertEqual(table['schema']['fields'][1]['fields'][0]['description'], 'A inner field')
self.assertEqual(table['schema']['fields'][1]['fields'][1]['description'], 'Same name as outer but different doc')
self.assertEqual(table['schema']['fields'][1]['fields'][2]['description'], 'Nullable primitive')
self.assertEqual(table['schema']['fields'][1]['fields'][3]['description'], 'Nullable map')
self.assertEqual(table['schema']['fields'][2]['description'], 'The beautiful')
self.assertFalse('description' in table['schema']['fields'][3])
self.assertEqual(table['description'], 'Track entity merged from various sources')
# First map
self.assertEqual(table['schema']['fields'][0]['description'], 'doc for map')
# key
self.assertFalse('description' in table['schema']['fields'][0]['fields'][0])
# Value
self.assertEqual(table['schema']['fields'][0]['fields'][1]['description'], 'Nested Record in a map doc')
# Value record data
self.assertEqual(table['schema']['fields'][0]['fields'][1]['fields'][0]['description'], 'element 1 doc')
self.assertEqual(table['schema']['fields'][0]['fields'][1]['fields'][1]['description'], 'element 2 doc')

# Second map
self.assertEqual(table['schema']['fields'][1]['description'], 'doc for second map record')
# key
self.assertFalse('description' in table['schema']['fields'][1]['fields'][0])
# Value
self.assertFalse('description' in table['schema']['fields'][1]['fields'][1])

# Several top level Primitive and Enums
self.assertEqual(table['schema']['fields'][2]['description'], 'Track GID in hexadecimal string')
self.assertEqual(table['schema']['fields'][3]['description'], 'Track URI in base62 string')
self.assertEqual(table['schema']['fields'][4]['description'], 'enum documentation broz')

# Nested Record containing primitive
self.assertEqual(table['schema']['fields'][5]['description'], 'My Fake Record doc')
self.assertEqual(table['schema']['fields'][5]['fields'][0]['description'], 'Cool Name doc')

# Union with internal Record
self.assertEqual(table['schema']['fields'][6]['description'], 'metadoc')
self.assertEqual(table['schema']['fields'][6]['fields'][0]['description'], 'Sqoop import of track')
self.assertEqual(table['schema']['fields'][6]['fields'][0]['fields'][0]['description'], 'id description field')
self.assertEqual(table['schema']['fields'][6]['fields'][0]['fields'][1]['description'], 'name description field')

# Array of Primitive
self.assertEqual(table['schema']['fields'][7]['description'], 'array of children documentation')
self.assertEqual(table['schema']['fields'][7]['fields'][0]['description'], 'my specific child\'s doc')