Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
This repository was archived by the owner on Nov 22, 2017. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 17 additions & 29 deletions sheer/indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,9 @@
from collections import OrderedDict
import json


import copy
import glob
import importlib

from csv import DictReader

from elasticsearch import Elasticsearch
from elasticsearch.exceptions import TransportError

Expand Down Expand Up @@ -51,13 +47,13 @@ def __init__(self, name, **kwargs):
def documents(self):
return self.processor_module.documents(self.name, **self.kwargs)

def mapping(self, default_mapping):
def mapping(self):
if 'mappings' in self.kwargs:
return read_json_file(self.kwargs['mappings'])
if hasattr(self.processor_module, 'mappings'):
return self.processor_module.mappings(self.name, **self.kwargs)
else:
return copy.deepcopy(default_mapping)
return None


def index_document(es, index_name, processor, document):
Expand All @@ -71,9 +67,9 @@ def index_document(es, index_name, processor, document):
# exception will be raised.
try:
es.create(index=index_name,
doc_type=processor.name,
id=document['_id'],
body=document)
doc_type=processor.name,
id=document['_id'],
body=document)
except TransportError, e:
# Elasticsearch status code 409 is DocumentAlreadyExistsException
# Anything else and we want to bail here.
Expand All @@ -83,12 +79,12 @@ def index_document(es, index_name, processor, document):
# If the document couldn't be created because it already exists,
# update it instead.
es.update(index=index_name,
doc_type=processor.name,
id=document['_id'],
body={'doc': document})
doc_type=processor.name,
id=document['_id'],
body={'doc': document})


def index_processor(es, index_name, default_mapping, processor, reindex=False):
def index_processor(es, index_name, processor, reindex=False):
"""
Index all the documents provided by the given content processor for
the given index in the given Elasticsearch instance.
Expand All @@ -108,9 +104,13 @@ def index_processor(es, index_name, default_mapping, processor, reindex=False):
# Then create the mapping if it does not exist
if not mapping:
print "creating mapping for %s (%s)" % (processor.name, processor.processor_name)
es.indices.put_mapping(index=index_name,
doc_type=processor.name,
body={processor.name: processor.mapping(default_mapping)})
# Only manually create the mapping if one is specified.
# Otherwise, let Elasticsearch create a mapping
mapping_supplied = processor.mapping()
if mapping_supplied:
es.indices.put_mapping(index=index_name,
doc_type=processor.name,
body={processor.name: mapping_supplied})

try:
# Get the document iterator from the processor.
Expand Down Expand Up @@ -150,7 +150,6 @@ def index_location(args, config):
# need to talk to elasticsearch

settings_path = os.path.join(path, '_settings/settings.json')
default_mapping_path = os.path.join(path, '_defaults/mappings.json')
processors_path = os.path.join(path, '_settings/processors.json')

es = Elasticsearch(config["elasticsearch"])
Expand Down Expand Up @@ -194,22 +193,11 @@ def index_location(args, config):
processor="sheer.processors.filesystem")
processors.append(ContentProcessor(processor_name, **processor_args))

# Load default mapping (or not)
if os.path.exists(default_mapping_path):
try:
default_mapping = read_json_file(default_mapping_path)
except ValueError:
sys.exit("default mapping present, but is not valid JSON")

else:
default_mapping = {}

# If any specific content processors were selected, we run them. Otherwise
# we run all of them.
selected_processors = processors
if args.processors and len(args.processors) > 0:
selected_processors = [p for p in processors if p.name in args.processors]

for processor in selected_processors:
index_processor(es, index_name, default_mapping, processor, reindex=args.reindex)

index_processor(es, index_name, processor, reindex=args.reindex)
13 changes: 0 additions & 13 deletions sheer/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,6 @@ def test_indexing(self, mock_exists, mock_read_json_file,
index_location(test_args, self.config)

mock_es.indices.create.assert_called_with(index=self.config['index'])
mock_es.indices.put_mapping.assert_called_with(
index=self.config['index'],
doc_type='posts',
body={'posts': {}})
mock_es.create.assert_called_with(
index=self.config['index'],
doc_type='posts',
Expand Down Expand Up @@ -125,10 +121,6 @@ def test_reindexing(self, mock_exists, mock_read_json_file,

mock_es.indices.delete.assert_called_with(self.config['index'])
mock_es.indices.create.assert_called_with(index=self.config['index'])
mock_es.indices.put_mapping.assert_called_with(
index=self.config['index'],
doc_type='posts',
body={'posts': {}})
mock_es.create.assert_called_with(
index=self.config['index'],
doc_type='posts',
Expand Down Expand Up @@ -214,11 +206,6 @@ def test_partial_reindexing(self, mock_exists, mock_read_json_file,
mock_es.indices.delete_mapping.assert_called_with(
index=self.config['index'],
doc_type='posts')
mock_es.indices.put_mapping.assert_called_with(
index=self.config['index'],
doc_type='posts',
body={'posts': {}})

mock_es.create.assert_called_with(
index=self.config['index'],
doc_type='posts',
Expand Down