Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
This repository was archived by the owner on Nov 22, 2017. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 17 additions & 29 deletions sheer/indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,9 @@
from collections import OrderedDict
import json


import copy
import glob
import importlib

from csv import DictReader

from elasticsearch import Elasticsearch
from elasticsearch.exceptions import TransportError

Expand Down Expand Up @@ -51,13 +47,13 @@ def __init__(self, name, **kwargs):
def documents(self):
return self.processor_module.documents(self.name, **self.kwargs)

def mapping(self, default_mapping):
def mapping(self):
if 'mappings' in self.kwargs:
return read_json_file(self.kwargs['mappings'])
if hasattr(self.processor_module, 'mappings'):
return self.processor_module.mappings(self.name, **self.kwargs)
else:
return copy.deepcopy(default_mapping)
return None


def index_document(es, index_name, processor, document):
Expand All @@ -71,9 +67,9 @@ def index_document(es, index_name, processor, document):
# exception will be raised.
try:
es.create(index=index_name,
doc_type=processor.name,
id=document['_id'],
body=document)
doc_type=processor.name,
id=document['_id'],
body=document)
except TransportError, e:
# Elasticsearch status code 409 is DocumentAlreadyExistsException
# Anything else and we want to bail here.
Expand All @@ -83,12 +79,12 @@ def index_document(es, index_name, processor, document):
# If the document couldn't be created because it already exists,
# update it instead.
es.update(index=index_name,
doc_type=processor.name,
id=document['_id'],
body={'doc': document})
doc_type=processor.name,
id=document['_id'],
body={'doc': document})


def index_processor(es, index_name, default_mapping, processor, reindex=False):
def index_processor(es, index_name, processor, reindex=False):
"""
Index all the documents provided by the given content processor for
the given index in the given Elasticsearch instance.
Expand All @@ -108,9 +104,13 @@ def index_processor(es, index_name, default_mapping, processor, reindex=False):
# Then create the mapping if it does not exist
if not mapping:
print "creating mapping for %s (%s)" % (processor.name, processor.processor_name)
es.indices.put_mapping(index=index_name,
doc_type=processor.name,
body={processor.name: processor.mapping(default_mapping)})
# Only manually create the mapping if one is specified.
# Otherwise, let Elasticsearch create a mapping
mapping_supplied = processor.mapping()
if mapping_supplied:
es.indices.put_mapping(index=index_name,
doc_type=processor.name,
body={processor.name: mapping_supplied})

try:
# Get the document iterator from the processor.
Expand Down Expand Up @@ -150,7 +150,6 @@ def index_location(args, config):
# need to talk to elasticsearch

settings_path = os.path.join(path, '_settings/settings.json')
default_mapping_path = os.path.join(path, '_defaults/mappings.json')
processors_path = os.path.join(path, '_settings/processors.json')

es = Elasticsearch(config["elasticsearch"])
Expand Down Expand Up @@ -194,22 +193,11 @@ def index_location(args, config):
processor="sheer.processors.filesystem")
processors.append(ContentProcessor(processor_name, **processor_args))

# Load default mapping (or not)
if os.path.exists(default_mapping_path):
try:
default_mapping = read_json_file(default_mapping_path)
except ValueError:
sys.exit("default mapping present, but is not valid JSON")

else:
default_mapping = {}

# If any specific content processors were selected, we run them. Otherwise
# we run all of them.
selected_processors = processors
if args.processors and len(args.processors) > 0:
selected_processors = [p for p in processors if p.name in args.processors]

for processor in selected_processors:
index_processor(es, index_name, default_mapping, processor, reindex=args.reindex)

index_processor(es, index_name, processor, reindex=args.reindex)
13 changes: 0 additions & 13 deletions sheer/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,6 @@ def test_indexing(self, mock_exists, mock_read_json_file,
index_location(test_args, self.config)

mock_es.indices.create.assert_called_with(index=self.config['index'])
mock_es.indices.put_mapping.assert_called_with(
index=self.config['index'],
doc_type='posts',
body={'posts': {}})
mock_es.create.assert_called_with(
index=self.config['index'],
doc_type='posts',
Expand Down Expand Up @@ -125,10 +121,6 @@ def test_reindexing(self, mock_exists, mock_read_json_file,

mock_es.indices.delete.assert_called_with(self.config['index'])
mock_es.indices.create.assert_called_with(index=self.config['index'])
mock_es.indices.put_mapping.assert_called_with(
index=self.config['index'],
doc_type='posts',
body={'posts': {}})
mock_es.create.assert_called_with(
index=self.config['index'],
doc_type='posts',
Expand Down Expand Up @@ -214,11 +206,6 @@ def test_partial_reindexing(self, mock_exists, mock_read_json_file,
mock_es.indices.delete_mapping.assert_called_with(
index=self.config['index'],
doc_type='posts')
mock_es.indices.put_mapping.assert_called_with(
index=self.config['index'],
doc_type='posts',
body={'posts': {}})

mock_es.create.assert_called_with(
index=self.config['index'],
doc_type='posts',
Expand Down