Skip to content

Commit

Permalink
Support custom analysers, add es.create_models
Browse files Browse the repository at this point in the history
To support custom analysers, it seems best to abandon model.create_all
in favour of an index-wide method (es.create_models) that sets all
mappings (and custom analysis settings) at once.

Code is largely transplanted from hypothesis/h#1825

Details:
Custom analysers (and filters and tokenisers) are shared in Elasticsearch
among all document types (models). To update a model's mappings one needs
to make sure the custom analysers are defined first, but updating those
just for one model is inelegant: the index would have to be closed for
each update, and one cannot check for duplicate definitions of analysers,
as it is not visible which model defined a particular analyser. Treating
index-wide settings as per-model settings creates leaky abstractions.
  • Loading branch information
Treora committed Dec 28, 2014
1 parent 2d97f29 commit 229d841
Show file tree
Hide file tree
Showing 4 changed files with 93 additions and 24 deletions.
2 changes: 2 additions & 0 deletions annotator/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,13 @@
}
}

ANALYSIS = {}

class Annotation(es.Model):

__type__ = TYPE
__mapping__ = MAPPING
__analysis__ = ANALYSIS

def save(self, *args, **kwargs):
_add_default_permissions(self)
Expand Down
109 changes: 89 additions & 20 deletions annotator/elasticsearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,63 @@ def conn(self):
self._connection = self._connect()
return self._connection

def create_models(self, models):
    """Create the index for the given models, or update it if it exists.

    The mappings and custom analysis settings of all ``models`` are
    compiled and sent with a single index-creation request. If that
    request is refused because the index already exists, the analysis
    settings and the mappings of the existing index are updated instead.

    Arguments:
    models -- iterable of model classes providing get_mapping() and
              get_analysis()

    Raises:
    elasticsearch.exceptions.ConnectionError -- Elasticsearch cannot be
        reached.
    elasticsearch.exceptions.RequestError -- index creation was refused
        for a reason other than the index already existing.
    RuntimeError -- two models define the same analysis item (raised by
        _compile_analysis).
    """
    mappings = _compile_mappings(models)
    analysis = _compile_analysis(models)

    # If it does not yet exist, simply create the index
    try:
        response = self.conn.indices.create(self.index, ignore=400, body={
            'mappings': mappings,
            'settings': {'analysis': analysis},
        })
    except elasticsearch.exceptions.ConnectionError as e:
        msg = ('Can not access ElasticSearch at {0}! '
               'Check to ensure it is running.').format(self.host)
        raise elasticsearch.exceptions.ConnectionError('N/A', msg, e)

    # No error in the response: the index was freshly created with the
    # right mappings and settings, so there is nothing left to update.
    if 'error' not in response:
        return

    # Bad request (400) is ignored above, to prevent warnings in the log
    # when the index already exists, but the failure could be for other
    # reasons. If so, raise the error here.
    if 'IndexAlreadyExists' not in response['error']:
        raise elasticsearch.exceptions.RequestError(400, response['error'])

    # Update the settings and mappings of the existing index. Analysis
    # goes first because mappings may refer to the custom analysers.
    self._update_analysis(analysis)
    self._update_mappings(mappings)

def _update_analysis(self, analysis):
    """Update the custom analysis settings (analyzers, filters, ...).

    The current settings of the index are fetched first; nothing is sent
    when they already contain everything in ``analysis``. Otherwise the
    index is closed, the settings are put, and the index is reopened
    (also when putting the settings fails).

    Arguments:
    analysis -- dict of analysis sections, as built by _compile_analysis
    """
    settings = self.conn.indices.get_settings(index=self.index)
    # An index that has no custom analysis defined yet may lack the
    # 'analysis' key altogether; treat that as "nothing defined".
    index_settings = settings[self.index]['settings']['index']
    existing = index_settings.get('analysis', {})

    # Only bother if new settings would differ from existing settings
    if _analysis_up_to_date(existing, analysis):
        return

    try:
        self.conn.indices.close(index=self.index)
        self.conn.indices.put_settings(index=self.index,
                                       body={'analysis': analysis})
    finally:
        # Always reopen, so a failed update does not leave the index closed
        self.conn.indices.open(index=self.index)

def _update_mappings(self, mappings):
    """Put the mapping of every document type into the index.

    Arguments:
    mappings -- dict that maps a document type name to its mapping body

    Warning: can explode because of a MergeMappingError when the new
    mappings are incompatible with the existing ones.
    """
    for doc_type in mappings:
        # One request per document type
        self.conn.indices.put_mapping(index=self.index,
                                      doc_type=doc_type,
                                      body=mappings[doc_type])

def _analysis_up_to_date(existing, analysis):
    """Tell whether existing settings are up to date.

    Arguments:
    existing -- analysis settings currently in the index
                ({section: {name: definition}})
    analysis -- analysis settings that should be present (same shape)

    Returns True when merging ``analysis`` into ``existing`` would not
    change anything. Neither argument is modified.
    """
    # Copy one level deeper than dict.copy() does: the merge below updates
    # the per-section dicts, and those must not be shared with `existing`.
    # Otherwise the comparison is always true and the caller's data is
    # silently modified.
    merged = dict((section, dict(items))
                  for section, items in existing.items())
    for section, items in analysis.items():
        merged.setdefault(section, {}).update(items)
    return merged == existing


class _Model(dict):
"""Base class that represents a document type in an ElasticSearch index.
Expand All @@ -74,7 +131,7 @@ class _Model(dict):
__type__ -- The name of the document type
__mapping__ -- A mapping of the document's fields
Mapping: Calling create_all() will create the mapping in the index.
Mapping:
One field, 'id', is treated specially. Its value will not be stored,
but be used as the _id identifier of the document in Elasticsearch. If
an item is indexed without providing an id, the _id is automatically
Expand All @@ -87,25 +144,6 @@ class _Model(dict):
with 'analyzer':'standard'.
"""

@classmethod
def create_all(cls):
    """Create the index (if needed) and put this model's mapping in it.

    A RequestError whose message says the index name is already taken
    (as an index or as an alias) is logged as a warning and otherwise
    ignored; any other RequestError is logged and re-raised.
    """
    log.info("Creating index '%s'." % cls.es.index)
    es_conn = cls.es.conn
    # Error prefixes that merely tell us the index is already there
    # (either as index or as an alias).
    already_there = ('IndexAlreadyExistsException',
                     'InvalidIndexNameException')
    try:
        es_conn.indices.create(cls.es.index)
    except elasticsearch.exceptions.RequestError as e:
        if e.error.startswith(already_there):
            log.warn("Index creation failed as index appears to already exist.")
        else:
            log.fatal("Failed to create an Elasticsearch index")
            raise
    doc_mapping = cls.get_mapping()
    es_conn.indices.put_mapping(index=cls.es.index,
                                doc_type=cls.__type__,
                                body=doc_mapping)

@classmethod
def get_mapping(cls):
return {
Expand All @@ -121,6 +159,10 @@ def get_mapping(cls):
}
}

@classmethod
def get_analysis(cls):
    """Return the model's __analysis__ settings, or {} if it has none."""
    try:
        return cls.__analysis__
    except AttributeError:
        return {}

@classmethod
def drop_all(cls):
if cls.es.conn.indices.exists(cls.es.index):
Expand Down Expand Up @@ -215,6 +257,33 @@ def make_model(es):
return type('Model', (_Model,), {'es': es})


def _compile_mappings(models):
    """Gather the mappings of all models into a single dict.

    Each model's get_mapping() result is merged in turn; when two models
    return the same key, the later model's entry replaces the earlier one.
    """
    combined = {}
    for model in models:
        for doc_type, body in model.get_mapping().items():
            combined[doc_type] = body
    return combined


def _compile_analysis(models):
    """Combine the custom analysis settings of all models.

    The result maps each analysis section to the items that the models
    define in it. A RuntimeError is raised when an item name is defined
    more than once within the same section.
    """
    merged = {}
    for model in models:
        for section, items in model.get_analysis().items():
            section_items = merged.setdefault(section, {})
            # Names this model defines that were already collected
            clashes = [name for name in items if name in section_items]
            if clashes:
                raise RuntimeError(
                    "Duplicate definition of 'index.analysis.{}.{}'."
                    .format(section, clashes[0]))
            section_items.update(items)
    return merged


def _csv_split(s, delimiter=','):
    """Split the string s into fields, following CSV quoting rules."""
    rows = list(csv.reader([s], delimiter=delimiter))
    return rows[0]


def _build_query(query, offset, limit):
# Create a match query for each keyword
match_clauses = [{'match': {k: v}} for k, v in iteritems(query)]
Expand Down
3 changes: 1 addition & 2 deletions run.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,7 @@ def main():
es.authorization_enabled = app.config['AUTHZ_ON']

try:
annotation.Annotation.create_all()
document.Document.create_all()
es.create_models([annotation.Annotation, document.Document])
except elasticsearch.exceptions.RequestError as e:
if e.error.startswith('MergeMappingException'):
date = time.strftime('%Y-%m-%d')
Expand Down
3 changes: 1 addition & 2 deletions tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,7 @@ def setup_class(cls):
document.Document.drop_all()

def setup(self):
annotation.Annotation.create_all()
document.Document.create_all()
es.create_models([annotation.Annotation, document.Document])
es.conn.cluster.health(wait_for_status='yellow')
self.cli = self.app.test_client()

Expand Down

0 comments on commit 229d841

Please sign in to comment.