Skip to content

Commit 1b1543f

Browse files
authored
Merge pull request #3874 from alephdata/release/4.0.0
4.0.0 into develop
2 parents 5b60b4a + d452633 commit 1b1543f

File tree

114 files changed

+2009
-781
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

114 files changed

+2009
-781
lines changed

.bumpversion.cfg

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[bumpversion]
2-
current_version = 3.17.0
2+
current_version = 4.0.0
33
tag_name = {new_version}
44
commit = True
55
tag = True

.github/workflows/build.yml

+2
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ jobs:
3434

3535
- name: Install development dependencies
3636
run: make dev
37+
env:
38+
PIP_BREAK_SYSTEM_PACKAGES: 1
3739

3840
- name: Check code formatting
3941
run: make format-check

.github/workflows/pr.yml

+2
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ jobs:
1515

1616
- name: Install development dependencies
1717
run: make dev
18+
env:
19+
PIP_BREAK_SYSTEM_PACKAGES: 1
1820

1921
- name: Check code formatting
2022
run: make format-check

Dockerfile

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
FROM ubuntu:20.04
22
ENV DEBIAN_FRONTEND noninteractive
33

4-
# build-essential
4+
# build-essential
55
RUN apt-get -qq -y update \
66
&& apt-get -qq --no-install-recommends -y install locales \
77
ca-certificates postgresql-client libpq-dev curl jq \
88
python3-pip python3-icu python3-psycopg2 \
9-
python3-lxml python3-crypto \
9+
python3-lxml python3-crypto git \
1010
&& apt-get -qq -y autoremove \
1111
&& apt-get clean \
1212
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \

Makefile

+2-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@ all: build upgrade web
99

1010
services:
1111
$(COMPOSE) up -d --remove-orphans \
12-
postgres elasticsearch ingest-file
12+
postgres elasticsearch ingest-file \
13+
redis rabbitmq
1314

1415
shell: services
1516
$(APPDOCKER) /bin/bash

aleph.env.tmpl

+3
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,9 @@ ALEPH_OAUTH_SECRET=
9393

9494
# Queue mechanism
9595
# REDIS_URL=redis://redis:6379/0
96+
# RABBITMQ_URL=rabbitmq
97+
# RABBITMQ_USERNAME=guest
98+
# RABBITMQ_PASSWORD=guest
9699

97100
# Content options
98101
ALEPH_OCR_DEFAULTS=eng

aleph/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
logging.getLogger("pdfminer").setLevel(logging.WARNING)
2424
logging.getLogger("httpstream").setLevel(logging.WARNING)
2525
logging.getLogger("factory").setLevel(logging.WARNING)
26+
logging.getLogger("pika").setLevel(logging.WARNING)
2627

2728
# Log all SQL statements:
2829
# logging.getLogger('sqlalchemy.engine').setLevel(log_level)

aleph/authz.py

+1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ class Authz(object):
2626
def __init__(self, role_id, roles, is_admin=False, token_id=None, expire=None):
2727
self.id = role_id
2828
self.logged_in = role_id is not None
29+
self.auth_method = None
2930
self.roles = set(roles)
3031
self.is_admin = is_admin
3132
self.token_id = token_id

aleph/core.py

+8-5
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
from urllib.parse import urlencode, urljoin
33

44
from banal import clean_dict
5-
from elasticsearch import Elasticsearch, TransportError
65
from flask import Flask, request
76
from flask import url_for as flask_url_for
87
from flask_babel import Babel
@@ -11,13 +10,16 @@
1110
from flask_migrate import Migrate
1211
from flask_sqlalchemy import SQLAlchemy
1312
from flask_talisman import Talisman
13+
from elasticsearch import Elasticsearch, TransportError
14+
1415
from followthemoney import set_model_locale
15-
from servicelayer import settings as sls
16-
from servicelayer.archive import init_archive
1716
from servicelayer.cache import get_redis
17+
from servicelayer.archive import init_archive
1818
from servicelayer.extensions import get_extensions
19-
from servicelayer.logs import LOG_FORMAT_JSON, configure_logging
20-
from servicelayer.util import backoff, service_retries
19+
from servicelayer.util import service_retries, backoff
20+
from servicelayer.logs import configure_logging, LOG_FORMAT_JSON
21+
from servicelayer.taskqueue import get_rabbitmq_channel
22+
from servicelayer import settings as sls
2123
from werkzeug.local import LocalProxy
2224
from werkzeug.middleware.profiler import ProfilerMiddleware
2325

@@ -195,6 +197,7 @@ def get_cache():
195197
kv = LocalProxy(get_redis)
196198
cache = LocalProxy(get_cache)
197199
archive = LocalProxy(get_archive)
200+
rabbitmq_channel = LocalProxy(get_rabbitmq_channel)
198201

199202

200203
def url_for(*a, **kw):

aleph/index/entities.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -162,12 +162,15 @@ def index_entity(entity, sync=False):
162162

163163
def index_proxy(collection, proxy, sync=False):
164164
delete_entity(proxy.id, exclude=proxy.schema, sync=False)
165-
return index_bulk(collection, [proxy], sync=sync)
165+
return index_bulk([proxy], collection=collection, sync=sync)
166166

167167

168-
def index_bulk(collection, entities, sync=False):
168+
def index_bulk(entities, collection=None, sync=False):
169169
"""Index a set of entities."""
170-
entities = (format_proxy(p, collection) for p in entities)
170+
if collection:
171+
entities = (format_proxy(p, collection) for p in entities)
172+
else:
173+
entities = (format_proxy(*p) for p in entities)
171174
entities = (e for e in entities if e is not None)
172175
bulk_actions(entities, sync=sync)
173176

aleph/index/util.py

+39-20
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import logging
1+
import structlog
22
from pprint import pprint # noqa
33
from banal import ensure_list, is_mapping
44
from elasticsearch import TransportError
@@ -9,7 +9,7 @@
99
from aleph.core import es
1010
from aleph.settings import SETTINGS
1111

12-
log = logging.getLogger(__name__)
12+
log = structlog.get_logger(__name__)
1313

1414
BULK_PAGE = 500
1515
# cf. https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-from-size.html # noqa
@@ -56,6 +56,10 @@
5656
}
5757

5858

59+
class AlephOperationalException(Exception):
60+
pass
61+
62+
5963
def get_shard_weight(schema):
6064
if SETTINGS.TESTING:
6165
return 1
@@ -180,7 +184,7 @@ def query_delete(index, query, sync=False, **kwargs):
180184
request_timeout=MAX_REQUEST_TIMEOUT,
181185
timeout=MAX_TIMEOUT,
182186
scroll_size=SETTINGS.INDEX_DELETE_BY_QUERY_BATCHSIZE,
183-
**kwargs
187+
**kwargs,
184188
)
185189
return
186190
except TransportError as exc:
@@ -237,8 +241,7 @@ def _check_response(index, res):
237241
if res.get("status", 0) > 399 and not res.get("acknowledged"):
238242
error = res.get("error", {}).get("reason")
239243
log.error("Index [%s] error: %s", index, error)
240-
return False
241-
return True
244+
raise AlephOperationalException(f"Index {index} error: {error}")
242245

243246

244247
def rewrite_mapping_safe(pending, existing):
@@ -286,26 +289,42 @@ def configure_index(index, mapping, settings):
286289
"timeout": MAX_TIMEOUT,
287290
"master_timeout": MAX_TIMEOUT,
288291
}
289-
config = es.indices.get(index=index).get(index, {})
292+
res = es.indices.get(index=index)
293+
294+
if len(res) != 1:
295+
# This case should never occur.
296+
log.error("ES response", res=res)
297+
raise AlephOperationalException("ES response is empty or ambiguous.")
298+
299+
# The ES response is an object with items for every requested index. As we only request
300+
# a single index, we extract the first and only item from the response. We cannot simply
301+
# extract the response data using the index name as the name we use to request the index
302+
# may be an alias whereas the response will always contain the actual un-aliased name.
303+
config = list(res.values())[0]
304+
290305
settings.get("index").pop("number_of_shards")
306+
log.info(
307+
f"[{index}] Current settings.", index=index, settings=config.get("settings")
308+
)
291309
if check_settings_changed(settings, config.get("settings")):
292-
res = es.indices.close(ignore_unavailable=True, **options)
293-
res = es.indices.put_settings(body=settings, **options)
294-
if not _check_response(index, res):
295-
return False
310+
log.info(f"[{index}] Updated settings.", index=index, settings=settings)
311+
_check_response(index, es.indices.close(ignore_unavailable=True, **options))
312+
_check_response(index, es.indices.put_settings(body=settings, **options))
313+
else:
314+
log.info(f"[{index}] No changes detected in settings.", index=index)
315+
log.info(
316+
f"[{index}] Current mappings.", index=index, mappings=config.get("mappings")
317+
)
318+
log.info(f"[{index}] New mappings.", index=index, mappings=mapping)
296319
mapping = rewrite_mapping_safe(mapping, config.get("mappings"))
297-
res = es.indices.put_mapping(body=mapping, ignore=[400], **options)
298-
if not _check_response(index, res):
299-
return False
300-
res = es.indices.open(**options)
301-
return True
320+
_check_response(
321+
index, es.indices.put_mapping(body=mapping, ignore=[400], **options)
322+
)
323+
_check_response(index, es.indices.open(**options))
302324
else:
303-
log.info("Creating index: %s...", index)
325+
log.info(f"Creating index: {index}...", index=index)
304326
body = {"settings": settings, "mappings": mapping}
305-
res = es.indices.create(index, body=body)
306-
if not _check_response(index, res):
307-
return False
308-
return True
327+
_check_response(index, es.indices.create(index, body=body))
309328

310329

311330
def index_settings(shards=5, replicas=SETTINGS.INDEX_REPLICAS):

aleph/logic/collections.py

+26-3
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import logging
22
from datetime import datetime
33
from collections import defaultdict
4-
from servicelayer.jobs import Job
54

65
from aleph.core import db, cache
76
from aleph.authz import Authz
@@ -14,6 +13,7 @@
1413
from aleph.logic.notifications import publish, flush_notifications
1514
from aleph.logic.documents import ingest_flush, MODEL_ORIGIN
1615
from aleph.logic.aggregator import get_aggregator
16+
from aleph.util import random_id
1717

1818
log = logging.getLogger(__name__)
1919

@@ -127,14 +127,37 @@ def _generate():
127127
yield proxy
128128
log.debug("[%s] Indexed %s entities", collection, idx)
129129

130-
entities_index.index_bulk(collection, _generate(), sync=sync)
130+
entities_index.index_bulk(_generate(), collection=collection, sync=sync)
131+
132+
133+
def index_aggregator_bulk(entity_ids, skip_errors=False, sync=False):
134+
def _generate():
135+
idx = 0
136+
for collection_id in entity_ids.keys():
137+
collection = Collection.by_id(collection_id)
138+
if collection:
139+
aggregator = get_aggregator(collection)
140+
entities = aggregator.iterate(
141+
entity_id=entity_ids[collection_id], skip_errors=skip_errors
142+
)
143+
for idx, proxy in enumerate(entities, 1):
144+
if idx > 0 and idx % 1000 == 0:
145+
log.debug("[%s] Index: %s...", collection_id, idx)
146+
yield (proxy, collection)
147+
log.debug("[%s] Indexed %s entities", collection_id, idx)
148+
else:
149+
log.debug(
150+
f"Bulk indexing for inexistent collection id: {collection_id}"
151+
)
152+
153+
entities_index.index_bulk(_generate(), sync=sync)
131154

132155

133156
def reingest_collection(
134157
collection, job_id=None, index=False, flush=True, include_ingest=False
135158
):
136159
"""Trigger a re-ingest for all documents in the collection."""
137-
job_id = job_id or Job.random_id()
160+
job_id = job_id or random_id()
138161
if flush:
139162
ingest_flush(collection, include_ingest=include_ingest)
140163
for document in Document.by_collection(collection.id):

aleph/logic/documents.py

+4-5
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,12 @@
11
import os
22
import logging
3-
from servicelayer.jobs import Job
43

54
from aleph.core import db, archive
65
from aleph.settings import SETTINGS
76
from aleph.model import Document
87
from aleph.queues import ingest_entity
9-
from aleph.queues import OP_INGEST
108
from aleph.logic.aggregator import get_aggregator, MODEL_ORIGIN
9+
from aleph.util import random_id
1110

1211
log = logging.getLogger(__name__)
1312

@@ -17,7 +16,7 @@ def ingest_flush(collection, entity_id=None, include_ingest=False):
1716
aggregator = get_aggregator(collection)
1817
aggregator.delete(entity_id=entity_id, origin=MODEL_ORIGIN)
1918
if include_ingest:
20-
aggregator.delete(entity_id=entity_id, origin=OP_INGEST)
19+
aggregator.delete(entity_id=entity_id, origin=SETTINGS.STAGE_INGEST)
2120
for stage in SETTINGS.INGEST_PIPELINE:
2221
aggregator.delete(entity_id=entity_id, origin=stage)
2322

@@ -37,7 +36,7 @@ def crawl_directory(collection, path, parent=None, job_id=None):
3736
# to be consistent with the behaviour of alephclient
3837
if path.is_dir() and job_id is None:
3938
document = None
40-
job_id = Job.random_id()
39+
job_id = random_id()
4140
else:
4241
meta = {"file_name": path.name}
4342
document = Document.save(
@@ -48,7 +47,7 @@ def crawl_directory(collection, path, parent=None, job_id=None):
4847
meta=meta,
4948
)
5049
db.session.commit()
51-
job_id = job_id or Job.random_id()
50+
job_id = job_id or random_id()
5251
proxy = document.to_proxy()
5352
ingest_flush(collection, entity_id=proxy.id)
5453
ingest_entity(collection, proxy, job_id=job_id)

aleph/logic/entities.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import logging
2+
from aleph.settings import SETTINGS
23
from banal import ensure_dict
34
from pprint import pformat # noqa
45
from flask_babel import gettext
@@ -10,7 +11,6 @@
1011
from aleph.model import Entity, Document, EntitySetItem, Mapping, Bookmark
1112
from aleph.index import entities as index
1213
from aleph.queues import pipeline_entity, queue_task
13-
from aleph.queues import OP_UPDATE_ENTITY, OP_PRUNE_ENTITY
1414
from aleph.logic.notifications import flush_notifications
1515
from aleph.logic.collections import refresh_collection
1616
from aleph.logic.collections import MODEL_ORIGIN
@@ -47,7 +47,9 @@ def upsert_entity(data, collection, authz=None, sync=False, sign=False, job_id=N
4747

4848
index.index_proxy(collection, proxy, sync=sync)
4949
refresh_entity(collection, proxy.id)
50-
queue_task(collection, OP_UPDATE_ENTITY, job_id=job_id, entity_id=proxy.id)
50+
queue_task(
51+
collection, SETTINGS.STAGE_UPDATE_ENTITY, job_id=job_id, entity_id=proxy.id
52+
)
5153
return entity.id
5254

5355

@@ -142,7 +144,9 @@ def delete_entity(collection, entity, sync=False, job_id=None):
142144
entity_id = collection.ns.sign(entity.get("id"))
143145
index.delete_entity(entity_id, sync=sync)
144146
refresh_entity(collection, entity_id)
145-
queue_task(collection, OP_PRUNE_ENTITY, job_id=job_id, entity_id=entity_id)
147+
queue_task(
148+
collection, SETTINGS.STAGE_PRUNE_ENTITY, job_id=job_id, entity_id=entity_id
149+
)
146150

147151

148152
def prune_entity(collection, entity_id=None, job_id=None):

aleph/logic/notifications.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424

2525
def channel_tag(obj, clazz=None):
2626
clazz = clazz or type(obj)
27-
if clazz == str:
27+
if clazz is str:
2828
return obj
2929

3030
obj = get_entity_id(obj)

0 commit comments

Comments
 (0)