Skip to content

Commit 5f3884d

Browse files
authored
chore: multi confluence support (#6)
* feat: Add CommaSeparatedStrList validation tests and update project versions
* feat: Enhance ConfluenceParameters model with additional confluence_kwargs field and clean up API generation script
* feat: Add asynchronous document enhancement method and adjust minimum page content length
* feat: Update ConfluenceParameters model and enhance validation methods for comma separated lists
* feat: Update ConfluenceParameters model to use model_dump_json for JSON representation
1 parent a55f762 commit 5f3884d

25 files changed

+700
-148
lines changed

admin-api-lib/pyproject.toml

+21-12
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,14 @@
1+
[build-system]
2+
requires = ["poetry-core"]
3+
build-backend = "poetry.core.masonry.api"
4+
5+
[tool.poetry]
6+
name = "admin-api-lib"
7+
version = "1.0.1"
8+
description = "The admin backend is responsible for the document management. This includes deletion, upload and returning the source document."
9+
authors = ["STACKIT Data and AI Consulting <[email protected]>"]
10+
packages = [{ include = "admin_api_lib", from = "src" }]
11+
112
[tool.flake8]
213
exclude= [".eggs", "./rag-core-library/*", "./src/admin_api_lib/models/*", "./src/admin_api_lib/rag_backend_client/*", "./src/admin_api_lib/extractor_api_client/*", ".git", ".hg", ".mypy_cache", ".tox", ".venv", ".devcontainer", "venv", "_build", "buck-out", "build", "dist", "**/__init__.py"]
314
statistics = true
@@ -18,7 +29,10 @@ per-file-ignores = """
1829
./src/admin_api_lib/impl/admin_api.py: B008,
1930
./src/admin_api_lib/dependency_container.py: CCE002,CCE001,
2031
./src/admin_api_lib/apis/admin_api_base.py: WOT001,
21-
./tests/*: S101,
32+
./tests/*: S101,S106,D100,D103,PT011
33+
./src/admin_api_lib/impl/settings/confluence_settings.py: C901,N805,
34+
./src/admin_api_lib/impl/utils/comma_separated_bool_list.py: R505,
35+
./src/admin_api_lib/impl/utils/comma_separated_str_list.py: R505,
2236
"""
2337

2438
[tool.black]
@@ -51,12 +65,6 @@ known_local_folder = ["admin_api_lib", "rag_core_lib"]
5165
[tool.pylint]
5266
max-line-length = 120
5367

54-
[tool.poetry]
55-
name = "admin_api_lib"
56-
version = "0.0.1"
57-
description = "The admin backend is responsible for the document management. This includes deletion, upload and returning the source document."
58-
authors = ["STACKIT Data and AI Consulting <[email protected]>"]
59-
6068
[tool.poetry.group.dev.dependencies]
6169
debugpy = "^1.8.1"
6270
pytest = "^8.2.1"
@@ -85,11 +93,6 @@ black = "^23.9.1"
8593
# flake8-logging-format = "^2024.24.12"
8694
# flake8-docstrings = "^1.7.0"
8795

88-
89-
[build-system]
90-
requires = ["poetry-core"]
91-
build-backend = "poetry.core.masonry.api"
92-
9396
[tool.poetry.dependencies]
9497
rag-core-lib = {path = "../rag-core-lib"}
9598
python = "^3.11"
@@ -103,3 +106,9 @@ tqdm = "^4.66.4"
103106
langfuse = "^2.39.1"
104107
redis = "^5.0.8"
105108
pyyaml = "^6.0.2"
109+
110+
[tool.pytest.ini_options]
111+
log_cli = 1
112+
log_cli_level = "DEBUG"
113+
pythonpath = "src"
114+
testpaths = "src/tests"

admin-api-lib/src/admin_api_lib/dependency_container.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ class DependencyContainer(DeclarativeContainer):
148148
)
149149

150150
summary_enhancer = List(
151-
Singleton(PageSummaryEnhancer, summarizer),
151+
Singleton(PageSummaryEnhancer, summarizer, chunker_settings),
152152
)
153153
untraced_information_enhancer = Singleton(
154154
GeneralEnhancer,

admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/confluence_parameters.py

+17
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@
2020
from typing import Any, ClassVar, Dict, List, Optional, Set
2121

2222
from pydantic import BaseModel, ConfigDict, Field, StrictBool, StrictStr
23+
from typing import Any, ClassVar, Dict, List, Optional
24+
from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair
25+
from typing import Optional, Set
2326
from typing_extensions import Self
2427

2528

@@ -43,6 +46,9 @@ class ConfluenceParameters(BaseModel):
4346
document_name: StrictStr = Field(
4447
description="The name that will be used to store the confluence db in the key value db and the vectordatabase (metadata.document)."
4548
)
49+
confluence_kwargs: Optional[List[KeyValuePair]] = Field(
50+
default=None, description="Additional kwargs like verify_ssl"
51+
)
4652
__properties: ClassVar[List[str]] = [
4753
"url",
4854
"token",
@@ -51,6 +57,7 @@ class ConfluenceParameters(BaseModel):
5157
"keep_markdown_format",
5258
"keep_newlines",
5359
"document_name",
60+
"confluence_kwargs",
5461
]
5562

5663
model_config = ConfigDict(
@@ -89,6 +96,13 @@ def to_dict(self) -> Dict[str, Any]:
8996
exclude=excluded_fields,
9097
exclude_none=True,
9198
)
99+
# override the default output from pydantic by calling `to_dict()` of each item in confluence_kwargs (list)
100+
_items = []
101+
if self.confluence_kwargs:
102+
for _item_confluence_kwargs in self.confluence_kwargs:
103+
if _item_confluence_kwargs:
104+
_items.append(_item_confluence_kwargs.to_dict())
105+
_dict["confluence_kwargs"] = _items
92106
return _dict
93107

94108
@classmethod
@@ -113,6 +127,9 @@ def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]:
113127
else True,
114128
"keep_newlines": obj.get("keep_newlines") if obj.get("keep_newlines") is not None else True,
115129
"document_name": obj.get("document_name"),
130+
"confluence_kwargs": [KeyValuePair.from_dict(_item) for _item in obj["confluence_kwargs"]]
131+
if obj.get("confluence_kwargs") is not None
132+
else None,
116133
}
117134
)
118135
return _obj

admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_confluence_loader.py

+63-31
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@
33
import logging
44
from asyncio import run
55
from threading import Thread
6+
import threading
67

78
from fastapi import HTTPException, status
9+
from langchain_core.documents import Document
810

911
from admin_api_lib.api_endpoints.confluence_loader import ConfluenceLoader
1012
from admin_api_lib.api_endpoints.document_deleter import DocumentDeleter
@@ -81,7 +83,6 @@ def __init__(
8183
self._extractor_api = extractor_api
8284
self._rag_api = rag_api
8385
self._settings = settings
84-
self._sanitize_document_name()
8586
self._key_value_store = key_value_store
8687
self._information_mapper = information_mapper
8788
self._information_enhancer = information_enhancer
@@ -100,10 +101,16 @@ async def aload_from_confluence(self) -> None:
100101
HTTPException
101102
If the Confluence loader is not configured or if a load is already in progress.
102103
"""
103-
if not (self._settings.url.strip() and self._settings.space_key.strip() and self._settings.token.strip()):
104-
raise HTTPException(
105-
status.HTTP_501_NOT_IMPLEMENTED, "The confluence loader is not configured! Required fields are missing."
106-
)
104+
for index in range(len(self._settings.url)):
105+
if not (
106+
self._settings.url[index].strip()
107+
and self._settings.space_key[index].strip()
108+
and self._settings.token[index].strip()
109+
):
110+
raise HTTPException(
111+
status.HTTP_501_NOT_IMPLEMENTED,
112+
"The confluence loader is not configured! Required fields are missing.",
113+
)
107114

108115
if self._background_thread is not None and self._background_thread.is_alive():
109116
raise HTTPException(
@@ -113,51 +120,76 @@ async def aload_from_confluence(self) -> None:
113120
self._background_thread.start()
114121

115122
async def _aload_from_confluence(self) -> None:
116-
params = self._settings_mapper.map_settings_to_params(self._settings)
123+
async def process_confluence(index):
124+
logger.info("Loading from Confluence %s", self._settings.url[index])
125+
self._sanitize_document_name(index=index)
126+
127+
params = self._settings_mapper.map_settings_to_params(self._settings, index)
128+
try:
129+
self._key_value_store.upsert(self._settings.document_name[index], Status.PROCESSING)
130+
information_pieces = self._extractor_api.extract_from_confluence_post(params)
131+
documents = [
132+
self._information_mapper.extractor_information_piece2document(x) for x in information_pieces
133+
]
134+
documents = await self._aenhance_langchain_documents(documents)
135+
chunked_documents = self._chunker.chunk(documents)
136+
rag_information_pieces = [
137+
self._information_mapper.document2rag_information_piece(doc) for doc in chunked_documents
138+
]
139+
except Exception as e:
140+
self._key_value_store.upsert(self._settings.document_name[index], Status.ERROR)
141+
142+
logger.error("Error while loading from Confluence: %s", str(e))
143+
raise HTTPException(
144+
status.HTTP_500_INTERNAL_SERVER_ERROR, f"Error loading from Confluence: {str(e)}"
145+
) from e
146+
147+
await self._delete_previous_information_pieces(index=index)
148+
self._key_value_store.upsert(self._settings.document_name[index], Status.UPLOADING)
149+
self._upload_information_pieces(rag_information_pieces, index=index)
150+
151+
threads = []
152+
for idx in range(len(self._settings.url)):
153+
t = threading.Thread(target=lambda idx=idx: run(process_confluence(idx)))
154+
threads.append(t)
155+
t.start()
156+
for t in threads:
157+
t.join()
158+
159+
async def _aenhance_langchain_documents(self, documents: list[Document]):
117160
try:
118-
self._key_value_store.upsert(self._settings.document_name, Status.PROCESSING)
119-
information_pieces = self._extractor_api.extract_from_confluence_post(params)
120-
documents = [self._information_mapper.extractor_information_piece2document(x) for x in information_pieces]
121-
chunked_documents = self._chunker.chunk(documents)
122-
rag_information_pieces = [
123-
self._information_mapper.document2rag_information_piece(doc) for doc in chunked_documents
124-
]
161+
return await self._information_enhancer.ainvoke(documents)
125162
except Exception as e:
126-
self._key_value_store.upsert(self._settings.document_name, Status.ERROR)
127-
logger.error("Error while loading from Confluence: %s", str(e))
128-
raise HTTPException(
129-
status.HTTP_500_INTERNAL_SERVER_ERROR, f"Error loading from Confluence: {str(e)}"
130-
) from e
131-
132-
await self._delete_previous_information_pieces()
133-
self._key_value_store.upsert(self._settings.document_name, Status.UPLOADING)
134-
self._upload_information_pieces(rag_information_pieces)
163+
logger.error("Exception occured while enhancing confluence langchain document %s" % e)
164+
raise e
135165

136-
async def _delete_previous_information_pieces(self):
166+
async def _delete_previous_information_pieces(self, index=0):
137167
try:
138-
await self._document_deleter.adelete_document(self._settings.document_name)
168+
await self._document_deleter.adelete_document(self._settings.document_name[index])
139169
except HTTPException as e:
140170
logger.error(
141171
(
142172
"Error while trying to delete documents with id: %s before uploading %s."
143173
"NOTE: Still continuing with upload."
144174
),
145-
self._settings.document_name,
175+
self._settings.document_name[index],
146176
e,
147177
)
148178

149-
def _upload_information_pieces(self, rag_api_documents):
179+
def _upload_information_pieces(self, rag_api_documents, index=0):
150180
try:
151181
self._rag_api.upload_information_piece(rag_api_documents)
152-
self._key_value_store.upsert(self._settings.document_name, Status.READY)
182+
self._key_value_store.upsert(self._settings.document_name[index], Status.READY)
153183
logger.info("Confluence loaded successfully")
154184
except Exception as e:
155-
self._key_value_store.upsert(self._settings.document_name, Status.ERROR)
185+
self._key_value_store.upsert(self._settings.document_name[index], Status.ERROR)
156186
logger.error("Error while uploading Confluence to the database: %s", str(e))
157187
raise HTTPException(500, f"Error loading from Confluence: {str(e)}") from e
158188

159-
def _sanitize_document_name(self) -> None:
160-
document_name = self._settings.document_name if self._settings.document_name else self._settings.url
189+
def _sanitize_document_name(self, index) -> None:
190+
document_name = (
191+
self._settings.document_name[index] if self._settings.document_name[index] else self._settings.url[index]
192+
)
161193
document_name = document_name.replace("http://", "").replace("https://", "")
162194

163-
self._settings.document_name = sanitize_document_name(document_name)
195+
self._settings.document_name[index] = sanitize_document_name(document_name)

admin-api-lib/src/admin_api_lib/impl/information_enhancer/page_summary_enhancer.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,15 @@ class PageSummaryEnhancer(SummaryEnhancer):
2929

3030
async def _acreate_summary(self, information: list[Document], config: Optional[RunnableConfig]) -> list[Document]:
3131
# group infos by page, defaulting to page 1 if no page metadata
32+
if self._chunker_settings:
33+
filtered_information = [
34+
info for info in information if len(info.page_content) > self._chunker_settings.max_size
35+
]
36+
else:
37+
filtered_information = information
3238
grouped = [
33-
[info for info in information if info.metadata.get("page", self.DEFAULT_PAGE_NR) == page]
34-
for page in {info_piece.metadata.get("page", self.DEFAULT_PAGE_NR) for info_piece in information}
39+
[info for info in filtered_information if info.metadata.get("page", self.DEFAULT_PAGE_NR) == page]
40+
for page in {info_piece.metadata.get("page", self.DEFAULT_PAGE_NR) for info_piece in filtered_information}
3541
]
3642

3743
summary_tasks = [self._asummarize_page(info_group, config) for info_group in tqdm(grouped)]

admin-api-lib/src/admin_api_lib/impl/information_enhancer/summary_enhancer.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from abc import abstractmethod
44
from typing import Optional
55

6+
from admin_api_lib.impl.settings.chunker_settings import ChunkerSettings
67
from langchain_core.documents import Document
78
from langchain_core.runnables import RunnableConfig, ensure_config
89

@@ -26,7 +27,7 @@ class SummaryEnhancer(InformationEnhancer):
2627

2728
INFORMATION_METADATA_TYPE = "type"
2829

29-
def __init__(self, summarizer: Summarizer):
30+
def __init__(self, summarizer: Summarizer, chunker_settings: ChunkerSettings = None):
3031
"""
3132
Initialize the SummaryEnhancer with a given Summarizer instance.
3233
@@ -37,6 +38,7 @@ def __init__(self, summarizer: Summarizer):
3738
"""
3839
super().__init__()
3940
self._summarizer = summarizer
41+
self._chunker_settings = chunker_settings
4042

4143
@staticmethod
4244
def _is_relevant(information: Document) -> bool:

admin-api-lib/src/admin_api_lib/impl/key_db/file_status_key_value_store.py

-1
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,6 @@ def upsert(self, file_name: str, file_status: Status) -> None:
7676
None
7777
"""
7878
self.remove(file_name)
79-
8079
self._redis.sadd(self.STORAGE_KEY, FileStatusKeyValueStore._to_str(file_name, file_status))
8180

8281
def remove(self, file_name: str) -> None:

admin-api-lib/src/admin_api_lib/impl/mapper/confluence_settings_mapper.py

+9-8
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ class ConfluenceSettingsMapper:
1010
"""Mapper class for converting ConfluenceSettings to ConfluenceParameters."""
1111

1212
@staticmethod
13-
def map_settings_to_params(settings: ConfluenceSettings) -> ConfluenceParameters:
13+
def map_settings_to_params(settings: ConfluenceSettings, index) -> ConfluenceParameters:
1414
"""
1515
Map ConfluenceSettings to ConfluenceParameters.
1616
@@ -25,11 +25,12 @@ def map_settings_to_params(settings: ConfluenceSettings) -> ConfluenceParameters
2525
The parameters object for API consumption.
2626
"""
2727
return ConfluenceParameters(
28-
url=settings.url,
29-
token=settings.token,
30-
space_key=settings.space_key,
31-
include_attachments=settings.include_attachments,
32-
keep_markdown_format=settings.keep_markdown_format,
33-
keep_newlines=settings.keep_newlines,
34-
document_name=settings.document_name,
28+
url=settings.url[index],
29+
token=settings.token[index],
30+
space_key=settings.space_key[index],
31+
include_attachments=settings.include_attachments[index],
32+
keep_markdown_format=settings.keep_markdown_format[index],
33+
keep_newlines=settings.keep_newlines[index],
34+
document_name=settings.document_name[index],
35+
confluence_kwargs=[{"key": "verify_ssl", "value": settings.verify_ssl[index]}],
3536
)

0 commit comments

Comments
 (0)