From 695d9e1833cf0823965d9becdb9621cc552c2039 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Fri, 9 May 2025 08:54:15 +0200 Subject: [PATCH 01/43] api change --- admin-api-lib/openapi.yaml | 85 ++-- .../src/admin_api_lib/apis/admin_api.py | 84 ++-- .../src/admin_api_lib/apis/admin_api_base.py | 43 +- .../openapi_client/__init__.py | 53 +-- .../openapi_client/api/__init__.py | 5 +- .../openapi_client/api/extractor_api.py | 372 ++++-------------- .../openapi_client/api_client.py | 313 ++++++++++----- .../openapi_client/api_response.py | 11 +- .../openapi_client/configuration.py | 108 +++-- .../openapi_client/exceptions.py | 40 +- .../openapi_client/models/__init__.py | 29 +- .../openapi_client/models/content_type.py | 24 +- .../models/extraction_request.py | 53 ++- .../models/information_piece.py | 56 ++- .../openapi_client/models/key_value_pair.py | 40 +- .../openapi_client/rest.py | 118 ++++-- .../openapi_client/test/test_content_type.py | 33 ++ .../test/test_extraction_request.py | 56 +++ .../openapi_client/test/test_extractor_api.py | 37 ++ .../test/test_information_piece.py | 60 +++ .../test/test_key_value_pair.py | 52 +++ .../admin_api_lib/models/document_status.py | 40 +- .../src/admin_api_lib/models/extra_models.py | 1 - .../admin_api_lib/models/key_value_pair.py | 102 +++++ .../src/admin_api_lib/models/status.py | 27 +- .../src/admin_api_lib/models/upload_source.py | 102 +++++ extractor-api-lib/openapi.yaml | 96 +---- .../extractor_api_lib/apis/extractor_api.py | 78 ++-- .../apis/extractor_api_base.py | 49 +-- .../extractor_api_lib/models/content_type.py | 25 +- .../extractor_api_lib/models/extra_models.py | 1 - .../models/extraction_request.py | 54 ++- .../models/information_piece.py | 51 ++- .../models/key_value_pair.py | 43 +- rag-core-api/src/rag_core_api/apis/rag_api.py | 35 +- .../src/rag_core_api/apis/rag_api_base.py | 13 +- .../src/rag_core_api/models/chat_history.py | 49 ++- .../models/chat_history_message.py | 40 +- 
.../src/rag_core_api/models/chat_request.py | 47 +-- .../src/rag_core_api/models/chat_response.py | 53 ++- .../src/rag_core_api/models/chat_role.py | 23 +- .../src/rag_core_api/models/content_type.py | 27 +- .../src/rag_core_api/models/delete_request.py | 49 ++- .../src/rag_core_api/models/extra_models.py | 1 - .../rag_core_api/models/information_piece.py | 55 ++- .../src/rag_core_api/models/key_value_pair.py | 37 +- 46 files changed, 1527 insertions(+), 1243 deletions(-) create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_content_type.py create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_request.py create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extractor_api.py create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_information_piece.py create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_key_value_pair.py create mode 100644 admin-api-lib/src/admin_api_lib/models/key_value_pair.py create mode 100644 admin-api-lib/src/admin_api_lib/models/upload_source.py diff --git a/admin-api-lib/openapi.yaml b/admin-api-lib/openapi.yaml index c1b8afe..1b8255a 100644 --- a/admin-api-lib/openapi.yaml +++ b/admin-api-lib/openapi.yaml @@ -47,29 +47,6 @@ paths: description: Internal server error tags: - admin - /upload_documents: - post: - description: Uploads user selected pdf documents. - operationId: upload_documents_post - requestBody: - content: - application/pdf: - schema: - format: binary - type: string - description: The PDF document to upload. - required: true - responses: - "200": - description: ok - "400": - description: Bad request - "422": - description: If no text has been extracted from the file. 
- "500": - description: Internal server error - tags: - - admin /delete_document/{identification}: delete: operationId: delete_document @@ -104,22 +81,28 @@ paths: description: Internal server error tags: - admin - /load_confluence: + /upload_source: post: + description: Uploads user selected pdf documents. + operationId: upload_source + requestBody: + content: + application/pdf: + schema: + $ref: '#/components/schemas/upload_source' + description: The PDF document to upload. + required: true responses: "200": - description: Loading from confluence is successful - "423": - description: "if the confluence loader is already processing a request,\ - \ no further requests are possible. The user needs to wait, till the preliminary\ - \ request finished processing." + description: ok + "400": + description: Bad request + "422": + description: If no text has been extracted from the file. "500": - description: Internal Server Error - "501": - description: The confluence loader is not set up + description: Internal server error tags: - admin - summary: Loading confluence to the vector db components: schemas: status: @@ -148,3 +131,39 @@ components: - status title: document_status type: object + upload_source: + description: "" + properties: + file: + description: "" + format: binary + title: file + type: string + type: + description: "" + title: type + type: string + kwargs: + description: "" + items: + $ref: '#/components/schemas/key_value_pair' + title: kwargs + type: array + required: + - type + title: upload_source + type: object + key_value_pair: + description: "" + example: + value: value + key: key + properties: + key: + description: "" + title: Key + value: + description: "" + title: Value + title: MetaInformationPiece + type: object diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api.py b/admin-api-lib/src/admin_api_lib/apis/admin_api.py index 16efc4b..622cd5a 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api.py +++ 
b/admin-api-lib/src/admin_api_lib/apis/admin_api.py @@ -2,14 +2,36 @@ # coding: utf-8 +from typing import Dict, List # noqa: F401 import importlib import pkgutil +from admin_api_lib.apis.admin_api_base import BaseAdminApi from fastapi import APIRouter, Path, Request, Response, UploadFile # noqa: F401 import admin_api_lib.impl -from admin_api_lib.apis.admin_api_base import BaseAdminApi + +from fastapi import ( # noqa: F401 + APIRouter, + Body, + Cookie, + Depends, + Form, + Header, + HTTPException, + Path, + Query, + Response, + Security, + status, +) + +from admin_api_lib.models.extra_models import TokenModel # noqa: F401 +from pydantic import Field, StrictBytes, StrictStr +from typing import Any, List, Tuple, Union +from typing_extensions import Annotated from admin_api_lib.models.document_status import DocumentStatus +from admin_api_lib.models.upload_source import UploadSource router = APIRouter() @@ -43,6 +65,8 @@ async def delete_document( ------- None """ + if not BaseAdminApi.subclasses: + raise HTTPException(status_code=500, detail="Not implemented") return await BaseAdminApi.subclasses[0]().delete_document(identification) @@ -73,6 +97,8 @@ async def document_reference_id_get( Response The response object containing the document reference details. """ + if not BaseAdminApi.subclasses: + raise HTTPException(status_code=500, detail="Not implemented") return await BaseAdminApi.subclasses[0]().document_reference_id_get(identification) @@ -94,39 +120,13 @@ async def get_all_documents_status() -> list[DocumentStatus]: list[DocumentStatus] A list containing the status of all documents. 
""" + if not BaseAdminApi.subclasses: + raise HTTPException(status_code=500, detail="Not implemented") return await BaseAdminApi.subclasses[0]().get_all_documents_status() @router.post( - "/load_confluence", - responses={ - 200: {"description": "Loading from confluence is successful"}, - 423: { - "description": ( - "if the confluence loader is already processing a request," - "no further requests are possible. The user needs to wait," - "till the preliminary request finished processing." - ) - }, - 500: {"description": "Internal Server Error"}, - 501: {"description": "The confluence loader is not set up"}, - }, - tags=["admin"], - response_model_by_alias=True, -) -async def load_confluence_post() -> None: - """ - Asynchronously loads a Confluence space. - - Returns - ------- - None - """ - return await BaseAdminApi.subclasses[0]().load_confluence_post() - - -@router.post( - "/upload_documents", + "/upload_source", responses={ 200: {"description": "ok"}, 400: {"description": "Bad request"}, @@ -136,22 +136,10 @@ async def load_confluence_post() -> None: tags=["admin"], response_model_by_alias=True, ) -async def upload_documents_post( - body: UploadFile, - request: Request, +async def upload_source( + upload_source: Annotated[UploadSource, Field(description="The source to upload.")] = Body(None, description="The source to upload."), ) -> None: - """ - Asynchronously uploads user-selected source documents. - - Parameters - ---------- - body : UploadFile - The file object containing the source documents to be uploaded. - request : Request - The request object containing metadata about the upload request. 
- - Returns - ------- - None - """ - return await BaseAdminApi.subclasses[0]().upload_documents_post(body, request) + """Uploads user selected source.""" + if not BaseAdminApi.subclasses: + raise HTTPException(status_code=500, detail="Not implemented") + return await BaseAdminApi.subclasses[0]().upload_source(upload_source) diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py index 6d12beb..efeb120 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py @@ -1,13 +1,14 @@ -"""Module for the base AdminApi interface.""" - # coding: utf-8 -# flake8: noqa: D105 -from typing import ClassVar, Tuple # noqa: F401 +from typing import ClassVar, Dict, List, Tuple # noqa: F401 +from pydantic import Field, StrictBytes, StrictStr +from typing import Any, List, Tuple, Union +from typing_extensions import Annotated from fastapi import Request, Response, UploadFile from admin_api_lib.models.document_status import DocumentStatus +from admin_api_lib.models.upload_source import UploadSource class BaseAdminApi: @@ -28,7 +29,7 @@ def __init_subclass__(cls, **kwargs): async def delete_document( self, - identification: str, + identification: StrictStr, ) -> None: """ Asynchronously deletes a document based on the provided identification. @@ -43,6 +44,7 @@ async def delete_document( None """ + async def document_reference_id_get( self, identification: str, @@ -61,6 +63,7 @@ async def document_reference_id_get( The response object containing the document reference details. """ + async def get_all_documents_status( self, ) -> list[DocumentStatus]: @@ -73,33 +76,9 @@ async def get_all_documents_status( A list containing the status of all documents. """ - async def load_confluence_post( - self, - ) -> None: - """ - Asynchronously loads a Confluence space. 
- Returns - ------- - None - """ - - async def upload_documents_post( + async def upload_source( self, - body: UploadFile, - request: Request, + upload_source: Annotated[UploadSource, Field(description="The PDF document to upload.")], ) -> None: - """ - Asynchronously uploads user-selected source documents. - - Parameters - ---------- - body : UploadFile - The file object containing the source documents to be uploaded. - request : Request - The request object containing metadata about the upload request. - - Returns - ------- - None - """ + ... diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/__init__.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/__init__.py index 79a89e3..ae86262 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/__init__.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/__init__.py @@ -3,52 +3,35 @@ # flake8: noqa """ -extractor-api-lib + extractor-api-lib -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. 
""" # noqa: E501 __version__ = "1.0.0" # import apis into sdk package -from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ( - ExtractorApi, -) -from admin_api_lib.extractor_api_client.openapi_client.api_client import ApiClient +from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi # import ApiClient from admin_api_lib.extractor_api_client.openapi_client.api_response import ApiResponse -from admin_api_lib.extractor_api_client.openapi_client.configuration import ( - Configuration, -) -from admin_api_lib.extractor_api_client.openapi_client.exceptions import ( - ApiAttributeError, - ApiException, - ApiKeyError, - ApiTypeError, - ApiValueError, - OpenApiException, -) +from admin_api_lib.extractor_api_client.openapi_client.api_client import ApiClient +from admin_api_lib.extractor_api_client.openapi_client.configuration import Configuration +from admin_api_lib.extractor_api_client.openapi_client.exceptions import OpenApiException +from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiTypeError +from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiValueError +from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiKeyError +from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiAttributeError +from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiException # import models into sdk package -from admin_api_lib.extractor_api_client.openapi_client.models.confluence_parameters import ( - ConfluenceParameters, -) -from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ( - ContentType, -) -from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ( - ExtractionRequest, -) -from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import ( - InformationPiece, -) -from 
admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import ( - KeyValuePair, -) +from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ContentType +from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ExtractionRequest +from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import InformationPiece +from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/__init__.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/__init__.py index 13a312f..792725e 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/__init__.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/__init__.py @@ -1,6 +1,5 @@ # flake8: noqa # import apis into api package -from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ( - ExtractorApi, -) +from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi + diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py index f1fddba..e4a0fa6 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py @@ -1,36 +1,27 @@ # coding: utf-8 """ -extractor-api-lib + extractor-api-lib -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the 
OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. """ # noqa: E501 import warnings +from pydantic import validate_call, Field, StrictFloat, StrictStr, StrictInt from typing import Any, Dict, List, Optional, Tuple, Union - -from pydantic import Field, StrictFloat, StrictInt, StrictStr, validate_call from typing_extensions import Annotated -from admin_api_lib.extractor_api_client.openapi_client.api_client import ( - ApiClient, - RequestSerialized, -) +from typing import List +from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ExtractionRequest +from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import InformationPiece + +from admin_api_lib.extractor_api_client.openapi_client.api_client import ApiClient, RequestSerialized from admin_api_lib.extractor_api_client.openapi_client.api_response import ApiResponse -from admin_api_lib.extractor_api_client.openapi_client.models.confluence_parameters import ( - ConfluenceParameters, -) -from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ( - ExtractionRequest, -) -from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import ( - InformationPiece, -) from admin_api_lib.extractor_api_client.openapi_client.rest import RESTResponseType @@ -46,244 +37,6 @@ def __init__(self, api_client=None) -> None: api_client = ApiClient.get_default() self.api_client = api_client - @validate_call - def extract_from_confluence_post( - self, - confluence_parameters: ConfluenceParameters, - _request_timeout: Union[ - None, - Annotated[StrictFloat, Field(gt=0)], - Tuple[Annotated[StrictFloat, Field(gt=0)], Annotated[StrictFloat, Field(gt=0)]], - ] = None, - _request_auth: Optional[Dict[StrictStr, Any]] = None, - _content_type: Optional[StrictStr] = None, - _headers: Optional[Dict[StrictStr, Any]] = None, - _host_index: 
Annotated[StrictInt, Field(ge=0, le=0)] = 0, - ) -> List[InformationPiece]: - """extract_from_confluence_post - - - :param confluence_parameters: (required) - :type confluence_parameters: ConfluenceParameters - :param _request_timeout: timeout setting for this request. If one - number provided, it will be total request - timeout. It can also be a pair (tuple) of - (connection, read) timeouts. - :type _request_timeout: int, tuple(int, int), optional - :param _request_auth: set to override the auth_settings for an a single - request; this effectively ignores the - authentication in the spec for a single request. - :type _request_auth: dict, optional - :param _content_type: force content-type for the request. - :type _content_type: str, Optional - :param _headers: set to override the headers for a single - request; this effectively ignores the headers - in the spec for a single request. - :type _headers: dict, optional - :param _host_index: set to override the host_index for a single - request; this effectively ignores the host_index - in the spec for a single request. - :type _host_index: int, optional - :return: Returns the result object. 
- """ # noqa: E501 - - _param = self._extract_from_confluence_post_serialize( - confluence_parameters=confluence_parameters, - _request_auth=_request_auth, - _content_type=_content_type, - _headers=_headers, - _host_index=_host_index, - ) - - _response_types_map: Dict[str, Optional[str]] = { - "200": "List[InformationPiece]", - "404": None, - "422": None, - "500": None, - } - response_data = self.api_client.call_api(*_param, _request_timeout=_request_timeout) - response_data.read() - return self.api_client.response_deserialize( - response_data=response_data, - response_types_map=_response_types_map, - ).data - - @validate_call - def extract_from_confluence_post_with_http_info( - self, - confluence_parameters: ConfluenceParameters, - _request_timeout: Union[ - None, - Annotated[StrictFloat, Field(gt=0)], - Tuple[Annotated[StrictFloat, Field(gt=0)], Annotated[StrictFloat, Field(gt=0)]], - ] = None, - _request_auth: Optional[Dict[StrictStr, Any]] = None, - _content_type: Optional[StrictStr] = None, - _headers: Optional[Dict[StrictStr, Any]] = None, - _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0, - ) -> ApiResponse[List[InformationPiece]]: - """extract_from_confluence_post - - - :param confluence_parameters: (required) - :type confluence_parameters: ConfluenceParameters - :param _request_timeout: timeout setting for this request. If one - number provided, it will be total request - timeout. It can also be a pair (tuple) of - (connection, read) timeouts. - :type _request_timeout: int, tuple(int, int), optional - :param _request_auth: set to override the auth_settings for an a single - request; this effectively ignores the - authentication in the spec for a single request. - :type _request_auth: dict, optional - :param _content_type: force content-type for the request. - :type _content_type: str, Optional - :param _headers: set to override the headers for a single - request; this effectively ignores the headers - in the spec for a single request. 
- :type _headers: dict, optional - :param _host_index: set to override the host_index for a single - request; this effectively ignores the host_index - in the spec for a single request. - :type _host_index: int, optional - :return: Returns the result object. - """ # noqa: E501 - - _param = self._extract_from_confluence_post_serialize( - confluence_parameters=confluence_parameters, - _request_auth=_request_auth, - _content_type=_content_type, - _headers=_headers, - _host_index=_host_index, - ) - - _response_types_map: Dict[str, Optional[str]] = { - "200": "List[InformationPiece]", - "404": None, - "422": None, - "500": None, - } - response_data = self.api_client.call_api(*_param, _request_timeout=_request_timeout) - response_data.read() - return self.api_client.response_deserialize( - response_data=response_data, - response_types_map=_response_types_map, - ) - - @validate_call - def extract_from_confluence_post_without_preload_content( - self, - confluence_parameters: ConfluenceParameters, - _request_timeout: Union[ - None, - Annotated[StrictFloat, Field(gt=0)], - Tuple[Annotated[StrictFloat, Field(gt=0)], Annotated[StrictFloat, Field(gt=0)]], - ] = None, - _request_auth: Optional[Dict[StrictStr, Any]] = None, - _content_type: Optional[StrictStr] = None, - _headers: Optional[Dict[StrictStr, Any]] = None, - _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0, - ) -> RESTResponseType: - """extract_from_confluence_post - - - :param confluence_parameters: (required) - :type confluence_parameters: ConfluenceParameters - :param _request_timeout: timeout setting for this request. If one - number provided, it will be total request - timeout. It can also be a pair (tuple) of - (connection, read) timeouts. - :type _request_timeout: int, tuple(int, int), optional - :param _request_auth: set to override the auth_settings for an a single - request; this effectively ignores the - authentication in the spec for a single request. 
- :type _request_auth: dict, optional - :param _content_type: force content-type for the request. - :type _content_type: str, Optional - :param _headers: set to override the headers for a single - request; this effectively ignores the headers - in the spec for a single request. - :type _headers: dict, optional - :param _host_index: set to override the host_index for a single - request; this effectively ignores the host_index - in the spec for a single request. - :type _host_index: int, optional - :return: Returns the result object. - """ # noqa: E501 - - _param = self._extract_from_confluence_post_serialize( - confluence_parameters=confluence_parameters, - _request_auth=_request_auth, - _content_type=_content_type, - _headers=_headers, - _host_index=_host_index, - ) - - _response_types_map: Dict[str, Optional[str]] = { - "200": "List[InformationPiece]", - "404": None, - "422": None, - "500": None, - } - response_data = self.api_client.call_api(*_param, _request_timeout=_request_timeout) - return response_data.response - - def _extract_from_confluence_post_serialize( - self, - confluence_parameters, - _request_auth, - _content_type, - _headers, - _host_index, - ) -> RequestSerialized: - _host = None - - _collection_formats: Dict[str, str] = {} - - _path_params: Dict[str, str] = {} - _query_params: List[Tuple[str, str]] = [] - _header_params: Dict[str, Optional[str]] = _headers or {} - _form_params: List[Tuple[str, str]] = [] - _files: Dict[str, Union[str, bytes, List[str], List[bytes], List[Tuple[str, bytes]]]] = {} - _body_params: Optional[bytes] = None - - # process the path parameters - # process the query parameters - # process the header parameters - # process the form parameters - # process the body parameter - if confluence_parameters is not None: - _body_params = confluence_parameters - - # set the HTTP header `Accept` - if "Accept" not in _header_params: - _header_params["Accept"] = self.api_client.select_header_accept(["application/json"]) - - # set the 
HTTP header `Content-Type` - if _content_type: - _header_params["Content-Type"] = _content_type - else: - _default_content_type = self.api_client.select_header_content_type(["application/json"]) - if _default_content_type is not None: - _header_params["Content-Type"] = _default_content_type - - # authentication setting - _auth_settings: List[str] = [] - - return self.api_client.param_serialize( - method="POST", - resource_path="/extract_from_confluence", - path_params=_path_params, - query_params=_query_params, - header_params=_header_params, - body=_body_params, - post_params=_form_params, - files=_files, - auth_settings=_auth_settings, - collection_formats=_collection_formats, - _host=_host, - _request_auth=_request_auth, - ) @validate_call def extract_from_file_post( @@ -292,7 +45,10 @@ def extract_from_file_post( _request_timeout: Union[ None, Annotated[StrictFloat, Field(gt=0)], - Tuple[Annotated[StrictFloat, Field(gt=0)], Annotated[StrictFloat, Field(gt=0)]], + Tuple[ + Annotated[StrictFloat, Field(gt=0)], + Annotated[StrictFloat, Field(gt=0)] + ] ] = None, _request_auth: Optional[Dict[StrictStr, Any]] = None, _content_type: Optional[StrictStr] = None, @@ -324,28 +80,32 @@ def extract_from_file_post( in the spec for a single request. :type _host_index: int, optional :return: Returns the result object. 
- """ # noqa: E501 + """ # noqa: E501 _param = self._extract_from_file_post_serialize( extraction_request=extraction_request, _request_auth=_request_auth, _content_type=_content_type, _headers=_headers, - _host_index=_host_index, + _host_index=_host_index ) _response_types_map: Dict[str, Optional[str]] = { - "200": "List[InformationPiece]", - "422": None, - "500": None, + '200': "List[InformationPiece]", + '422': None, + '500': None, } - response_data = self.api_client.call_api(*_param, _request_timeout=_request_timeout) + response_data = self.api_client.call_api( + *_param, + _request_timeout=_request_timeout + ) response_data.read() return self.api_client.response_deserialize( response_data=response_data, response_types_map=_response_types_map, ).data + @validate_call def extract_from_file_post_with_http_info( self, @@ -353,7 +113,10 @@ def extract_from_file_post_with_http_info( _request_timeout: Union[ None, Annotated[StrictFloat, Field(gt=0)], - Tuple[Annotated[StrictFloat, Field(gt=0)], Annotated[StrictFloat, Field(gt=0)]], + Tuple[ + Annotated[StrictFloat, Field(gt=0)], + Annotated[StrictFloat, Field(gt=0)] + ] ] = None, _request_auth: Optional[Dict[StrictStr, Any]] = None, _content_type: Optional[StrictStr] = None, @@ -385,28 +148,32 @@ def extract_from_file_post_with_http_info( in the spec for a single request. :type _host_index: int, optional :return: Returns the result object. 
- """ # noqa: E501 + """ # noqa: E501 _param = self._extract_from_file_post_serialize( extraction_request=extraction_request, _request_auth=_request_auth, _content_type=_content_type, _headers=_headers, - _host_index=_host_index, + _host_index=_host_index ) _response_types_map: Dict[str, Optional[str]] = { - "200": "List[InformationPiece]", - "422": None, - "500": None, + '200': "List[InformationPiece]", + '422': None, + '500': None, } - response_data = self.api_client.call_api(*_param, _request_timeout=_request_timeout) + response_data = self.api_client.call_api( + *_param, + _request_timeout=_request_timeout + ) response_data.read() return self.api_client.response_deserialize( response_data=response_data, response_types_map=_response_types_map, ) + @validate_call def extract_from_file_post_without_preload_content( self, @@ -414,7 +181,10 @@ def extract_from_file_post_without_preload_content( _request_timeout: Union[ None, Annotated[StrictFloat, Field(gt=0)], - Tuple[Annotated[StrictFloat, Field(gt=0)], Annotated[StrictFloat, Field(gt=0)]], + Tuple[ + Annotated[StrictFloat, Field(gt=0)], + Annotated[StrictFloat, Field(gt=0)] + ] ] = None, _request_auth: Optional[Dict[StrictStr, Any]] = None, _content_type: Optional[StrictStr] = None, @@ -446,24 +216,28 @@ def extract_from_file_post_without_preload_content( in the spec for a single request. :type _host_index: int, optional :return: Returns the result object. 
- """ # noqa: E501 + """ # noqa: E501 _param = self._extract_from_file_post_serialize( extraction_request=extraction_request, _request_auth=_request_auth, _content_type=_content_type, _headers=_headers, - _host_index=_host_index, + _host_index=_host_index ) _response_types_map: Dict[str, Optional[str]] = { - "200": "List[InformationPiece]", - "422": None, - "500": None, + '200': "List[InformationPiece]", + '422': None, + '500': None, } - response_data = self.api_client.call_api(*_param, _request_timeout=_request_timeout) + response_data = self.api_client.call_api( + *_param, + _request_timeout=_request_timeout + ) return response_data.response + def _extract_from_file_post_serialize( self, extraction_request, @@ -472,15 +246,19 @@ def _extract_from_file_post_serialize( _headers, _host_index, ) -> RequestSerialized: + _host = None - _collection_formats: Dict[str, str] = {} + _collection_formats: Dict[str, str] = { + } _path_params: Dict[str, str] = {} _query_params: List[Tuple[str, str]] = [] _header_params: Dict[str, Optional[str]] = _headers or {} _form_params: List[Tuple[str, str]] = [] - _files: Dict[str, Union[str, bytes, List[str], List[bytes], List[Tuple[str, bytes]]]] = {} + _files: Dict[ + str, Union[str, bytes, List[str], List[bytes], List[Tuple[str, bytes]]] + ] = {} _body_params: Optional[bytes] = None # process the path parameters @@ -491,24 +269,36 @@ def _extract_from_file_post_serialize( if extraction_request is not None: _body_params = extraction_request + # set the HTTP header `Accept` - if "Accept" not in _header_params: - _header_params["Accept"] = self.api_client.select_header_accept(["application/json"]) + if 'Accept' not in _header_params: + _header_params['Accept'] = self.api_client.select_header_accept( + [ + 'application/json' + ] + ) # set the HTTP header `Content-Type` if _content_type: - _header_params["Content-Type"] = _content_type + _header_params['Content-Type'] = _content_type else: - _default_content_type = 
self.api_client.select_header_content_type(["application/json"]) + _default_content_type = ( + self.api_client.select_header_content_type( + [ + 'application/json' + ] + ) + ) if _default_content_type is not None: - _header_params["Content-Type"] = _default_content_type + _header_params['Content-Type'] = _default_content_type # authentication setting - _auth_settings: List[str] = [] + _auth_settings: List[str] = [ + ] return self.api_client.param_serialize( - method="POST", - resource_path="/extract_from_file", + method='POST', + resource_path='/extract', path_params=_path_params, query_params=_query_params, header_params=_header_params, @@ -518,5 +308,7 @@ def _extract_from_file_post_serialize( auth_settings=_auth_settings, collection_formats=_collection_formats, _host=_host, - _request_auth=_request_auth, + _request_auth=_request_auth ) + + diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_client.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_client.py index 911fd0d..befdba6 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_client.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_client.py @@ -1,53 +1,47 @@ # coding: utf-8 """ -extractor-api-lib + extractor-api-lib -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. 
""" # noqa: E501 import datetime +from dateutil.parser import parse +from enum import Enum import decimal import json import mimetypes import os import re import tempfile -from enum import Enum -from typing import Dict, List, Optional, Tuple, Union -from urllib.parse import quote -from dateutil.parser import parse +from urllib.parse import quote +from typing import Tuple, Optional, List, Dict, Union from pydantic import SecretStr +from admin_api_lib.extractor_api_client.openapi_client.configuration import Configuration +from admin_api_lib.extractor_api_client.openapi_client.api_response import ApiResponse, T as ApiResponseT import admin_api_lib.extractor_api_client.openapi_client.models from admin_api_lib.extractor_api_client.openapi_client import rest -from admin_api_lib.extractor_api_client.openapi_client.api_response import ApiResponse -from admin_api_lib.extractor_api_client.openapi_client.api_response import ( - T as ApiResponseT, -) -from admin_api_lib.extractor_api_client.openapi_client.configuration import ( - Configuration, -) from admin_api_lib.extractor_api_client.openapi_client.exceptions import ( - ApiException, ApiValueError, + ApiException, BadRequestException, + UnauthorizedException, ForbiddenException, NotFoundException, - ServiceException, - UnauthorizedException, + ServiceException ) RequestSerialized = Tuple[str, str, Dict[str, str], Optional[str], List[str]] - class ApiClient: """Generic API client for OpenAPI client library builds. @@ -66,19 +60,25 @@ class ApiClient: PRIMITIVE_TYPES = (float, bool, bytes, str, int) NATIVE_TYPES_MAPPING = { - "int": int, - "long": int, # TODO remove as only py3 is supported? - "float": float, - "str": str, - "bool": bool, - "date": datetime.date, - "datetime": datetime.datetime, - "decimal": decimal.Decimal, - "object": object, + 'int': int, + 'long': int, # TODO remove as only py3 is supported? 
+ 'float': float, + 'str': str, + 'bool': bool, + 'date': datetime.date, + 'datetime': datetime.datetime, + 'decimal': decimal.Decimal, + 'object': object, } _pool = None - def __init__(self, configuration=None, header_name=None, header_value=None, cookie=None) -> None: + def __init__( + self, + configuration=None, + header_name=None, + header_value=None, + cookie=None + ) -> None: # use default configuration if none is provided if configuration is None: configuration = Configuration.get_default() @@ -90,7 +90,7 @@ def __init__(self, configuration=None, header_name=None, header_value=None, cook self.default_headers[header_name] = header_value self.cookie = cookie # Set default User-Agent. - self.user_agent = "OpenAPI-Generator/1.0.0/python" + self.user_agent = 'OpenAPI-Generator/1.0.0/python' self.client_side_validation = configuration.client_side_validation def __enter__(self): @@ -102,15 +102,16 @@ def __exit__(self, exc_type, exc_value, traceback): @property def user_agent(self): """User agent for this API client""" - return self.default_headers["User-Agent"] + return self.default_headers['User-Agent'] @user_agent.setter def user_agent(self, value): - self.default_headers["User-Agent"] = value + self.default_headers['User-Agent'] = value def set_default_header(self, header_name, header_value): self.default_headers[header_name] = header_value + _default = None @classmethod @@ -146,12 +147,12 @@ def param_serialize( header_params=None, body=None, post_params=None, - files=None, - auth_settings=None, + files=None, auth_settings=None, collection_formats=None, _host=None, - _request_auth=None, + _request_auth=None ) -> RequestSerialized: + """Builds the HTTP request params needed by the request. :param method: Method to call. :param resource_path: Path to method endpoint. 
@@ -180,30 +181,47 @@ def param_serialize( header_params = header_params or {} header_params.update(self.default_headers) if self.cookie: - header_params["Cookie"] = self.cookie + header_params['Cookie'] = self.cookie if header_params: header_params = self.sanitize_for_serialization(header_params) - header_params = dict(self.parameters_to_tuples(header_params, collection_formats)) + header_params = dict( + self.parameters_to_tuples(header_params,collection_formats) + ) # path parameters if path_params: path_params = self.sanitize_for_serialization(path_params) - path_params = self.parameters_to_tuples(path_params, collection_formats) + path_params = self.parameters_to_tuples( + path_params, + collection_formats + ) for k, v in path_params: # specified safe chars, encode everything - resource_path = resource_path.replace("{%s}" % k, quote(str(v), safe=config.safe_chars_for_path_param)) + resource_path = resource_path.replace( + '{%s}' % k, + quote(str(v), safe=config.safe_chars_for_path_param) + ) # post parameters if post_params or files: post_params = post_params if post_params else [] post_params = self.sanitize_for_serialization(post_params) - post_params = self.parameters_to_tuples(post_params, collection_formats) + post_params = self.parameters_to_tuples( + post_params, + collection_formats + ) if files: post_params.extend(self.files_parameters(files)) # auth setting self.update_params_for_auth( - header_params, query_params, auth_settings, resource_path, method, body, request_auth=_request_auth + header_params, + query_params, + auth_settings, + resource_path, + method, + body, + request_auth=_request_auth ) # body @@ -220,13 +238,23 @@ def param_serialize( # query parameters if query_params: query_params = self.sanitize_for_serialization(query_params) - url_query = self.parameters_to_url_query(query_params, collection_formats) + url_query = self.parameters_to_url_query( + query_params, + collection_formats + ) url += "?" 
+ url_query return method, url, header_params, body, post_params + def call_api( - self, method, url, header_params=None, body=None, post_params=None, _request_timeout=None + self, + method, + url, + header_params=None, + body=None, + post_params=None, + _request_timeout=None ) -> rest.RESTResponse: """Makes the HTTP request (synchronous) :param method: Method to call. @@ -243,12 +271,10 @@ def call_api( try: # perform request and return response response_data = self.rest_client.request( - method, - url, + method, url, headers=header_params, - body=body, - post_params=post_params, - _request_timeout=_request_timeout, + body=body, post_params=post_params, + _request_timeout=_request_timeout ) except ApiException as e: @@ -257,7 +283,9 @@ def call_api( return response_data def response_deserialize( - self, response_data: rest.RESTResponse, response_types_map: Optional[Dict[str, ApiResponseT]] = None + self, + response_data: rest.RESTResponse, + response_types_map: Optional[Dict[str, ApiResponseT]]=None ) -> ApiResponse[ApiResponseT]: """Deserializes response into an object. :param response_data: RESTResponse object to be deserialized. 
@@ -283,7 +311,7 @@ def response_deserialize( return_data = self.__deserialize_file(response_data) elif response_type is not None: match = None - content_type = response_data.getheader("content-type") + content_type = response_data.getheader('content-type') if content_type is not None: match = re.search(r"charset=([a-zA-Z\-\d]+)[\s;]?", content_type) encoding = match.group(1) if match else "utf-8" @@ -298,10 +326,10 @@ def response_deserialize( ) return ApiResponse( - status_code=response_data.status, - data=return_data, - headers=response_data.getheaders(), - raw_data=response_data.data, + status_code = response_data.status, + data = return_data, + headers = response_data.getheaders(), + raw_data = response_data.data ) def sanitize_for_serialization(self, obj): @@ -329,9 +357,13 @@ def sanitize_for_serialization(self, obj): elif isinstance(obj, self.PRIMITIVE_TYPES): return obj elif isinstance(obj, list): - return [self.sanitize_for_serialization(sub_obj) for sub_obj in obj] + return [ + self.sanitize_for_serialization(sub_obj) for sub_obj in obj + ] elif isinstance(obj, tuple): - return tuple(self.sanitize_for_serialization(sub_obj) for sub_obj in obj) + return tuple( + self.sanitize_for_serialization(sub_obj) for sub_obj in obj + ) elif isinstance(obj, (datetime.datetime, datetime.date)): return obj.isoformat() elif isinstance(obj, decimal.Decimal): @@ -345,12 +377,15 @@ def sanitize_for_serialization(self, obj): # and attributes which value is not None. # Convert attribute name to json key in # model definition for request. 
- if hasattr(obj, "to_dict") and callable(getattr(obj, "to_dict")): + if hasattr(obj, 'to_dict') and callable(getattr(obj, 'to_dict')): obj_dict = obj.to_dict() else: obj_dict = obj.__dict__ - return {key: self.sanitize_for_serialization(val) for key, val in obj_dict.items()} + return { + key: self.sanitize_for_serialization(val) + for key, val in obj_dict.items() + } def deserialize(self, response_text: str, response_type: str, content_type: Optional[str]): """Deserializes response into an object. @@ -369,15 +404,18 @@ def deserialize(self, response_text: str, response_type: str, content_type: Opti data = json.loads(response_text) except ValueError: data = response_text - elif re.match(r"^application/(json|[\w!#$&.+-^_]+\+json)\s*(;|$)", content_type, re.IGNORECASE): + elif re.match(r'^application/(json|[\w!#$&.+-^_]+\+json)\s*(;|$)', content_type, re.IGNORECASE): if response_text == "": data = "" else: data = json.loads(response_text) - elif re.match(r"^text\/[a-z.+-]+\s*(;|$)", content_type, re.IGNORECASE): + elif re.match(r'^text\/[a-z.+-]+\s*(;|$)', content_type, re.IGNORECASE): data = response_text else: - raise ApiException(status=0, reason="Unsupported content type: {0}".format(content_type)) + raise ApiException( + status=0, + reason="Unsupported content type: {0}".format(content_type) + ) return self.__deserialize(data, response_type) @@ -393,17 +431,19 @@ def __deserialize(self, data, klass): return None if isinstance(klass, str): - if klass.startswith("List["): - m = re.match(r"List\[(.*)]", klass) + if klass.startswith('List['): + m = re.match(r'List\[(.*)]', klass) assert m is not None, "Malformed List type definition" sub_kls = m.group(1) - return [self.__deserialize(sub_data, sub_kls) for sub_data in data] + return [self.__deserialize(sub_data, sub_kls) + for sub_data in data] - if klass.startswith("Dict["): - m = re.match(r"Dict\[([^,]*), (.*)]", klass) + if klass.startswith('Dict['): + m = re.match(r'Dict\[([^,]*), (.*)]', klass) assert m is not 
None, "Malformed Dict type definition" sub_kls = m.group(2) - return {k: self.__deserialize(v, sub_kls) for k, v in data.items()} + return {k: self.__deserialize(v, sub_kls) + for k, v in data.items()} # convert str to class if klass in self.NATIVE_TYPES_MAPPING: @@ -439,18 +479,19 @@ def parameters_to_tuples(self, params, collection_formats): for k, v in params.items() if isinstance(params, dict) else params: if k in collection_formats: collection_format = collection_formats[k] - if collection_format == "multi": + if collection_format == 'multi': new_params.extend((k, value) for value in v) else: - if collection_format == "ssv": - delimiter = " " - elif collection_format == "tsv": - delimiter = "\t" - elif collection_format == "pipes": - delimiter = "|" + if collection_format == 'ssv': + delimiter = ' ' + elif collection_format == 'tsv': + delimiter = '\t' + elif collection_format == 'pipes': + delimiter = '|' else: # csv is the default - delimiter = "," - new_params.append((k, delimiter.join(str(value) for value in v))) + delimiter = ',' + new_params.append( + (k, delimiter.join(str(value) for value in v))) else: new_params.append((k, v)) return new_params @@ -475,18 +516,20 @@ def parameters_to_url_query(self, params, collection_formats): if k in collection_formats: collection_format = collection_formats[k] - if collection_format == "multi": + if collection_format == 'multi': new_params.extend((k, str(value)) for value in v) else: - if collection_format == "ssv": - delimiter = " " - elif collection_format == "tsv": - delimiter = "\t" - elif collection_format == "pipes": - delimiter = "|" + if collection_format == 'ssv': + delimiter = ' ' + elif collection_format == 'tsv': + delimiter = '\t' + elif collection_format == 'pipes': + delimiter = '|' else: # csv is the default - delimiter = "," - new_params.append((k, delimiter.join(quote(str(value)) for value in v))) + delimiter = ',' + new_params.append( + (k, delimiter.join(quote(str(value)) for value in v)) + ) 
else: new_params.append((k, quote(str(v)))) @@ -504,7 +547,7 @@ def files_parameters( params = [] for k, v in files.items(): if isinstance(v, str): - with open(v, "rb") as f: + with open(v, 'rb') as f: filename = os.path.basename(f.name) filedata = f.read() elif isinstance(v, bytes): @@ -518,8 +561,13 @@ def files_parameters( continue else: raise ValueError("Unsupported file value") - mimetype = mimetypes.guess_type(filename)[0] or "application/octet-stream" - params.append(tuple([k, tuple([filename, filedata, mimetype])])) + mimetype = ( + mimetypes.guess_type(filename)[0] + or 'application/octet-stream' + ) + params.append( + tuple([k, tuple([filename, filedata, mimetype])]) + ) return params def select_header_accept(self, accepts: List[str]) -> Optional[str]: @@ -532,7 +580,7 @@ def select_header_accept(self, accepts: List[str]) -> Optional[str]: return None for accept in accepts: - if re.search("json", accept, re.IGNORECASE): + if re.search('json', accept, re.IGNORECASE): return accept return accepts[0] @@ -547,13 +595,20 @@ def select_header_content_type(self, content_types): return None for content_type in content_types: - if re.search("json", content_type, re.IGNORECASE): + if re.search('json', content_type, re.IGNORECASE): return content_type return content_types[0] def update_params_for_auth( - self, headers, queries, auth_settings, resource_path, method, body, request_auth=None + self, + headers, + queries, + auth_settings, + resource_path, + method, + body, + request_auth=None ) -> None: """Updates header and query params based on authentication setting. 
@@ -571,14 +626,36 @@ def update_params_for_auth( return if request_auth: - self._apply_auth_params(headers, queries, resource_path, method, body, request_auth) + self._apply_auth_params( + headers, + queries, + resource_path, + method, + body, + request_auth + ) else: for auth in auth_settings: auth_setting = self.configuration.auth_settings().get(auth) if auth_setting: - self._apply_auth_params(headers, queries, resource_path, method, body, auth_setting) - - def _apply_auth_params(self, headers, queries, resource_path, method, body, auth_setting) -> None: + self._apply_auth_params( + headers, + queries, + resource_path, + method, + body, + auth_setting + ) + + def _apply_auth_params( + self, + headers, + queries, + resource_path, + method, + body, + auth_setting + ) -> None: """Updates the request parameters based on a single auth_setting :param headers: Header parameters dict to be updated. @@ -589,15 +666,17 @@ def _apply_auth_params(self, headers, queries, resource_path, method, body, auth The object type is the return value of sanitize_for_serialization(). 
:param auth_setting: auth settings for the endpoint """ - if auth_setting["in"] == "cookie": - headers["Cookie"] = auth_setting["value"] - elif auth_setting["in"] == "header": - if auth_setting["type"] != "http-signature": - headers[auth_setting["key"]] = auth_setting["value"] - elif auth_setting["in"] == "query": - queries.append((auth_setting["key"], auth_setting["value"])) + if auth_setting['in'] == 'cookie': + headers['Cookie'] = auth_setting['value'] + elif auth_setting['in'] == 'header': + if auth_setting['type'] != 'http-signature': + headers[auth_setting['key']] = auth_setting['value'] + elif auth_setting['in'] == 'query': + queries.append((auth_setting['key'], auth_setting['value'])) else: - raise ApiValueError("Authentication token must be in `query` or `header`") + raise ApiValueError( + 'Authentication token must be in `query` or `header`' + ) def __deserialize_file(self, response): """Deserializes body to file @@ -617,7 +696,10 @@ def __deserialize_file(self, response): content_disposition = response.getheader("Content-Disposition") if content_disposition: - m = re.search(r'filename=[\'"]?([^\'"\s]+)[\'"]?', content_disposition) + m = re.search( + r'filename=[\'"]?([^\'"\s]+)[\'"]?', + content_disposition + ) assert m is not None, "Unexpected 'content-disposition' header value" filename = m.group(1) path = os.path.join(os.path.dirname(path), filename) @@ -660,7 +742,10 @@ def __deserialize_date(self, string): except ImportError: return string except ValueError: - raise rest.ApiException(status=0, reason="Failed to parse `{0}` as date object".format(string)) + raise rest.ApiException( + status=0, + reason="Failed to parse `{0}` as date object".format(string) + ) def __deserialize_datetime(self, string): """Deserializes string to datetime. 
@@ -675,7 +760,13 @@ def __deserialize_datetime(self, string): except ImportError: return string except ValueError: - raise rest.ApiException(status=0, reason=("Failed to parse `{0}` as datetime object".format(string))) + raise rest.ApiException( + status=0, + reason=( + "Failed to parse `{0}` as datetime object" + .format(string) + ) + ) def __deserialize_enum(self, data, klass): """Deserializes primitive type to enum. @@ -687,7 +778,13 @@ def __deserialize_enum(self, data, klass): try: return klass(data) except ValueError: - raise rest.ApiException(status=0, reason=("Failed to parse `{0}` as `{1}`".format(data, klass))) + raise rest.ApiException( + status=0, + reason=( + "Failed to parse `{0}` as `{1}`" + .format(data, klass) + ) + ) def __deserialize_model(self, data, klass): """Deserializes list or dict to model. diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_response.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_response.py index ca801da..9bc7c11 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_response.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_response.py @@ -1,14 +1,11 @@ """API response object.""" from __future__ import annotations - -from typing import Generic, Mapping, Optional, TypeVar - -from pydantic import BaseModel, Field, StrictBytes, StrictInt +from typing import Optional, Generic, Mapping, TypeVar +from pydantic import Field, StrictInt, StrictBytes, BaseModel T = TypeVar("T") - class ApiResponse(BaseModel, Generic[T]): """ API response object @@ -19,4 +16,6 @@ class ApiResponse(BaseModel, Generic[T]): data: T = Field(description="Deserialized data given the data type") raw_data: StrictBytes = Field(description="Raw data (HTTP response body)") - model_config = {"arbitrary_types_allowed": True} + model_config = { + "arbitrary_types_allowed": True + } diff --git 
a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/configuration.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/configuration.py index de102b2..0b76ea2 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/configuration.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/configuration.py @@ -1,41 +1,33 @@ # coding: utf-8 """ -extractor-api-lib + extractor-api-lib -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. """ # noqa: E501 import copy -import http.client as httplib import logging +from logging import FileHandler import multiprocessing import sys -from logging import FileHandler from typing import Optional - import urllib3 +import http.client as httplib + JSON_SCHEMA_VALIDATION_KEYWORDS = { - "multipleOf", - "maximum", - "exclusiveMaximum", - "minimum", - "exclusiveMinimum", - "maxLength", - "minLength", - "pattern", - "maxItems", - "minItems", + 'multipleOf', 'maximum', 'exclusiveMaximum', + 'minimum', 'exclusiveMinimum', 'maxLength', + 'minLength', 'pattern', 'maxItems', 'minItems' } - class Configuration: """This class contains various settings of the API client. 
@@ -71,25 +63,20 @@ class Configuration: _default = None - def __init__( - self, - host=None, - api_key=None, - api_key_prefix=None, - username=None, - password=None, - access_token=None, - server_index=None, - server_variables=None, - server_operation_index=None, - server_operation_variables=None, - ignore_operation_servers=False, - ssl_ca_cert=None, - retries=None, - *, - debug: Optional[bool] = None - ) -> None: - """Constructor""" + def __init__(self, host=None, + api_key=None, api_key_prefix=None, + username=None, password=None, + access_token=None, + server_index=None, server_variables=None, + server_operation_index=None, server_operation_variables=None, + ignore_operation_servers=False, + ssl_ca_cert=None, + retries=None, + *, + debug: Optional[bool] = None + ) -> None: + """Constructor + """ self._base_path = "http://localhost" if host is None else host """Default Base url """ @@ -135,7 +122,7 @@ def __init__( """ self.logger["package_logger"] = logging.getLogger("admin_api_lib.extractor_api_client.openapi_client") self.logger["urllib3_logger"] = logging.getLogger("urllib3") - self.logger_format = "%(asctime)s %(levelname)s %(message)s" + self.logger_format = '%(asctime)s %(levelname)s %(message)s' """Log format """ self.logger_stream_handler = None @@ -190,7 +177,7 @@ def __init__( self.proxy_headers = None """Proxy headers """ - self.safe_chars_for_path_param = "" + self.safe_chars_for_path_param = '' """Safe chars for path_param """ self.retries = retries @@ -216,7 +203,7 @@ def __deepcopy__(self, memo): result = cls.__new__(cls) memo[id(self)] = result for k, v in self.__dict__.items(): - if k not in ("logger", "logger_file_handler"): + if k not in ('logger', 'logger_file_handler'): setattr(result, k, copy.deepcopy(v, memo)) # shallow copy of loggers result.logger = copy.copy(self.logger) @@ -376,7 +363,9 @@ def get_basic_auth_token(self): password = "" if self.password is not None: password = self.password - return 
urllib3.util.make_headers(basic_auth=username + ":" + password).get("authorization") + return urllib3.util.make_headers( + basic_auth=username + ':' + password + ).get('authorization') def auth_settings(self): """Gets Auth Settings dict for api client. @@ -391,13 +380,12 @@ def to_debug_report(self): :return: The report for debugging. """ - return ( - "Python SDK Debug Report:\n" - "OS: {env}\n" - "Python Version: {pyversion}\n" - "Version of the API: 1.0.0\n" - "SDK Package Version: 1.0.0".format(env=sys.platform, pyversion=sys.version) - ) + return "Python SDK Debug Report:\n"\ + "OS: {env}\n"\ + "Python Version: {pyversion}\n"\ + "Version of the API: 1.0.0\n"\ + "SDK Package Version: 1.0.0".\ + format(env=sys.platform, pyversion=sys.version) def get_host_settings(self): """Gets an array of host settings @@ -406,8 +394,8 @@ def get_host_settings(self): """ return [ { - "url": "", - "description": "No description provided", + 'url': "", + 'description': "No description provided", } ] @@ -429,20 +417,22 @@ def get_host_from_settings(self, index, variables=None, servers=None): except IndexError: raise ValueError( "Invalid index {0} when selecting the host settings. " - "Must be less than {1}".format(index, len(servers)) - ) + "Must be less than {1}".format(index, len(servers))) - url = server["url"] + url = server['url'] # go through variables and replace placeholders - for variable_name, variable in server.get("variables", {}).items(): - used_value = variables.get(variable_name, variable["default_value"]) + for variable_name, variable in server.get('variables', {}).items(): + used_value = variables.get( + variable_name, variable['default_value']) - if "enum_values" in variable and used_value not in variable["enum_values"]: + if 'enum_values' in variable \ + and used_value not in variable['enum_values']: raise ValueError( "The variable `{0}` in the host URL has invalid value " - "{1}. 
Must be {2}.".format(variable_name, variables[variable_name], variable["enum_values"]) - ) + "{1}. Must be {2}.".format( + variable_name, variables[variable_name], + variable['enum_values'])) url = url.replace("{" + variable_name + "}", used_value) diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/exceptions.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/exceptions.py index 877d8be..a5adf00 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/exceptions.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/exceptions.py @@ -1,28 +1,27 @@ # coding: utf-8 """ -extractor-api-lib + extractor-api-lib -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. 
""" # noqa: E501 from typing import Any, Optional - from typing_extensions import Self - class OpenApiException(Exception): """The base exception class for all OpenAPIExceptions""" class ApiTypeError(OpenApiException, TypeError): - def __init__(self, msg, path_to_item=None, valid_classes=None, key_type=None) -> None: - """Raises an exception for TypeErrors + def __init__(self, msg, path_to_item=None, valid_classes=None, + key_type=None) -> None: + """ Raises an exception for TypeErrors Args: msg (str): the exception message @@ -103,10 +102,11 @@ def __init__(self, msg, path_to_item=None) -> None: class ApiException(OpenApiException): + def __init__( - self, - status=None, - reason=None, + self, + status=None, + reason=None, http_resp=None, *, body: Optional[str] = None, @@ -125,17 +125,17 @@ def __init__( self.reason = http_resp.reason if self.body is None: try: - self.body = http_resp.data.decode("utf-8") + self.body = http_resp.data.decode('utf-8') except Exception: pass self.headers = http_resp.getheaders() @classmethod def from_response( - cls, - *, - http_resp, - body: Optional[str], + cls, + *, + http_resp, + body: Optional[str], data: Optional[Any], ) -> Self: if http_resp.status == 400: @@ -156,9 +156,11 @@ def from_response( def __str__(self): """Custom error messages for exception""" - error_message = "({0})\n" "Reason: {1}\n".format(self.status, self.reason) + error_message = "({0})\n"\ + "Reason: {1}\n".format(self.status, self.reason) if self.headers: - error_message += "HTTP response headers: {0}\n".format(self.headers) + error_message += "HTTP response headers: {0}\n".format( + self.headers) if self.data or self.body: error_message += "HTTP response body: {0}\n".format(self.data or self.body) diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/__init__.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/__init__.py index 4301aed..022896f 100644 --- 
a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/__init__.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/__init__.py @@ -2,30 +2,19 @@ # flake8: noqa """ -extractor-api-lib + extractor-api-lib -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. """ # noqa: E501 # import models into model package -from admin_api_lib.extractor_api_client.openapi_client.models.confluence_parameters import ( - ConfluenceParameters, -) -from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ( - ContentType, -) -from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ( - ExtractionRequest, -) -from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import ( - InformationPiece, -) -from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import ( - KeyValuePair, -) +from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ContentType +from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ExtractionRequest +from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import InformationPiece +from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/content_type.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/content_type.py index c659e69..b797b12 
100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/content_type.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/content_type.py @@ -1,36 +1,38 @@ # coding: utf-8 """ -extractor-api-lib + extractor-api-lib -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. """ # noqa: E501 from __future__ import annotations - import json from enum import Enum - from typing_extensions import Self class ContentType(str, Enum): - """ """ + """ + + """ """ allowed enum values """ - IMAGE = "IMAGE" - TABLE = "TABLE" - TEXT = "TEXT" + IMAGE = 'IMAGE' + TABLE = 'TABLE' + TEXT = 'TEXT' @classmethod def from_json(cls, json_str: str) -> Self: """Create an instance of ContentType from a JSON string""" return cls(json.loads(json_str)) + + diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py index 393ba17..db65003 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py @@ -1,33 +1,36 @@ # coding: utf-8 """ -extractor-api-lib + extractor-api-lib -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) -The version of 
the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. """ # noqa: E501 from __future__ import annotations - -import json import pprint import re # noqa: F401 -from typing import Any, ClassVar, Dict, List, Optional, Set +import json -from pydantic import BaseModel, ConfigDict, StrictStr +from pydantic import BaseModel, ConfigDict, StrictBytes, StrictStr +from typing import Any, ClassVar, Dict, List, Optional, Tuple, Union +from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair +from typing import Optional, Set from typing_extensions import Self - class ExtractionRequest(BaseModel): - """ """ # noqa: E501 - - path_on_s3: StrictStr - __properties: ClassVar[List[str]] = ["path_on_s3"] + """ + + """ # noqa: E501 + file: Optional[Union[StrictBytes, StrictStr, Tuple[StrictStr, StrictBytes]]] = None + type: StrictStr + kwargs: Optional[List[KeyValuePair]] = None + __properties: ClassVar[List[str]] = ["file", "type", "kwargs"] model_config = ConfigDict( populate_by_name=True, @@ -35,13 +38,15 @@ class ExtractionRequest(BaseModel): protected_namespaces=(), ) + def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - return self.model_dump_json(by_alias=True, exclude_unset=True) + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) @classmethod def from_json(cls, json_str: str) -> Optional[Self]: @@ -58,13 +63,21 @@ def to_dict(self) -> Dict[str, Any]: were set at model initialization. Other fields with value `None` are ignored. 
""" - excluded_fields: Set[str] = set([]) + excluded_fields: Set[str] = set([ + ]) _dict = self.model_dump( by_alias=True, exclude=excluded_fields, exclude_none=True, ) + # override the default output from pydantic by calling `to_dict()` of each item in kwargs (list) + _items = [] + if self.kwargs: + for _item_kwargs in self.kwargs: + if _item_kwargs: + _items.append(_item_kwargs.to_dict()) + _dict['kwargs'] = _items return _dict @classmethod @@ -76,5 +89,11 @@ def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({"path_on_s3": obj.get("path_on_s3")}) + _obj = cls.model_validate({ + "file": obj.get("file"), + "type": obj.get("type"), + "kwargs": [KeyValuePair.from_dict(_item) for _item in obj["kwargs"]] if obj.get("kwargs") is not None else None + }) return _obj + + diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/information_piece.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/information_piece.py index a6d6c08..95a0fdb 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/information_piece.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/information_piece.py @@ -1,40 +1,33 @@ # coding: utf-8 """ -extractor-api-lib + extractor-api-lib -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. 
""" # noqa: E501 from __future__ import annotations - -import json import pprint import re # noqa: F401 -from typing import Any, ClassVar, Dict, List, Optional, Set +import json from pydantic import BaseModel, ConfigDict, StrictStr +from typing import Any, ClassVar, Dict, List +from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ContentType +from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair +from typing import Optional, Set from typing_extensions import Self -from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ( - ContentType, -) -from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import ( - KeyValuePair, -) - - class InformationPiece(BaseModel): """ A piece of information that has been extracted. - """ # noqa: E501 - + """ # noqa: E501 metadata: List[KeyValuePair] page_content: StrictStr type: ContentType @@ -46,13 +39,15 @@ class InformationPiece(BaseModel): protected_namespaces=(), ) + def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - return self.model_dump_json(by_alias=True, exclude_unset=True) + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) @classmethod def from_json(cls, json_str: str) -> Optional[Self]: @@ -69,7 +64,8 @@ def to_dict(self) -> Dict[str, Any]: were set at model initialization. Other fields with value `None` are ignored. 
""" - excluded_fields: Set[str] = set([]) + excluded_fields: Set[str] = set([ + ]) _dict = self.model_dump( by_alias=True, @@ -82,7 +78,7 @@ def to_dict(self) -> Dict[str, Any]: for _item_metadata in self.metadata: if _item_metadata: _items.append(_item_metadata.to_dict()) - _dict["metadata"] = _items + _dict['metadata'] = _items return _dict @classmethod @@ -94,15 +90,11 @@ def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate( - { - "metadata": ( - [KeyValuePair.from_dict(_item) for _item in obj["metadata"]] - if obj.get("metadata") is not None - else None - ), - "page_content": obj.get("page_content"), - "type": obj.get("type"), - } - ) + _obj = cls.model_validate({ + "metadata": [KeyValuePair.from_dict(_item) for _item in obj["metadata"]] if obj.get("metadata") is not None else None, + "page_content": obj.get("page_content"), + "type": obj.get("type") + }) return _obj + + diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/key_value_pair.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/key_value_pair.py index 80629a9..553288b 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/key_value_pair.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/key_value_pair.py @@ -1,31 +1,31 @@ # coding: utf-8 """ -extractor-api-lib + extractor-api-lib -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. 
+ Do not edit the class manually. """ # noqa: E501 from __future__ import annotations - -import json import pprint import re # noqa: F401 -from typing import Any, ClassVar, Dict, List, Optional, Set +import json from pydantic import BaseModel, ConfigDict +from typing import Any, ClassVar, Dict, List, Optional +from typing import Optional, Set from typing_extensions import Self - class KeyValuePair(BaseModel): - """ """ # noqa: E501 - + """ + + """ # noqa: E501 key: Optional[Any] = None value: Optional[Any] = None __properties: ClassVar[List[str]] = ["key", "value"] @@ -36,13 +36,15 @@ class KeyValuePair(BaseModel): protected_namespaces=(), ) + def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - return self.model_dump_json(by_alias=True, exclude_unset=True) + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) @classmethod def from_json(cls, json_str: str) -> Optional[Self]: @@ -59,7 +61,8 @@ def to_dict(self) -> Dict[str, Any]: were set at model initialization. Other fields with value `None` are ignored. 
""" - excluded_fields: Set[str] = set([]) + excluded_fields: Set[str] = set([ + ]) _dict = self.model_dump( by_alias=True, @@ -69,12 +72,12 @@ def to_dict(self) -> Dict[str, Any]: # set to None if key (nullable) is None # and model_fields_set contains the field if self.key is None and "key" in self.model_fields_set: - _dict["key"] = None + _dict['key'] = None # set to None if value (nullable) is None # and model_fields_set contains the field if self.value is None and "value" in self.model_fields_set: - _dict["value"] = None + _dict['value'] = None return _dict @@ -87,5 +90,10 @@ def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({"key": obj.get("key"), "value": obj.get("value")}) + _obj = cls.model_validate({ + "key": obj.get("key"), + "value": obj.get("value") + }) return _obj + + diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/rest.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/rest.py index 09f1e39..32b1c3a 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/rest.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/rest.py @@ -1,14 +1,14 @@ # coding: utf-8 """ -extractor-api-lib + extractor-api-lib -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. 
""" # noqa: E501 @@ -19,10 +19,7 @@ import urllib3 -from admin_api_lib.extractor_api_client.openapi_client.exceptions import ( - ApiException, - ApiValueError, -) +from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiException, ApiValueError SUPPORTED_SOCKS_PROXIES = {"socks5", "socks5h", "socks4", "socks4a"} RESTResponseType = urllib3.HTTPResponse @@ -39,6 +36,7 @@ def is_socks_proxy_url(url): class RESTResponse(io.IOBase): + def __init__(self, resp) -> None: self.response = resp self.status = resp.status @@ -60,6 +58,7 @@ def getheader(self, name, default=None): class RESTClientObject: + def __init__(self, configuration) -> None: # urllib3.PoolManager will pass all kw parameters to connectionpool # https://github.com/shazow/urllib3/blob/f9409436f83aeb79fbaf090181cd81b784f1b8ce/urllib3/poolmanager.py#L75 # noqa: E501 @@ -79,19 +78,22 @@ def __init__(self, configuration) -> None: "key_file": configuration.key_file, } if configuration.assert_hostname is not None: - pool_args["assert_hostname"] = configuration.assert_hostname + pool_args['assert_hostname'] = ( + configuration.assert_hostname + ) if configuration.retries is not None: - pool_args["retries"] = configuration.retries + pool_args['retries'] = configuration.retries if configuration.tls_server_name: - pool_args["server_hostname"] = configuration.tls_server_name + pool_args['server_hostname'] = configuration.tls_server_name + if configuration.socket_options is not None: - pool_args["socket_options"] = configuration.socket_options + pool_args['socket_options'] = configuration.socket_options if configuration.connection_pool_maxsize is not None: - pool_args["maxsize"] = configuration.connection_pool_maxsize + pool_args['maxsize'] = configuration.connection_pool_maxsize # https pool manager self.pool_manager: urllib3.PoolManager @@ -99,7 +101,6 @@ def __init__(self, configuration) -> None: if configuration.proxy: if is_socks_proxy_url(configuration.proxy): from urllib3.contrib.socks import 
SOCKSProxyManager - pool_args["proxy_url"] = configuration.proxy pool_args["headers"] = configuration.proxy_headers self.pool_manager = SOCKSProxyManager(**pool_args) @@ -110,7 +111,15 @@ def __init__(self, configuration) -> None: else: self.pool_manager = urllib3.PoolManager(**pool_args) - def request(self, method, url, headers=None, body=None, post_params=None, _request_timeout=None): + def request( + self, + method, + url, + headers=None, + body=None, + post_params=None, + _request_timeout=None + ): """Perform requests. :param method: http request method @@ -126,10 +135,20 @@ def request(self, method, url, headers=None, body=None, post_params=None, _reque (connection, read) timeouts. """ method = method.upper() - assert method in ["GET", "HEAD", "DELETE", "POST", "PUT", "PATCH", "OPTIONS"] + assert method in [ + 'GET', + 'HEAD', + 'DELETE', + 'POST', + 'PUT', + 'PATCH', + 'OPTIONS' + ] if post_params and body: - raise ApiValueError("body parameter cannot be used with post_params parameter.") + raise ApiValueError( + "body parameter cannot be used with post_params parameter." 
+ ) post_params = post_params or {} headers = headers or {} @@ -138,22 +157,37 @@ def request(self, method, url, headers=None, body=None, post_params=None, _reque if _request_timeout: if isinstance(_request_timeout, (int, float)): timeout = urllib3.Timeout(total=_request_timeout) - elif isinstance(_request_timeout, tuple) and len(_request_timeout) == 2: - timeout = urllib3.Timeout(connect=_request_timeout[0], read=_request_timeout[1]) + elif ( + isinstance(_request_timeout, tuple) + and len(_request_timeout) == 2 + ): + timeout = urllib3.Timeout( + connect=_request_timeout[0], + read=_request_timeout[1] + ) try: # For `POST`, `PUT`, `PATCH`, `OPTIONS`, `DELETE` - if method in ["POST", "PUT", "PATCH", "OPTIONS", "DELETE"]: + if method in ['POST', 'PUT', 'PATCH', 'OPTIONS', 'DELETE']: + # no content type provided or payload is json - content_type = headers.get("Content-Type") - if not content_type or re.search("json", content_type, re.IGNORECASE): + content_type = headers.get('Content-Type') + if ( + not content_type + or re.search('json', content_type, re.IGNORECASE) + ): request_body = None if body is not None: request_body = json.dumps(body) r = self.pool_manager.request( - method, url, body=request_body, timeout=timeout, headers=headers, preload_content=False + method, + url, + body=request_body, + timeout=timeout, + headers=headers, + preload_content=False ) - elif content_type == "application/x-www-form-urlencoded": + elif content_type == 'application/x-www-form-urlencoded': r = self.pool_manager.request( method, url, @@ -161,15 +195,15 @@ def request(self, method, url, headers=None, body=None, post_params=None, _reque encode_multipart=False, timeout=timeout, headers=headers, - preload_content=False, + preload_content=False ) - elif content_type == "multipart/form-data": + elif content_type == 'multipart/form-data': # must del headers['Content-Type'], or the correct # Content-Type which generated by urllib3 will be # overwritten. 
- del headers["Content-Type"] + del headers['Content-Type'] # Ensures that dict objects are serialized - post_params = [(a, json.dumps(b)) if isinstance(b, dict) else (a, b) for a, b in post_params] + post_params = [(a, json.dumps(b)) if isinstance(b, dict) else (a,b) for a, b in post_params] r = self.pool_manager.request( method, url, @@ -177,20 +211,29 @@ def request(self, method, url, headers=None, body=None, post_params=None, _reque encode_multipart=True, timeout=timeout, headers=headers, - preload_content=False, + preload_content=False ) # Pass a `string` parameter directly in the body to support # other content types than JSON when `body` argument is # provided in serialized form. elif isinstance(body, str) or isinstance(body, bytes): r = self.pool_manager.request( - method, url, body=body, timeout=timeout, headers=headers, preload_content=False + method, + url, + body=body, + timeout=timeout, + headers=headers, + preload_content=False ) - elif headers["Content-Type"].startswith("text/") and isinstance(body, bool): + elif headers['Content-Type'].startswith('text/') and isinstance(body, bool): request_body = "true" if body else "false" r = self.pool_manager.request( - method, url, body=request_body, preload_content=False, timeout=timeout, headers=headers - ) + method, + url, + body=request_body, + preload_content=False, + timeout=timeout, + headers=headers) else: # Cannot generate the request from given parameters msg = """Cannot prepare a request message for provided @@ -200,7 +243,12 @@ def request(self, method, url, headers=None, body=None, post_params=None, _reque # For `GET`, `HEAD` else: r = self.pool_manager.request( - method, url, fields={}, timeout=timeout, headers=headers, preload_content=False + method, + url, + fields={}, + timeout=timeout, + headers=headers, + preload_content=False ) except urllib3.exceptions.SSLError as e: msg = "\n".join([type(e).__name__, str(e)]) diff --git 
a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_content_type.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_content_type.py new file mode 100644 index 0000000..9704fc8 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_content_type.py @@ -0,0 +1,33 @@ +# coding: utf-8 + +""" + extractor-api-lib + + No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) + + Do not edit the class manually. +""" # noqa: E501 + + +import unittest + +from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ContentType + +class TestContentType(unittest.TestCase): + """ContentType unit test stubs""" + + def setUp(self): + pass + + def tearDown(self): + pass + + def testContentType(self): + """Test ContentType""" + # inst = ContentType() + +if __name__ == '__main__': + unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_request.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_request.py new file mode 100644 index 0000000..fd48e16 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_request.py @@ -0,0 +1,56 @@ +# coding: utf-8 + +""" + extractor-api-lib + + No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) + + Do not edit the class manually. 
+""" # noqa: E501 + + +import unittest + +from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ExtractionRequest + +class TestExtractionRequest(unittest.TestCase): + """ExtractionRequest unit test stubs""" + + def setUp(self): + pass + + def tearDown(self): + pass + + def make_instance(self, include_optional) -> ExtractionRequest: + """Test ExtractionRequest + include_optional is a boolean, when False only required + params are included, when True both required and + optional params are included """ + # uncomment below to create an instance of `ExtractionRequest` + """ + model = ExtractionRequest() + if include_optional: + return ExtractionRequest( + file = bytes(b'blah'), + type = '', + kwargs = [ + {"value":"value","key":"key"} + ] + ) + else: + return ExtractionRequest( + type = '', + ) + """ + + def testExtractionRequest(self): + """Test ExtractionRequest""" + # inst_req_only = self.make_instance(include_optional=False) + # inst_req_and_optional = self.make_instance(include_optional=True) + +if __name__ == '__main__': + unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extractor_api.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extractor_api.py new file mode 100644 index 0000000..e76b68d --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extractor_api.py @@ -0,0 +1,37 @@ +# coding: utf-8 + +""" + extractor-api-lib + + No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) + + Do not edit the class manually. 
+""" # noqa: E501 + + +import unittest + +from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi + + +class TestExtractorApi(unittest.TestCase): + """ExtractorApi unit test stubs""" + + def setUp(self) -> None: + self.api = ExtractorApi() + + def tearDown(self) -> None: + pass + + def test_extract_from_file_post(self) -> None: + """Test case for extract_from_file_post + + """ + pass + + +if __name__ == '__main__': + unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_information_piece.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_information_piece.py new file mode 100644 index 0000000..0661af0 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_information_piece.py @@ -0,0 +1,60 @@ +# coding: utf-8 + +""" + extractor-api-lib + + No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) + + Do not edit the class manually. 
+""" # noqa: E501 + + +import unittest + +from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import InformationPiece + +class TestInformationPiece(unittest.TestCase): + """InformationPiece unit test stubs""" + + def setUp(self): + pass + + def tearDown(self): + pass + + def make_instance(self, include_optional) -> InformationPiece: + """Test InformationPiece + include_optional is a boolean, when False only required + params are included, when True both required and + optional params are included """ + # uncomment below to create an instance of `InformationPiece` + """ + model = InformationPiece() + if include_optional: + return InformationPiece( + metadata = [ + {"value":"value","key":"key"} + ], + page_content = '', + type = 'IMAGE' + ) + else: + return InformationPiece( + metadata = [ + {"value":"value","key":"key"} + ], + page_content = '', + type = 'IMAGE', + ) + """ + + def testInformationPiece(self): + """Test InformationPiece""" + # inst_req_only = self.make_instance(include_optional=False) + # inst_req_and_optional = self.make_instance(include_optional=True) + +if __name__ == '__main__': + unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_key_value_pair.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_key_value_pair.py new file mode 100644 index 0000000..695ebb9 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_key_value_pair.py @@ -0,0 +1,52 @@ +# coding: utf-8 + +""" + extractor-api-lib + + No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) + + Do not edit the class manually. 
+""" # noqa: E501 + + +import unittest + +from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair + +class TestKeyValuePair(unittest.TestCase): + """KeyValuePair unit test stubs""" + + def setUp(self): + pass + + def tearDown(self): + pass + + def make_instance(self, include_optional) -> KeyValuePair: + """Test KeyValuePair + include_optional is a boolean, when False only required + params are included, when True both required and + optional params are included """ + # uncomment below to create an instance of `KeyValuePair` + """ + model = KeyValuePair() + if include_optional: + return KeyValuePair( + key = None, + value = None + ) + else: + return KeyValuePair( + ) + """ + + def testKeyValuePair(self): + """Test KeyValuePair""" + # inst_req_only = self.make_instance(include_optional=False) + # inst_req_and_optional = self.make_instance(include_optional=True) + +if __name__ == '__main__': + unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/models/document_status.py b/admin-api-lib/src/admin_api_lib/models/document_status.py index e379f85..d00dfce 100644 --- a/admin-api-lib/src/admin_api_lib/models/document_status.py +++ b/admin-api-lib/src/admin_api_lib/models/document_status.py @@ -1,37 +1,37 @@ # coding: utf-8 """ -admin-api-lib + admin-api-lib -The API is used for the communication between the admin frontend and the admin backend in the rag project. + The API is used for the communication between the admin frontend and the admin backend in the rag project. -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. 
""" # noqa: E501 from __future__ import annotations - -import json import pprint import re # noqa: F401 -from typing import Any, ClassVar, Dict, List +import json -from pydantic import BaseModel, ConfigDict, StrictStr -from admin_api_lib.models.status import Status + +from pydantic import BaseModel, ConfigDict, StrictStr +from typing import Any, ClassVar, Dict, List +from admin_api_lib.models.status import Status try: from typing import Self except ImportError: from typing_extensions import Self - class DocumentStatus(BaseModel): - """ """ # noqa: E501 - + """ + + """ # noqa: E501 name: StrictStr status: Status __properties: ClassVar[List[str]] = ["name", "status"] @@ -42,13 +42,15 @@ class DocumentStatus(BaseModel): "protected_namespaces": (), } + def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - return self.model_dump_json(by_alias=True, exclude_unset=True) + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) @classmethod def from_json(cls, json_str: str) -> Self: @@ -67,7 +69,8 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={}, + exclude={ + }, exclude_none=True, ) return _dict @@ -81,5 +84,10 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({"name": obj.get("name"), "status": obj.get("status")}) + _obj = cls.model_validate({ + "name": obj.get("name"), + "status": obj.get("status") + }) return _obj + + diff --git a/admin-api-lib/src/admin_api_lib/models/extra_models.py b/admin-api-lib/src/admin_api_lib/models/extra_models.py index f0588d2..a3a283f 100644 --- a/admin-api-lib/src/admin_api_lib/models/extra_models.py +++ b/admin-api-lib/src/admin_api_lib/models/extra_models.py @@ -2,7 
+2,6 @@ from pydantic import BaseModel - class TokenModel(BaseModel): """Defines a token model.""" diff --git a/admin-api-lib/src/admin_api_lib/models/key_value_pair.py b/admin-api-lib/src/admin_api_lib/models/key_value_pair.py new file mode 100644 index 0000000..8419cfa --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/models/key_value_pair.py @@ -0,0 +1,102 @@ +# coding: utf-8 + +""" + admin-api-lib + + The API is used for the communication between the admin frontend and the admin backend in the rag project. + + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) + + Do not edit the class manually. +""" # noqa: E501 + + +from __future__ import annotations +import pprint +import re # noqa: F401 +import json + + + + +from pydantic import BaseModel, ConfigDict +from typing import Any, ClassVar, Dict, List, Optional +try: + from typing import Self +except ImportError: + from typing_extensions import Self + +class KeyValuePair(BaseModel): + """ + + """ # noqa: E501 + key: Optional[Any] = None + value: Optional[Any] = None + __properties: ClassVar[List[str]] = ["key", "value"] + + model_config = { + "populate_by_name": True, + "validate_assignment": True, + "protected_namespaces": (), + } + + + def to_str(self) -> str: + """Returns the string representation of the model using alias""" + return pprint.pformat(self.model_dump(by_alias=True)) + + def to_json(self) -> str: + """Returns the JSON representation of the model using alias""" + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) + + @classmethod + def from_json(cls, json_str: str) -> Self: + """Create an instance of KeyValuePair from a JSON string""" + return cls.from_dict(json.loads(json_str)) + + def to_dict(self) -> Dict[str, Any]: + """Return the dictionary representation of the model using alias. 
+ + This has the following differences from calling pydantic's + `self.model_dump(by_alias=True)`: + + * `None` is only added to the output dict for nullable fields that + were set at model initialization. Other fields with value `None` + are ignored. + """ + _dict = self.model_dump( + by_alias=True, + exclude={ + }, + exclude_none=True, + ) + # set to None if key (nullable) is None + # and model_fields_set contains the field + if self.key is None and "key" in self.model_fields_set: + _dict['key'] = None + + # set to None if value (nullable) is None + # and model_fields_set contains the field + if self.value is None and "value" in self.model_fields_set: + _dict['value'] = None + + return _dict + + @classmethod + def from_dict(cls, obj: Dict) -> Self: + """Create an instance of KeyValuePair from a dict""" + if obj is None: + return None + + if not isinstance(obj, dict): + return cls.model_validate(obj) + + _obj = cls.model_validate({ + "key": obj.get("key"), + "value": obj.get("value") + }) + return _obj + + diff --git a/admin-api-lib/src/admin_api_lib/models/status.py b/admin-api-lib/src/admin_api_lib/models/status.py index 33f8f58..2e0de2c 100644 --- a/admin-api-lib/src/admin_api_lib/models/status.py +++ b/admin-api-lib/src/admin_api_lib/models/status.py @@ -1,24 +1,25 @@ # coding: utf-8 """ -admin-api-lib + admin-api-lib -The API is used for the communication between the admin frontend and the admin backend in the rag project. + The API is used for the communication between the admin frontend and the admin backend in the rag project. -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. 
""" # noqa: E501 from __future__ import annotations - import json import pprint import re # noqa: F401 from enum import Enum + + try: from typing import Self except ImportError: @@ -26,17 +27,21 @@ class Status(str, Enum): - """ """ + """ + + """ """ allowed enum values """ - UPLOADING = "UPLOADING" - PROCESSING = "PROCESSING" - READY = "READY" - ERROR = "ERROR" + UPLOADING = 'UPLOADING' + PROCESSING = 'PROCESSING' + READY = 'READY' + ERROR = 'ERROR' @classmethod def from_json(cls, json_str: str) -> Self: """Create an instance of Status from a JSON string""" return cls(json.loads(json_str)) + + diff --git a/admin-api-lib/src/admin_api_lib/models/upload_source.py b/admin-api-lib/src/admin_api_lib/models/upload_source.py new file mode 100644 index 0000000..f76b987 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/models/upload_source.py @@ -0,0 +1,102 @@ +# coding: utf-8 + +""" + admin-api-lib + + The API is used for the communication between the admin frontend and the admin backend in the rag project. + + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) + + Do not edit the class manually. 
+""" # noqa: E501 + + +from __future__ import annotations +import pprint +import re # noqa: F401 +import json + + + + +from pydantic import BaseModel, ConfigDict, StrictBytes, StrictStr +from typing import Any, ClassVar, Dict, List, Optional, Tuple, Union +from admin_api_lib.models.key_value_pair import KeyValuePair +try: + from typing import Self +except ImportError: + from typing_extensions import Self + +class UploadSource(BaseModel): + """ + + """ # noqa: E501 + file: Optional[Union[StrictBytes, StrictStr, Tuple[StrictStr, StrictBytes]]] = None + type: StrictStr + kwargs: Optional[List[KeyValuePair]] = None + __properties: ClassVar[List[str]] = ["file", "type", "kwargs"] + + model_config = { + "populate_by_name": True, + "validate_assignment": True, + "protected_namespaces": (), + } + + + def to_str(self) -> str: + """Returns the string representation of the model using alias""" + return pprint.pformat(self.model_dump(by_alias=True)) + + def to_json(self) -> str: + """Returns the JSON representation of the model using alias""" + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) + + @classmethod + def from_json(cls, json_str: str) -> Self: + """Create an instance of UploadSource from a JSON string""" + return cls.from_dict(json.loads(json_str)) + + def to_dict(self) -> Dict[str, Any]: + """Return the dictionary representation of the model using alias. + + This has the following differences from calling pydantic's + `self.model_dump(by_alias=True)`: + + * `None` is only added to the output dict for nullable fields that + were set at model initialization. Other fields with value `None` + are ignored. 
+ """ + _dict = self.model_dump( + by_alias=True, + exclude={ + }, + exclude_none=True, + ) + # override the default output from pydantic by calling `to_dict()` of each item in kwargs (list) + _items = [] + if self.kwargs: + for _item in self.kwargs: + if _item: + _items.append(_item.to_dict()) + _dict['kwargs'] = _items + return _dict + + @classmethod + def from_dict(cls, obj: Dict) -> Self: + """Create an instance of UploadSource from a dict""" + if obj is None: + return None + + if not isinstance(obj, dict): + return cls.model_validate(obj) + + _obj = cls.model_validate({ + "file": obj.get("file"), + "type": obj.get("type"), + "kwargs": [KeyValuePair.from_dict(_item) for _item in obj.get("kwargs")] if obj.get("kwargs") is not None else None + }) + return _obj + + diff --git a/extractor-api-lib/openapi.yaml b/extractor-api-lib/openapi.yaml index a6aea27..d949eb7 100644 --- a/extractor-api-lib/openapi.yaml +++ b/extractor-api-lib/openapi.yaml @@ -5,7 +5,7 @@ info: servers: - url: / paths: - /extract_from_file: + /extract: post: operationId: extract_from_file_post requestBody: @@ -29,47 +29,8 @@ paths: description: Something somewhere went terribly wrong. 
tags: - extractor - /extract_from_confluence: - post: - operationId: extract_from_confluence_post - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/confluence_parameters' - required: true - responses: - "200": - content: - application/json: - schema: - items: - $ref: '#/components/schemas/information_piece' - type: array - description: ok - "404": - description: not found - "422": - description: unprocessable entity - "500": - description: internal server error - tags: - - extractor components: schemas: - extraction_request: - description: "" - example: - path_on_s3: path on s3 - properties: - path_on_s3: - description: "" - title: PathOnS3 - type: string - required: - - path_on_s3 - title: ExtractionRequest - type: object key_value_pair: description: "" example: @@ -120,54 +81,25 @@ components: - type title: InformationPiece type: object - confluence_parameters: + extraction_request: description: "" properties: - url: - description: url of the confluence space. - title: url - type: string - token: - description: api key to access confluence. - title: token - type: string - space_key: - description: the space key of the confluence pages. - title: space_key + file: + description: "" + format: binary + title: file type: string - include_attachments: - default: false - description: "whether to include file attachments (e.g., images, documents)\ - \ in the parsed content. Default is `false`." - title: include_attachments - type: boolean - keep_markdown_format: - default: true - description: whether to preserve markdown formatting in the output. Default - is `true`. - title: keep_markdown_format - type: boolean - keep_newlines: - default: true - description: whether to retain newline characters in the output for better - readability. Default is `true`. 
- title: keep_newlines - type: boolean - document_name: - description: The name that will be used to store the confluence db in the - key value db and the vectordatabase (metadata.document). - title: document_name + type: + description: "" + title: type type: string - confluence_kwargs: - description: Additional kwargs like verify_ssl + kwargs: + description: "" items: $ref: '#/components/schemas/key_value_pair' - title: confluence_kwargs + title: kwargs type: array required: - - document_name - - space_key - - token - - url - title: confluence_parameters + - type + title: extraction_request type: object diff --git a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py index 418a666..6246635 100644 --- a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py +++ b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py @@ -1,59 +1,42 @@ -"""Module for the Extractor API.""" - # coding: utf-8 -# noqa: D105 +from typing import Dict, List # noqa: F401 import importlib import pkgutil -from typing import List # noqa: F401 - -from fastapi import APIRouter, Body # noqa: F401 -import extractor_api_lib.impl from extractor_api_lib.apis.extractor_api_base import BaseExtractorApi -from extractor_api_lib.models.confluence_parameters import ConfluenceParameters +import openapi_server.impl + +from fastapi import ( # noqa: F401 + APIRouter, + Body, + Cookie, + Depends, + Form, + Header, + HTTPException, + Path, + Query, + Response, + Security, + status, +) + +from extractor_api_lib.models.extra_models import TokenModel # noqa: F401 +from typing import Any, List from extractor_api_lib.models.extraction_request import ExtractionRequest from extractor_api_lib.models.information_piece import InformationPiece + router = APIRouter() -ns_pkg = extractor_api_lib.impl +ns_pkg = openapi_server.impl for _, name, _ in pkgutil.iter_modules(ns_pkg.__path__, ns_pkg.__name__ + "."): importlib.import_module(name) 
@router.post( - "/extract_from_confluence", - responses={ - 200: {"model": List[InformationPiece], "description": "ok"}, - 404: {"description": "not found"}, - 422: {"description": "unprocessable entity"}, - 500: {"description": "internal server error"}, - }, - tags=["extractor"], - response_model_by_alias=True, -) -async def extract_from_confluence_post( - confluence_parameters: ConfluenceParameters = Body(None, description=""), -) -> List[InformationPiece]: - """ - Extract information from a Confluence space. - - Parameters - ---------- - confluence_parameters : ConfluenceParameters - The parameters required to access and extract information from the Confluence space. - - Returns - ------- - List[InformationPiece] - A list of extracted information pieces from the Confluence space. - """ - return await BaseExtractorApi.subclasses[0]().extract_from_confluence_post(confluence_parameters) - - -@router.post( - "/extract_from_file", + "/extract", responses={ 200: {"model": List[InformationPiece], "description": "List of extracted information."}, 422: {"description": "Body is not a valid PDF."}, @@ -65,17 +48,6 @@ async def extract_from_confluence_post( async def extract_from_file_post( extraction_request: ExtractionRequest = Body(None, description=""), ) -> List[InformationPiece]: - """ - Extract information from a file based on the provided extraction request. - - Parameters - ---------- - extraction_request : ExtractionRequest - The request object containing details about the extraction process. - - Returns - ------- - List[InformationPiece] - A list of extracted information pieces. 
- """ + if not BaseExtractorApi.subclasses: + raise HTTPException(status_code=500, detail="Not implemented") return await BaseExtractorApi.subclasses[0]().extract_from_file_post(extraction_request) diff --git a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py index 8f03f9c..a0b1fb5 100644 --- a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py +++ b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py @@ -1,63 +1,20 @@ -"""Module for the base ExtractorApi interface.""" - # coding: utf-8 -# flake8: noqa: D105 -from typing import ClassVar, List, Tuple # noqa: F401 +from typing import ClassVar, Dict, List, Tuple # noqa: F401 -from extractor_api_lib.models.confluence_parameters import ConfluenceParameters +from typing import Any, List from extractor_api_lib.models.extraction_request import ExtractionRequest from extractor_api_lib.models.information_piece import InformationPiece class BaseExtractorApi: - """ - The base ExtractorApi interface. - - Attributes - ---------- - subclasses : ClassVar[Tuple] - A tuple containing all subclasses of BaseExtractorApi. - """ - subclasses: ClassVar[Tuple] = () def __init_subclass__(cls, **kwargs): super().__init_subclass__(**kwargs) BaseExtractorApi.subclasses = BaseExtractorApi.subclasses + (cls,) - - async def extract_from_confluence_post( - self, - confluence_parameters: ConfluenceParameters, - ) -> List[InformationPiece]: - """ - Extract information from a Confluence space. - - Parameters - ---------- - confluence_parameters : ConfluenceParameters - The parameters required to access and extract information from the Confluence space. - - Returns - ------- - List[InformationPiece] - A list of extracted information pieces from the Confluence space. 
- """ - async def extract_from_file_post( self, extraction_request: ExtractionRequest, ) -> List[InformationPiece]: - """ - Extract information from a file based on the provided extraction request. - - Parameters - ---------- - extraction_request : ExtractionRequest - The request object containing details about the extraction process. - - Returns - ------- - List[InformationPiece] - A list of extracted information pieces. - """ + ... diff --git a/extractor-api-lib/src/extractor_api_lib/models/content_type.py b/extractor-api-lib/src/extractor_api_lib/models/content_type.py index 4e362d3..195f424 100644 --- a/extractor-api-lib/src/extractor_api_lib/models/content_type.py +++ b/extractor-api-lib/src/extractor_api_lib/models/content_type.py @@ -1,24 +1,25 @@ # coding: utf-8 """ -extractor-api-lib + extractor-api-lib -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. 
""" # noqa: E501 from __future__ import annotations - import json import pprint import re # noqa: F401 from enum import Enum + + try: from typing import Self except ImportError: @@ -26,16 +27,20 @@ class ContentType(str, Enum): - """ """ + """ + + """ """ allowed enum values """ - IMAGE = "IMAGE" - TABLE = "TABLE" - TEXT = "TEXT" + IMAGE = 'IMAGE' + TABLE = 'TABLE' + TEXT = 'TEXT' @classmethod def from_json(cls, json_str: str) -> Self: """Create an instance of ContentType from a JSON string""" return cls(json.loads(json_str)) + + diff --git a/extractor-api-lib/src/extractor_api_lib/models/extra_models.py b/extractor-api-lib/src/extractor_api_lib/models/extra_models.py index f0588d2..a3a283f 100644 --- a/extractor-api-lib/src/extractor_api_lib/models/extra_models.py +++ b/extractor-api-lib/src/extractor_api_lib/models/extra_models.py @@ -2,7 +2,6 @@ from pydantic import BaseModel - class TokenModel(BaseModel): """Defines a token model.""" diff --git a/extractor-api-lib/src/extractor_api_lib/models/extraction_request.py b/extractor-api-lib/src/extractor_api_lib/models/extraction_request.py index 3290aa7..437442f 100644 --- a/extractor-api-lib/src/extractor_api_lib/models/extraction_request.py +++ b/extractor-api-lib/src/extractor_api_lib/models/extraction_request.py @@ -1,37 +1,41 @@ # coding: utf-8 """ -extractor-api-lib + extractor-api-lib -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. 
""" # noqa: E501 from __future__ import annotations - -import json import pprint import re # noqa: F401 -from typing import Any, ClassVar, Dict, List +import json + + -from pydantic import BaseModel, ConfigDict, StrictStr +from pydantic import BaseModel, ConfigDict, StrictBytes, StrictStr +from typing import Any, ClassVar, Dict, List, Optional, Tuple, Union +from extractor_api_lib.models.key_value_pair import KeyValuePair try: from typing import Self except ImportError: from typing_extensions import Self - class ExtractionRequest(BaseModel): - """ """ # noqa: E501 - - path_on_s3: StrictStr - __properties: ClassVar[List[str]] = ["path_on_s3"] + """ + + """ # noqa: E501 + file: Optional[Union[StrictBytes, StrictStr, Tuple[StrictStr, StrictBytes]]] = None + type: StrictStr + kwargs: Optional[List[KeyValuePair]] = None + __properties: ClassVar[List[str]] = ["file", "type", "kwargs"] model_config = { "populate_by_name": True, @@ -39,13 +43,15 @@ class ExtractionRequest(BaseModel): "protected_namespaces": (), } + def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - return self.model_dump_json(by_alias=True, exclude_unset=True) + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) @classmethod def from_json(cls, json_str: str) -> Self: @@ -64,9 +70,17 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={}, + exclude={ + }, exclude_none=True, ) + # override the default output from pydantic by calling `to_dict()` of each item in kwargs (list) + _items = [] + if self.kwargs: + for _item in self.kwargs: + if _item: + _items.append(_item.to_dict()) + _dict['kwargs'] = _items return _dict @classmethod @@ -78,5 +92,11 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): 
return cls.model_validate(obj) - _obj = cls.model_validate({"path_on_s3": obj.get("path_on_s3")}) + _obj = cls.model_validate({ + "file": obj.get("file"), + "type": obj.get("type"), + "kwargs": [KeyValuePair.from_dict(_item) for _item in obj.get("kwargs")] if obj.get("kwargs") is not None else None + }) return _obj + + diff --git a/extractor-api-lib/src/extractor_api_lib/models/information_piece.py b/extractor-api-lib/src/extractor_api_lib/models/information_piece.py index 440f7a3..98261ff 100644 --- a/extractor-api-lib/src/extractor_api_lib/models/information_piece.py +++ b/extractor-api-lib/src/extractor_api_lib/models/information_piece.py @@ -1,40 +1,38 @@ # coding: utf-8 """ -extractor-api-lib + extractor-api-lib -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. """ # noqa: E501 from __future__ import annotations - -import json import pprint import re # noqa: F401 -from typing import Any, ClassVar, Dict, List +import json + -from pydantic import BaseModel, ConfigDict, StrictStr + +from pydantic import BaseModel, ConfigDict, StrictStr +from typing import Any, ClassVar, Dict, List from extractor_api_lib.models.content_type import ContentType from extractor_api_lib.models.key_value_pair import KeyValuePair - try: from typing import Self except ImportError: from typing_extensions import Self - class InformationPiece(BaseModel): """ A piece of information that has been extracted. 
- """ # noqa: E501 - + """ # noqa: E501 metadata: List[KeyValuePair] page_content: StrictStr type: ContentType @@ -46,13 +44,15 @@ class InformationPiece(BaseModel): "protected_namespaces": (), } + def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - return self.model_dump_json(by_alias=True, exclude_unset=True) + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) @classmethod def from_json(cls, json_str: str) -> Self: @@ -71,7 +71,8 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={}, + exclude={ + }, exclude_none=True, ) # override the default output from pydantic by calling `to_dict()` of each item in metadata (list) @@ -80,7 +81,7 @@ def to_dict(self) -> Dict[str, Any]: for _item in self.metadata: if _item: _items.append(_item.to_dict()) - _dict["metadata"] = _items + _dict['metadata'] = _items return _dict @classmethod @@ -92,15 +93,11 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate( - { - "metadata": ( - [KeyValuePair.from_dict(_item) for _item in obj.get("metadata")] - if obj.get("metadata") is not None - else None - ), - "page_content": obj.get("page_content"), - "type": obj.get("type"), - } - ) + _obj = cls.model_validate({ + "metadata": [KeyValuePair.from_dict(_item) for _item in obj.get("metadata")] if obj.get("metadata") is not None else None, + "page_content": obj.get("page_content"), + "type": obj.get("type") + }) return _obj + + diff --git a/extractor-api-lib/src/extractor_api_lib/models/key_value_pair.py b/extractor-api-lib/src/extractor_api_lib/models/key_value_pair.py index bdc5bb2..0cf865e 100644 --- a/extractor-api-lib/src/extractor_api_lib/models/key_value_pair.py +++ 
b/extractor-api-lib/src/extractor_api_lib/models/key_value_pair.py @@ -1,35 +1,36 @@ # coding: utf-8 """ -extractor-api-lib + extractor-api-lib -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. """ # noqa: E501 from __future__ import annotations - -import json import pprint import re # noqa: F401 -from typing import Any, ClassVar, Dict, List, Optional +import json + + -from pydantic import BaseModel, ConfigDict +from pydantic import BaseModel, ConfigDict +from typing import Any, ClassVar, Dict, List, Optional try: from typing import Self except ImportError: from typing_extensions import Self - class KeyValuePair(BaseModel): - """ """ # noqa: E501 - + """ + + """ # noqa: E501 key: Optional[Any] = None value: Optional[Any] = None __properties: ClassVar[List[str]] = ["key", "value"] @@ -40,13 +41,15 @@ class KeyValuePair(BaseModel): "protected_namespaces": (), } + def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - return self.model_dump_json(by_alias=True, exclude_unset=True) + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) @classmethod def from_json(cls, json_str: str) -> Self: @@ -65,18 +68,19 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={}, + exclude={ + }, exclude_none=True, ) # set to None if key (nullable) is None # 
and model_fields_set contains the field if self.key is None and "key" in self.model_fields_set: - _dict["key"] = None + _dict['key'] = None # set to None if value (nullable) is None # and model_fields_set contains the field if self.value is None and "value" in self.model_fields_set: - _dict["value"] = None + _dict['value'] = None return _dict @@ -89,5 +93,10 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({"key": obj.get("key"), "value": obj.get("value")}) + _obj = cls.model_validate({ + "key": obj.get("key"), + "value": obj.get("value") + }) return _obj + + diff --git a/rag-core-api/src/rag_core_api/apis/rag_api.py b/rag-core-api/src/rag_core_api/apis/rag_api.py index dda92db..425f48c 100644 --- a/rag-core-api/src/rag_core_api/apis/rag_api.py +++ b/rag-core-api/src/rag_core_api/apis/rag_api.py @@ -3,16 +3,16 @@ # coding: utf-8 # flake8: noqa: D105 +from typing import Dict, List # noqa: F401 import importlib -import logging import pkgutil -from asyncio import FIRST_COMPLETED, CancelledError, create_task, sleep, wait -from contextlib import suppress -from typing import Any, Awaitable, List # noqa: F401 + +from rag_core_api.apis.rag_api_base import BaseRagApi +import openapi_server.impl from fastapi import ( # noqa: F401 APIRouter, - BackgroundTasks, + BackgroundTasks, Body, Cookie, Depends, @@ -21,7 +21,7 @@ HTTPException, Path, Query, - Request, + Request, Response, Security, status, @@ -29,11 +29,16 @@ import rag_core_api.impl from rag_core_api.apis.rag_api_base import BaseRagApi +from rag_core_api.models.extra_models import TokenModel # noqa: F401 +from pydantic import Field, StrictStr +from typing import Any, List +from typing_extensions import Annotated from rag_core_api.models.chat_request import ChatRequest from rag_core_api.models.chat_response import ChatResponse from rag_core_api.models.delete_request import DeleteRequest from rag_core_api.models.information_piece import 
InformationPiece + logger = logging.getLogger(__name__) router = APIRouter() @@ -52,7 +57,6 @@ async def _disconnected(request: Request) -> None: except CancelledError: break - @router.post( "/chat/{session_id}", responses={ @@ -64,8 +68,8 @@ async def _disconnected(request: Request) -> None: ) async def chat( request: Request, - session_id: str = Path(..., description=""), - chat_request: ChatRequest = Body(None, description="Chat with RAG."), + session_id: StrictStr = Path(..., description=""), + chat_request: Annotated[ChatRequest, Field(description="Chat with RAG.")] = Body(None, description="Chat with RAG."), ) -> ChatResponse | None: """ Asynchronously handles the chat endpoint for the RAG API. @@ -121,7 +125,8 @@ async def chat( tags=["rag"], response_model_by_alias=True, ) -async def evaluate() -> None: +async def evaluate( +) -> None: """ Asynchronously evaluate the RAG. @@ -129,6 +134,8 @@ async def evaluate() -> None: ------- None """ + if not BaseRagApi.subclasses: + raise HTTPException(status_code=500, detail="Not implemented") return await BaseRagApi.subclasses[0]().evaluate() @@ -160,7 +167,9 @@ async def remove_information_piece( Returns ------- None - """ + """ + if not BaseRagApi.subclasses: + raise HTTPException(status_code=500, detail="Not implemented") return await BaseRagApi.subclasses[0]().remove_information_piece(delete_request) @@ -191,5 +200,7 @@ async def upload_information_piece( Returns ------- None - """ + """ + if not BaseRagApi.subclasses: + raise HTTPException(status_code=500, detail="Not implemented") return await BaseRagApi.subclasses[0]().upload_information_piece(information_piece) diff --git a/rag-core-api/src/rag_core_api/apis/rag_api_base.py b/rag-core-api/src/rag_core_api/apis/rag_api_base.py index 615230d..70d1406 100644 --- a/rag-core-api/src/rag_core_api/apis/rag_api_base.py +++ b/rag-core-api/src/rag_core_api/apis/rag_api_base.py @@ -2,9 +2,11 @@ # coding: utf-8 # flake8: noqa: D105 - from typing import ClassVar, Dict, 
List, Tuple # noqa: F401 +from pydantic import Field, StrictStr +from typing import Any, List +from typing_extensions import Annotated from rag_core_api.models.chat_request import ChatRequest from rag_core_api.models.chat_response import ChatResponse from rag_core_api.models.delete_request import DeleteRequest @@ -22,17 +24,15 @@ class BaseRagApi: subclasses : ClassVar[Tuple] A tuple that holds all subclasses of BaseRagApi. """ - subclasses: ClassVar[Tuple] = () def __init_subclass__(cls, **kwargs): super().__init_subclass__(**kwargs) BaseRagApi.subclasses = BaseRagApi.subclasses + (cls,) - async def chat( self, - session_id: str, - chat_request: ChatRequest, + session_id: StrictStr, + chat_request: Annotated[ChatRequest, Field(description="Chat with RAG.")], ) -> ChatResponse: """ Asynchronously handles the chat endpoint for the RAG API. @@ -52,6 +52,7 @@ async def chat( The chat response if the chat task completes successfully, otherwise None. """ + async def evaluate( self, ) -> None: @@ -63,6 +64,7 @@ async def evaluate( None """ + async def remove_information_piece( self, delete_request: DeleteRequest, @@ -82,6 +84,7 @@ async def remove_information_piece( None """ + async def upload_information_piece( self, information_piece: List[InformationPiece], diff --git a/rag-core-api/src/rag_core_api/models/chat_history.py b/rag-core-api/src/rag_core_api/models/chat_history.py index 5980dca..71e2e8c 100644 --- a/rag-core-api/src/rag_core_api/models/chat_history.py +++ b/rag-core-api/src/rag_core_api/models/chat_history.py @@ -1,37 +1,37 @@ # coding: utf-8 """ -RAG SIT x Stackit + RAG SIT x Stackit -The perfect rag solution. + The perfect rag solution. -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. 
""" # noqa: E501 from __future__ import annotations - -import json import pprint import re # noqa: F401 -from typing import Any, ClassVar, Dict, List +import json -from pydantic import BaseModel, ConfigDict -from rag_core_api.models.chat_history_message import ChatHistoryMessage + +from pydantic import BaseModel, ConfigDict +from typing import Any, ClassVar, Dict, List +from rag_core_api.models.chat_history_message import ChatHistoryMessage try: from typing import Self except ImportError: from typing_extensions import Self - class ChatHistory(BaseModel): - """ """ # noqa: E501 - + """ + + """ # noqa: E501 messages: List[ChatHistoryMessage] __properties: ClassVar[List[str]] = ["messages"] @@ -41,13 +41,15 @@ class ChatHistory(BaseModel): "protected_namespaces": (), } + def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - return self.model_dump_json(by_alias=True, exclude_unset=True) + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) @classmethod def from_json(cls, json_str: str) -> Self: @@ -66,7 +68,8 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={}, + exclude={ + }, exclude_none=True, ) # override the default output from pydantic by calling `to_dict()` of each item in messages (list) @@ -75,7 +78,7 @@ def to_dict(self) -> Dict[str, Any]: for _item in self.messages: if _item: _items.append(_item.to_dict()) - _dict["messages"] = _items + _dict['messages'] = _items return _dict @classmethod @@ -87,13 +90,9 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate( - { - "messages": ( - [ChatHistoryMessage.from_dict(_item) for _item in obj.get("messages")] - if obj.get("messages") is not None - else 
None - ) - } - ) + _obj = cls.model_validate({ + "messages": [ChatHistoryMessage.from_dict(_item) for _item in obj.get("messages")] if obj.get("messages") is not None else None + }) return _obj + + diff --git a/rag-core-api/src/rag_core_api/models/chat_history_message.py b/rag-core-api/src/rag_core_api/models/chat_history_message.py index c664092..59da140 100644 --- a/rag-core-api/src/rag_core_api/models/chat_history_message.py +++ b/rag-core-api/src/rag_core_api/models/chat_history_message.py @@ -1,37 +1,37 @@ # coding: utf-8 """ -RAG SIT x Stackit + RAG SIT x Stackit -The perfect rag solution. + The perfect rag solution. -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. """ # noqa: E501 from __future__ import annotations - -import json import pprint import re # noqa: F401 -from typing import Any, ClassVar, Dict, List +import json -from pydantic import BaseModel, ConfigDict, StrictStr -from rag_core_api.models.chat_role import ChatRole + +from pydantic import BaseModel, ConfigDict, StrictStr +from typing import Any, ClassVar, Dict, List +from rag_core_api.models.chat_role import ChatRole try: from typing import Self except ImportError: from typing_extensions import Self - class ChatHistoryMessage(BaseModel): - """ """ # noqa: E501 - + """ + + """ # noqa: E501 role: ChatRole message: StrictStr __properties: ClassVar[List[str]] = ["role", "message"] @@ -42,13 +42,15 @@ class ChatHistoryMessage(BaseModel): "protected_namespaces": (), } + def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - return self.model_dump_json(by_alias=True, exclude_unset=True) + # 
TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) @classmethod def from_json(cls, json_str: str) -> Self: @@ -67,7 +69,8 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={}, + exclude={ + }, exclude_none=True, ) return _dict @@ -81,5 +84,10 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({"role": obj.get("role"), "message": obj.get("message")}) + _obj = cls.model_validate({ + "role": obj.get("role"), + "message": obj.get("message") + }) return _obj + + diff --git a/rag-core-api/src/rag_core_api/models/chat_request.py b/rag-core-api/src/rag_core_api/models/chat_request.py index 1e0b135..9e28631 100644 --- a/rag-core-api/src/rag_core_api/models/chat_request.py +++ b/rag-core-api/src/rag_core_api/models/chat_request.py @@ -1,37 +1,37 @@ # coding: utf-8 """ -RAG SIT x Stackit + RAG SIT x Stackit -The perfect rag solution. + The perfect rag solution. -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. 
""" # noqa: E501 from __future__ import annotations - -import json import pprint import re # noqa: F401 -from typing import Any, ClassVar, Dict, List, Optional +import json -from pydantic import BaseModel, ConfigDict, StrictStr -from rag_core_api.models.chat_history import ChatHistory + +from pydantic import BaseModel, ConfigDict, StrictStr +from typing import Any, ClassVar, Dict, List, Optional +from rag_core_api.models.chat_history import ChatHistory try: from typing import Self except ImportError: from typing_extensions import Self - class ChatRequest(BaseModel): - """ """ # noqa: E501 - + """ + + """ # noqa: E501 history: Optional[ChatHistory] = None message: StrictStr __properties: ClassVar[List[str]] = ["history", "message"] @@ -42,13 +42,15 @@ class ChatRequest(BaseModel): "protected_namespaces": (), } + def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - return self.model_dump_json(by_alias=True, exclude_unset=True) + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) @classmethod def from_json(cls, json_str: str) -> Self: @@ -67,12 +69,13 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={}, + exclude={ + }, exclude_none=True, ) # override the default output from pydantic by calling `to_dict()` of history if self.history: - _dict["history"] = self.history.to_dict() + _dict['history'] = self.history.to_dict() return _dict @classmethod @@ -84,10 +87,10 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate( - { - "history": (ChatHistory.from_dict(obj.get("history")) if obj.get("history") is not None else None), - "message": obj.get("message"), - } - ) + _obj = cls.model_validate({ + 
"history": ChatHistory.from_dict(obj.get("history")) if obj.get("history") is not None else None, + "message": obj.get("message") + }) return _obj + + diff --git a/rag-core-api/src/rag_core_api/models/chat_response.py b/rag-core-api/src/rag_core_api/models/chat_response.py index a0fcf44..6a8daad 100644 --- a/rag-core-api/src/rag_core_api/models/chat_response.py +++ b/rag-core-api/src/rag_core_api/models/chat_response.py @@ -1,37 +1,37 @@ # coding: utf-8 """ -RAG SIT x Stackit + RAG SIT x Stackit -The perfect rag solution. + The perfect rag solution. -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. """ # noqa: E501 from __future__ import annotations - -import json import pprint import re # noqa: F401 -from typing import Any, ClassVar, Dict, List +import json -from pydantic import BaseModel, ConfigDict, Field, StrictStr -from rag_core_api.models.information_piece import InformationPiece + +from pydantic import BaseModel, ConfigDict, Field, StrictStr +from typing import Any, ClassVar, Dict, List +from rag_core_api.models.information_piece import InformationPiece try: from typing import Self except ImportError: from typing_extensions import Self - class ChatResponse(BaseModel): - """ """ # noqa: E501 - + """ + + """ # noqa: E501 answer: StrictStr finish_reason: StrictStr = Field(description=" ") citations: List[InformationPiece] @@ -43,13 +43,15 @@ class ChatResponse(BaseModel): "protected_namespaces": (), } + def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - return self.model_dump_json(by_alias=True, exclude_unset=True) + # TODO: pydantic v2: use 
.model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) @classmethod def from_json(cls, json_str: str) -> Self: @@ -68,7 +70,8 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={}, + exclude={ + }, exclude_none=True, ) # override the default output from pydantic by calling `to_dict()` of each item in citations (list) @@ -77,7 +80,7 @@ def to_dict(self) -> Dict[str, Any]: for _item in self.citations: if _item: _items.append(_item.to_dict()) - _dict["citations"] = _items + _dict['citations'] = _items return _dict @classmethod @@ -89,15 +92,11 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate( - { - "answer": obj.get("answer"), - "finish_reason": obj.get("finish_reason"), - "citations": ( - [SourceDocument.from_dict(_item) for _item in obj.get("citations")] - if obj.get("citations") is not None - else None - ), - } - ) + _obj = cls.model_validate({ + "answer": obj.get("answer"), + "finish_reason": obj.get("finish_reason"), + "citations": [InformationPiece.from_dict(_item) for _item in obj.get("citations")] if obj.get("citations") is not None else None + }) return _obj + + diff --git a/rag-core-api/src/rag_core_api/models/chat_role.py b/rag-core-api/src/rag_core_api/models/chat_role.py index cd2ff17..d0bef70 100644 --- a/rag-core-api/src/rag_core_api/models/chat_role.py +++ b/rag-core-api/src/rag_core_api/models/chat_role.py @@ -1,24 +1,25 @@ # coding: utf-8 """ -RAG SIT x Stackit + RAG SIT x Stackit -The perfect rag solution. + The perfect rag solution. -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. 
""" # noqa: E501 from __future__ import annotations - import json import pprint import re # noqa: F401 from enum import Enum + + try: from typing import Self except ImportError: @@ -26,15 +27,19 @@ class ChatRole(str, Enum): - """ """ + """ + + """ """ allowed enum values """ - USER = "user" - ASSISTANT = "assistant" + USER = 'user' + ASSISTANT = 'assistant' @classmethod def from_json(cls, json_str: str) -> Self: """Create an instance of ChatRole from a JSON string""" return cls(json.loads(json_str)) + + diff --git a/rag-core-api/src/rag_core_api/models/content_type.py b/rag-core-api/src/rag_core_api/models/content_type.py index 3d39928..df72d7d 100644 --- a/rag-core-api/src/rag_core_api/models/content_type.py +++ b/rag-core-api/src/rag_core_api/models/content_type.py @@ -1,24 +1,25 @@ # coding: utf-8 """ -RAG SIT x Stackit + RAG SIT x Stackit -The perfect rag solution. + The perfect rag solution. -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. 
""" # noqa: E501 from __future__ import annotations - import json import pprint import re # noqa: F401 from enum import Enum + + try: from typing import Self except ImportError: @@ -26,17 +27,21 @@ class ContentType(str, Enum): - """ """ + """ + + """ """ allowed enum values """ - TEXT = "TEXT" - IMAGE = "IMAGE" - TABLE = "TABLE" - SUMMARY = "SUMMARY" + TEXT = 'TEXT' + IMAGE = 'IMAGE' + TABLE = 'TABLE' + SUMMARY = 'SUMMARY' @classmethod def from_json(cls, json_str: str) -> Self: """Create an instance of ContentType from a JSON string""" return cls(json.loads(json_str)) + + diff --git a/rag-core-api/src/rag_core_api/models/delete_request.py b/rag-core-api/src/rag_core_api/models/delete_request.py index 797dcf2..2c3592c 100644 --- a/rag-core-api/src/rag_core_api/models/delete_request.py +++ b/rag-core-api/src/rag_core_api/models/delete_request.py @@ -1,37 +1,37 @@ # coding: utf-8 """ -RAG SIT x Stackit + RAG SIT x Stackit -The perfect rag solution. + The perfect rag solution. -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. 
""" # noqa: E501 from __future__ import annotations - -import json import pprint import re # noqa: F401 -from typing import Any, ClassVar, Dict, List, Optional +import json -from pydantic import BaseModel, ConfigDict -from rag_core_api.models.key_value_pair import KeyValuePair + +from pydantic import BaseModel, ConfigDict +from typing import Any, ClassVar, Dict, List, Optional +from rag_core_api.models.key_value_pair import KeyValuePair try: from typing import Self except ImportError: from typing_extensions import Self - class DeleteRequest(BaseModel): - """ """ # noqa: E501 - + """ + + """ # noqa: E501 metadata: Optional[List[KeyValuePair]] = None __properties: ClassVar[List[str]] = ["metadata"] @@ -41,13 +41,15 @@ class DeleteRequest(BaseModel): "protected_namespaces": (), } + def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - return self.model_dump_json(by_alias=True, exclude_unset=True) + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) @classmethod def from_json(cls, json_str: str) -> Self: @@ -66,7 +68,8 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={}, + exclude={ + }, exclude_none=True, ) # override the default output from pydantic by calling `to_dict()` of each item in metadata (list) @@ -75,7 +78,7 @@ def to_dict(self) -> Dict[str, Any]: for _item in self.metadata: if _item: _items.append(_item.to_dict()) - _dict["metadata"] = _items + _dict['metadata'] = _items return _dict @classmethod @@ -87,13 +90,9 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate( - { - "metadata": ( - [KeyValuePair.from_dict(_item) for _item in obj.get("metadata")] - if obj.get("metadata") is not None - 
else None - ) - } - ) + _obj = cls.model_validate({ + "metadata": [KeyValuePair.from_dict(_item) for _item in obj.get("metadata")] if obj.get("metadata") is not None else None + }) return _obj + + diff --git a/rag-core-api/src/rag_core_api/models/extra_models.py b/rag-core-api/src/rag_core_api/models/extra_models.py index f0588d2..a3a283f 100644 --- a/rag-core-api/src/rag_core_api/models/extra_models.py +++ b/rag-core-api/src/rag_core_api/models/extra_models.py @@ -2,7 +2,6 @@ from pydantic import BaseModel - class TokenModel(BaseModel): """Defines a token model.""" diff --git a/rag-core-api/src/rag_core_api/models/information_piece.py b/rag-core-api/src/rag_core_api/models/information_piece.py index b85092f..28d5115 100644 --- a/rag-core-api/src/rag_core_api/models/information_piece.py +++ b/rag-core-api/src/rag_core_api/models/information_piece.py @@ -1,43 +1,39 @@ # coding: utf-8 """ -RAG SIT x Stackit + RAG SIT x Stackit -The perfect rag solution. + The perfect rag solution. -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. """ # noqa: E501 from __future__ import annotations - -import json import pprint import re # noqa: F401 -from typing import Any, ClassVar, Dict, List +import json + -from pydantic import BaseModel, ConfigDict, Field, StrictStr + +from pydantic import BaseModel, ConfigDict, Field, StrictStr +from typing import Any, ClassVar, Dict, List from rag_core_api.models.content_type import ContentType from rag_core_api.models.key_value_pair import KeyValuePair - try: from typing import Self except ImportError: from typing_extensions import Self - class InformationPiece(BaseModel): """ Uploading a json with chunks and metadata. 
- """ # noqa: E501 - - metadata: List[KeyValuePair] = Field( - description="The metadata of the documents that are stored in the vectordatabase." - ) + """ # noqa: E501 + metadata: List[KeyValuePair] = Field(description="The metadata of the documents that are stored in the vectordatabase.") page_content: StrictStr = Field(description="The content of the document") type: ContentType __properties: ClassVar[List[str]] = ["metadata", "page_content", "type"] @@ -48,13 +44,15 @@ class InformationPiece(BaseModel): "protected_namespaces": (), } + def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - return self.model_dump_json(by_alias=True, exclude_unset=True) + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) @classmethod def from_json(cls, json_str: str) -> Self: @@ -73,7 +71,8 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={}, + exclude={ + }, exclude_none=True, ) # override the default output from pydantic by calling `to_dict()` of each item in metadata (list) @@ -82,7 +81,7 @@ def to_dict(self) -> Dict[str, Any]: for _item in self.metadata: if _item: _items.append(_item.to_dict()) - _dict["metadata"] = _items + _dict['metadata'] = _items return _dict @classmethod @@ -94,15 +93,11 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate( - { - "metadata": ( - [KeyValuePair.from_dict(_item) for _item in obj.get("metadata")] - if obj.get("metadata") is not None - else None - ), - "page_content": obj.get("page_content"), - "type": obj.get("type"), - } - ) + _obj = cls.model_validate({ + "metadata": [KeyValuePair.from_dict(_item) for _item in obj.get("metadata")] if obj.get("metadata") is not None else 
None, + "page_content": obj.get("page_content"), + "type": obj.get("type") + }) return _obj + + diff --git a/rag-core-api/src/rag_core_api/models/key_value_pair.py b/rag-core-api/src/rag_core_api/models/key_value_pair.py index abf0986..b9654c3 100644 --- a/rag-core-api/src/rag_core_api/models/key_value_pair.py +++ b/rag-core-api/src/rag_core_api/models/key_value_pair.py @@ -1,37 +1,36 @@ # coding: utf-8 """ -RAG SIT x Stackit + RAG SIT x Stackit -The perfect rag solution. + The perfect rag solution. -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. """ # noqa: E501 from __future__ import annotations - -import json import pprint import re # noqa: F401 -from typing import Any, ClassVar, Dict, List +import json + + -from pydantic import BaseModel, ConfigDict, Field, StrictStr +from pydantic import BaseModel, ConfigDict, Field, StrictStr +from typing import Any, ClassVar, Dict, List try: from typing import Self except ImportError: from typing_extensions import Self - class KeyValuePair(BaseModel): """ The key value pair. 
- """ # noqa: E501 - + """ # noqa: E501 key: StrictStr value: StrictStr = Field(description=" ") __properties: ClassVar[List[str]] = ["key", "value"] @@ -42,13 +41,15 @@ class KeyValuePair(BaseModel): "protected_namespaces": (), } + def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - return self.model_dump_json(by_alias=True, exclude_unset=True) + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) @classmethod def from_json(cls, json_str: str) -> Self: @@ -67,7 +68,8 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={}, + exclude={ + }, exclude_none=True, ) return _dict @@ -81,5 +83,10 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({"key": obj.get("key"), "value": obj.get("value")}) + _obj = cls.model_validate({ + "key": obj.get("key"), + "value": obj.get("value") + }) return _obj + + From cae32ed28faa338ae9d9917e95144dfda5e787cf Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Fri, 9 May 2025 14:46:02 +0200 Subject: [PATCH 02/43] api change --- admin-api-lib/openapi.yaml | 14 +- .../api_endpoints/source_uploader.py | 14 + .../src/admin_api_lib/apis/admin_api.py | 22 +- .../src/admin_api_lib/apis/admin_api_base.py | 24 +- .../openapi_client/__init__.py | 11 +- .../openapi_client/api/__init__.py | 1 - .../openapi_client/api/extractor_api.py | 211 +++++++------ .../openapi_client/api_client.py | 292 ++++++------------ .../openapi_client/api_response.py | 5 +- .../openapi_client/configuration.py | 102 +++--- .../openapi_client/exceptions.py | 38 ++- .../openapi_client/models/__init__.py | 11 +- .../openapi_client/models/content_type.py | 22 +- .../models/extraction_request.py | 40 +-- 
.../models/information_piece.py | 38 ++- .../openapi_client/models/key_value_pair.py | 31 +- .../openapi_client/rest.py | 110 ++----- .../openapi_client/test/test_content_type.py | 14 +- .../test/test_extraction_request.py | 20 +- .../openapi_client/test/test_extractor_api.py | 16 +- .../test/test_information_piece.py | 20 +- .../test/test_key_value_pair.py | 20 +- .../admin_api_lib/models/document_status.py | 30 +- .../src/admin_api_lib/models/extra_models.py | 1 + .../admin_api_lib/models/key_value_pair.py | 34 +- .../src/admin_api_lib/models/status.py | 25 +- .../src/admin_api_lib/models/upload_source.py | 43 +-- extractor-api-lib/openapi.yaml | 14 +- .../extractor_api_lib/apis/extractor_api.py | 17 +- .../apis/extractor_api_base.py | 17 +- .../extractor_api_lib/models/content_type.py | 23 +- .../extractor_api_lib/models/extra_models.py | 1 + .../models/extraction_request.py | 43 +-- .../models/information_piece.py | 41 +-- .../models/key_value_pair.py | 34 +- rag-core-api/src/rag_core_api/apis/rag_api.py | 16 +- .../src/rag_core_api/apis/rag_api_base.py | 5 +- .../src/rag_core_api/models/chat_history.py | 39 +-- .../models/chat_history_message.py | 30 +- .../src/rag_core_api/models/chat_request.py | 37 +-- .../src/rag_core_api/models/chat_response.py | 43 +-- .../src/rag_core_api/models/chat_role.py | 21 +- .../src/rag_core_api/models/content_type.py | 25 +- .../src/rag_core_api/models/delete_request.py | 39 +-- .../src/rag_core_api/models/extra_models.py | 1 + .../rag_core_api/models/information_piece.py | 45 +-- .../src/rag_core_api/models/key_value_pair.py | 28 +- 47 files changed, 782 insertions(+), 946 deletions(-) create mode 100644 admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py diff --git a/admin-api-lib/openapi.yaml b/admin-api-lib/openapi.yaml index 1b8255a..efbb2f6 100644 --- a/admin-api-lib/openapi.yaml +++ b/admin-api-lib/openapi.yaml @@ -83,14 +83,14 @@ paths: - admin /upload_source: post: - description: Uploads user selected pdf 
documents. + description: Uploads user selected sources. operationId: upload_source requestBody: content: - application/pdf: + multipart/form-data: schema: $ref: '#/components/schemas/upload_source' - description: The PDF document to upload. + description: The source to upload. required: true responses: "200": @@ -137,21 +137,21 @@ components: file: description: "" format: binary - title: file type: string type: description: "" - title: type type: string kwargs: description: "" items: $ref: '#/components/schemas/key_value_pair' - title: kwargs type: array + name: + description: "" + type: string required: + - name - type - title: upload_source type: object key_value_pair: description: "" diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py b/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py new file mode 100644 index 0000000..2cfbf2f --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py @@ -0,0 +1,14 @@ +from dataclasses import Field +from typing_extensions import Annotated +from abc import ABC, abstractmethod + +from admin_api_lib.models.upload_source import UploadSource + + +class SourceUploader(ABC): + + @abstractmethod + async def upload_source( + self, + upload_source: Annotated[UploadSource, Field(description="The source to upload.")], + ) -> None: ... 
diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api.py b/admin-api-lib/src/admin_api_lib/apis/admin_api.py index 622cd5a..81d55f5 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api.py @@ -1,5 +1,3 @@ -"""Module for the Admin API.""" - # coding: utf-8 from typing import Dict, List # noqa: F401 @@ -28,10 +26,10 @@ from admin_api_lib.models.extra_models import TokenModel # noqa: F401 from pydantic import Field, StrictBytes, StrictStr -from typing import Any, List, Tuple, Union +from typing import Any, List, Optional, Tuple, Union from typing_extensions import Annotated from admin_api_lib.models.document_status import DocumentStatus -from admin_api_lib.models.upload_source import UploadSource +from admin_api_lib.models.key_value_pair import KeyValuePair router = APIRouter() @@ -101,17 +99,16 @@ async def document_reference_id_get( raise HTTPException(status_code=500, detail="Not implemented") return await BaseAdminApi.subclasses[0]().document_reference_id_get(identification) - @router.get( "/all_documents_status", responses={ - 200: {"model": list[DocumentStatus], "description": "list of document links"}, + 200: {"model": List[DocumentStatus], "description": "List of document links"}, 500: {"description": "Internal server error"}, }, tags=["admin"], response_model_by_alias=True, ) -async def get_all_documents_status() -> list[DocumentStatus]: +async def get_all_documents_status() -> List[DocumentStatus]: """ Asynchronously retrieves the status of all documents. @@ -119,7 +116,7 @@ async def get_all_documents_status() -> list[DocumentStatus]: ------- list[DocumentStatus] A list containing the status of all documents. 
- """ + """ if not BaseAdminApi.subclasses: raise HTTPException(status_code=500, detail="Not implemented") return await BaseAdminApi.subclasses[0]().get_all_documents_status() @@ -137,9 +134,12 @@ async def get_all_documents_status() -> list[DocumentStatus]: response_model_by_alias=True, ) async def upload_source( - upload_source: Annotated[UploadSource, Field(description="The source to upload.")] = Body(None, description="The source to upload."), + type: StrictStr = Form(None, description=""), + name: StrictStr = Form(None, description=""), + file: Optional[UploadFile] = Form(None, description=""), + kwargs: Optional[List[KeyValuePair]] = Form(None, description=""), ) -> None: - """Uploads user selected source.""" + """Uploads user selected sources.""" if not BaseAdminApi.subclasses: raise HTTPException(status_code=500, detail="Not implemented") - return await BaseAdminApi.subclasses[0]().upload_source(upload_source) + return await BaseAdminApi.subclasses[0]().upload_source(type, name, file, kwargs) diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py index efeb120..34bce77 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py @@ -3,24 +3,14 @@ from typing import ClassVar, Dict, List, Tuple # noqa: F401 from pydantic import Field, StrictBytes, StrictStr -from typing import Any, List, Tuple, Union +from typing import Any, List, Optional, Tuple, Union from typing_extensions import Annotated from fastapi import Request, Response, UploadFile - from admin_api_lib.models.document_status import DocumentStatus -from admin_api_lib.models.upload_source import UploadSource +from admin_api_lib.models.key_value_pair import KeyValuePair class BaseAdminApi: - """ - The base AdminApi interface. - - Attributes - ---------- - subclasses : ClassVar[Tuple] - A tuple that holds all subclasses of BaseAdminApi. 
- """ - subclasses: ClassVar[Tuple] = () def __init_subclass__(cls, **kwargs): @@ -44,10 +34,9 @@ async def delete_document( None """ - async def document_reference_id_get( self, - identification: str, + identification: Annotated[StrictStr, Field(description="Identifier of the pdf document.")], ) -> Response: """ Asynchronously retrieve a document reference by its identification. @@ -76,9 +65,12 @@ async def get_all_documents_status( A list containing the status of all documents. """ - async def upload_source( self, - upload_source: Annotated[UploadSource, Field(description="The PDF document to upload.")], + type: StrictStr, + name: StrictStr, + file: Optional[UploadFile], + kwargs: Optional[List[KeyValuePair]], ) -> None: + """Uploads user selected sources.""" ... diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/__init__.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/__init__.py index ae86262..f43e4e9 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/__init__.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/__init__.py @@ -3,14 +3,14 @@ # flake8: noqa """ - extractor-api-lib +extractor-api-lib - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. 
""" # noqa: E501 @@ -32,6 +32,5 @@ # import models into sdk package from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ContentType -from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ExtractionRequest from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import InformationPiece from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/__init__.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/__init__.py index 792725e..c95ce65 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/__init__.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/__init__.py @@ -2,4 +2,3 @@ # import apis into api package from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi - diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py index e4a0fa6..1a862d3 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - extractor-api-lib +extractor-api-lib - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. 
""" # noqa: E501 import warnings @@ -16,9 +16,10 @@ from typing import Any, Dict, List, Optional, Tuple, Union from typing_extensions import Annotated -from typing import List -from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ExtractionRequest +from pydantic import StrictBytes, StrictStr +from typing import List, Optional, Tuple, Union from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import InformationPiece +from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair from admin_api_lib.extractor_api_client.openapi_client.api_client import ApiClient, RequestSerialized from admin_api_lib.extractor_api_client.openapi_client.api_response import ApiResponse @@ -37,29 +38,34 @@ def __init__(self, api_client=None) -> None: api_client = ApiClient.get_default() self.api_client = api_client - @validate_call - def extract_from_file_post( + def extract( self, - extraction_request: ExtractionRequest, + type: StrictStr, + name: StrictStr, + file: Optional[Union[StrictBytes, StrictStr, Tuple[StrictStr, StrictBytes]]] = None, + kwargs: Optional[List[KeyValuePair]] = None, _request_timeout: Union[ None, Annotated[StrictFloat, Field(gt=0)], - Tuple[ - Annotated[StrictFloat, Field(gt=0)], - Annotated[StrictFloat, Field(gt=0)] - ] + Tuple[Annotated[StrictFloat, Field(gt=0)], Annotated[StrictFloat, Field(gt=0)]], ] = None, _request_auth: Optional[Dict[StrictStr, Any]] = None, _content_type: Optional[StrictStr] = None, _headers: Optional[Dict[StrictStr, Any]] = None, _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0, ) -> List[InformationPiece]: - """extract_from_file_post + """extract - :param extraction_request: (required) - :type extraction_request: ExtractionRequest + :param type: (required) + :type type: str + :param name: (required) + :type name: str + :param file: + :type file: bytearray + :param kwargs: + :type kwargs: List[KeyValuePair] :param _request_timeout: 
timeout setting for this request. If one number provided, it will be total request timeout. It can also be a pair (tuple) of @@ -80,54 +86,59 @@ def extract_from_file_post( in the spec for a single request. :type _host_index: int, optional :return: Returns the result object. - """ # noqa: E501 + """ # noqa: E501 - _param = self._extract_from_file_post_serialize( - extraction_request=extraction_request, + _param = self._extract_serialize( + type=type, + name=name, + file=file, + kwargs=kwargs, _request_auth=_request_auth, _content_type=_content_type, _headers=_headers, - _host_index=_host_index + _host_index=_host_index, ) _response_types_map: Dict[str, Optional[str]] = { - '200': "List[InformationPiece]", - '422': None, - '500': None, + "200": "List[InformationPiece]", + "422": None, + "500": None, } - response_data = self.api_client.call_api( - *_param, - _request_timeout=_request_timeout - ) + response_data = self.api_client.call_api(*_param, _request_timeout=_request_timeout) response_data.read() return self.api_client.response_deserialize( response_data=response_data, response_types_map=_response_types_map, ).data - @validate_call - def extract_from_file_post_with_http_info( + def extract_with_http_info( self, - extraction_request: ExtractionRequest, + type: StrictStr, + name: StrictStr, + file: Optional[Union[StrictBytes, StrictStr, Tuple[StrictStr, StrictBytes]]] = None, + kwargs: Optional[List[KeyValuePair]] = None, _request_timeout: Union[ None, Annotated[StrictFloat, Field(gt=0)], - Tuple[ - Annotated[StrictFloat, Field(gt=0)], - Annotated[StrictFloat, Field(gt=0)] - ] + Tuple[Annotated[StrictFloat, Field(gt=0)], Annotated[StrictFloat, Field(gt=0)]], ] = None, _request_auth: Optional[Dict[StrictStr, Any]] = None, _content_type: Optional[StrictStr] = None, _headers: Optional[Dict[StrictStr, Any]] = None, _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0, ) -> ApiResponse[List[InformationPiece]]: - """extract_from_file_post + """extract - :param 
extraction_request: (required) - :type extraction_request: ExtractionRequest + :param type: (required) + :type type: str + :param name: (required) + :type name: str + :param file: + :type file: bytearray + :param kwargs: + :type kwargs: List[KeyValuePair] :param _request_timeout: timeout setting for this request. If one number provided, it will be total request timeout. It can also be a pair (tuple) of @@ -148,54 +159,59 @@ def extract_from_file_post_with_http_info( in the spec for a single request. :type _host_index: int, optional :return: Returns the result object. - """ # noqa: E501 + """ # noqa: E501 - _param = self._extract_from_file_post_serialize( - extraction_request=extraction_request, + _param = self._extract_serialize( + type=type, + name=name, + file=file, + kwargs=kwargs, _request_auth=_request_auth, _content_type=_content_type, _headers=_headers, - _host_index=_host_index + _host_index=_host_index, ) _response_types_map: Dict[str, Optional[str]] = { - '200': "List[InformationPiece]", - '422': None, - '500': None, + "200": "List[InformationPiece]", + "422": None, + "500": None, } - response_data = self.api_client.call_api( - *_param, - _request_timeout=_request_timeout - ) + response_data = self.api_client.call_api(*_param, _request_timeout=_request_timeout) response_data.read() return self.api_client.response_deserialize( response_data=response_data, response_types_map=_response_types_map, ) - @validate_call - def extract_from_file_post_without_preload_content( + def extract_without_preload_content( self, - extraction_request: ExtractionRequest, + type: StrictStr, + name: StrictStr, + file: Optional[Union[StrictBytes, StrictStr, Tuple[StrictStr, StrictBytes]]] = None, + kwargs: Optional[List[KeyValuePair]] = None, _request_timeout: Union[ None, Annotated[StrictFloat, Field(gt=0)], - Tuple[ - Annotated[StrictFloat, Field(gt=0)], - Annotated[StrictFloat, Field(gt=0)] - ] + Tuple[Annotated[StrictFloat, Field(gt=0)], Annotated[StrictFloat, Field(gt=0)]], 
] = None, _request_auth: Optional[Dict[StrictStr, Any]] = None, _content_type: Optional[StrictStr] = None, _headers: Optional[Dict[StrictStr, Any]] = None, _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0, ) -> RESTResponseType: - """extract_from_file_post + """extract - :param extraction_request: (required) - :type extraction_request: ExtractionRequest + :param type: (required) + :type type: str + :param name: (required) + :type name: str + :param file: + :type file: bytearray + :param kwargs: + :type kwargs: List[KeyValuePair] :param _request_timeout: timeout setting for this request. If one number provided, it will be total request timeout. It can also be a pair (tuple) of @@ -216,31 +232,33 @@ def extract_from_file_post_without_preload_content( in the spec for a single request. :type _host_index: int, optional :return: Returns the result object. - """ # noqa: E501 + """ # noqa: E501 - _param = self._extract_from_file_post_serialize( - extraction_request=extraction_request, + _param = self._extract_serialize( + type=type, + name=name, + file=file, + kwargs=kwargs, _request_auth=_request_auth, _content_type=_content_type, _headers=_headers, - _host_index=_host_index + _host_index=_host_index, ) _response_types_map: Dict[str, Optional[str]] = { - '200': "List[InformationPiece]", - '422': None, - '500': None, + "200": "List[InformationPiece]", + "422": None, + "500": None, } - response_data = self.api_client.call_api( - *_param, - _request_timeout=_request_timeout - ) + response_data = self.api_client.call_api(*_param, _request_timeout=_request_timeout) return response_data.response - - def _extract_from_file_post_serialize( + def _extract_serialize( self, - extraction_request, + type, + name, + file, + kwargs, _request_auth, _content_type, _headers, @@ -250,55 +268,48 @@ def _extract_from_file_post_serialize( _host = None _collection_formats: Dict[str, str] = { + "kwargs": "csv", } _path_params: Dict[str, str] = {} _query_params: List[Tuple[str, str]] = [] 
_header_params: Dict[str, Optional[str]] = _headers or {} _form_params: List[Tuple[str, str]] = [] - _files: Dict[ - str, Union[str, bytes, List[str], List[bytes], List[Tuple[str, bytes]]] - ] = {} + _files: Dict[str, Union[str, bytes, List[str], List[bytes], List[Tuple[str, bytes]]]] = {} _body_params: Optional[bytes] = None # process the path parameters # process the query parameters # process the header parameters # process the form parameters + if file is not None: + _files["file"] = file + if type is not None: + _form_params.append(("type", type)) + if kwargs is not None: + _form_params.append(("kwargs", kwargs)) + if name is not None: + _form_params.append(("name", name)) # process the body parameter - if extraction_request is not None: - _body_params = extraction_request - # set the HTTP header `Accept` - if 'Accept' not in _header_params: - _header_params['Accept'] = self.api_client.select_header_accept( - [ - 'application/json' - ] - ) + if "Accept" not in _header_params: + _header_params["Accept"] = self.api_client.select_header_accept(["application/json"]) # set the HTTP header `Content-Type` if _content_type: - _header_params['Content-Type'] = _content_type + _header_params["Content-Type"] = _content_type else: - _default_content_type = ( - self.api_client.select_header_content_type( - [ - 'application/json' - ] - ) - ) + _default_content_type = self.api_client.select_header_content_type(["multipart/form-data"]) if _default_content_type is not None: - _header_params['Content-Type'] = _default_content_type + _header_params["Content-Type"] = _default_content_type # authentication setting - _auth_settings: List[str] = [ - ] + _auth_settings: List[str] = [] return self.api_client.param_serialize( - method='POST', - resource_path='/extract', + method="POST", + resource_path="/extract", path_params=_path_params, query_params=_query_params, header_params=_header_params, @@ -308,7 +319,5 @@ def _extract_from_file_post_serialize( auth_settings=_auth_settings, 
collection_formats=_collection_formats, _host=_host, - _request_auth=_request_auth + _request_auth=_request_auth, ) - - diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_client.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_client.py index befdba6..ba8f5d2 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_client.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_client.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - extractor-api-lib +extractor-api-lib - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. """ # noqa: E501 @@ -37,11 +37,12 @@ UnauthorizedException, ForbiddenException, NotFoundException, - ServiceException + ServiceException, ) RequestSerialized = Tuple[str, str, Dict[str, str], Optional[str], List[str]] + class ApiClient: """Generic API client for OpenAPI client library builds. @@ -60,25 +61,19 @@ class ApiClient: PRIMITIVE_TYPES = (float, bool, bytes, str, int) NATIVE_TYPES_MAPPING = { - 'int': int, - 'long': int, # TODO remove as only py3 is supported? - 'float': float, - 'str': str, - 'bool': bool, - 'date': datetime.date, - 'datetime': datetime.datetime, - 'decimal': decimal.Decimal, - 'object': object, + "int": int, + "long": int, # TODO remove as only py3 is supported? 
+ "float": float, + "str": str, + "bool": bool, + "date": datetime.date, + "datetime": datetime.datetime, + "decimal": decimal.Decimal, + "object": object, } _pool = None - def __init__( - self, - configuration=None, - header_name=None, - header_value=None, - cookie=None - ) -> None: + def __init__(self, configuration=None, header_name=None, header_value=None, cookie=None) -> None: # use default configuration if none is provided if configuration is None: configuration = Configuration.get_default() @@ -90,7 +85,7 @@ def __init__( self.default_headers[header_name] = header_value self.cookie = cookie # Set default User-Agent. - self.user_agent = 'OpenAPI-Generator/1.0.0/python' + self.user_agent = "OpenAPI-Generator/1.0.0/python" self.client_side_validation = configuration.client_side_validation def __enter__(self): @@ -102,16 +97,15 @@ def __exit__(self, exc_type, exc_value, traceback): @property def user_agent(self): """User agent for this API client""" - return self.default_headers['User-Agent'] + return self.default_headers["User-Agent"] @user_agent.setter def user_agent(self, value): - self.default_headers['User-Agent'] = value + self.default_headers["User-Agent"] = value def set_default_header(self, header_name, header_value): self.default_headers[header_name] = header_value - _default = None @classmethod @@ -147,12 +141,12 @@ def param_serialize( header_params=None, body=None, post_params=None, - files=None, auth_settings=None, + files=None, + auth_settings=None, collection_formats=None, _host=None, - _request_auth=None + _request_auth=None, ) -> RequestSerialized: - """Builds the HTTP request params needed by the request. :param method: Method to call. :param resource_path: Path to method endpoint. 
@@ -181,47 +175,30 @@ def param_serialize( header_params = header_params or {} header_params.update(self.default_headers) if self.cookie: - header_params['Cookie'] = self.cookie + header_params["Cookie"] = self.cookie if header_params: header_params = self.sanitize_for_serialization(header_params) - header_params = dict( - self.parameters_to_tuples(header_params,collection_formats) - ) + header_params = dict(self.parameters_to_tuples(header_params, collection_formats)) # path parameters if path_params: path_params = self.sanitize_for_serialization(path_params) - path_params = self.parameters_to_tuples( - path_params, - collection_formats - ) + path_params = self.parameters_to_tuples(path_params, collection_formats) for k, v in path_params: # specified safe chars, encode everything - resource_path = resource_path.replace( - '{%s}' % k, - quote(str(v), safe=config.safe_chars_for_path_param) - ) + resource_path = resource_path.replace("{%s}" % k, quote(str(v), safe=config.safe_chars_for_path_param)) # post parameters if post_params or files: post_params = post_params if post_params else [] post_params = self.sanitize_for_serialization(post_params) - post_params = self.parameters_to_tuples( - post_params, - collection_formats - ) + post_params = self.parameters_to_tuples(post_params, collection_formats) if files: post_params.extend(self.files_parameters(files)) # auth setting self.update_params_for_auth( - header_params, - query_params, - auth_settings, - resource_path, - method, - body, - request_auth=_request_auth + header_params, query_params, auth_settings, resource_path, method, body, request_auth=_request_auth ) # body @@ -238,23 +215,13 @@ def param_serialize( # query parameters if query_params: query_params = self.sanitize_for_serialization(query_params) - url_query = self.parameters_to_url_query( - query_params, - collection_formats - ) + url_query = self.parameters_to_url_query(query_params, collection_formats) url += "?" 
+ url_query return method, url, header_params, body, post_params - def call_api( - self, - method, - url, - header_params=None, - body=None, - post_params=None, - _request_timeout=None + self, method, url, header_params=None, body=None, post_params=None, _request_timeout=None ) -> rest.RESTResponse: """Makes the HTTP request (synchronous) :param method: Method to call. @@ -271,10 +238,12 @@ def call_api( try: # perform request and return response response_data = self.rest_client.request( - method, url, + method, + url, headers=header_params, - body=body, post_params=post_params, - _request_timeout=_request_timeout + body=body, + post_params=post_params, + _request_timeout=_request_timeout, ) except ApiException as e: @@ -283,9 +252,7 @@ def call_api( return response_data def response_deserialize( - self, - response_data: rest.RESTResponse, - response_types_map: Optional[Dict[str, ApiResponseT]]=None + self, response_data: rest.RESTResponse, response_types_map: Optional[Dict[str, ApiResponseT]] = None ) -> ApiResponse[ApiResponseT]: """Deserializes response into an object. :param response_data: RESTResponse object to be deserialized. 
@@ -311,7 +278,7 @@ def response_deserialize( return_data = self.__deserialize_file(response_data) elif response_type is not None: match = None - content_type = response_data.getheader('content-type') + content_type = response_data.getheader("content-type") if content_type is not None: match = re.search(r"charset=([a-zA-Z\-\d]+)[\s;]?", content_type) encoding = match.group(1) if match else "utf-8" @@ -326,10 +293,10 @@ def response_deserialize( ) return ApiResponse( - status_code = response_data.status, - data = return_data, - headers = response_data.getheaders(), - raw_data = response_data.data + status_code=response_data.status, + data=return_data, + headers=response_data.getheaders(), + raw_data=response_data.data, ) def sanitize_for_serialization(self, obj): @@ -357,13 +324,9 @@ def sanitize_for_serialization(self, obj): elif isinstance(obj, self.PRIMITIVE_TYPES): return obj elif isinstance(obj, list): - return [ - self.sanitize_for_serialization(sub_obj) for sub_obj in obj - ] + return [self.sanitize_for_serialization(sub_obj) for sub_obj in obj] elif isinstance(obj, tuple): - return tuple( - self.sanitize_for_serialization(sub_obj) for sub_obj in obj - ) + return tuple(self.sanitize_for_serialization(sub_obj) for sub_obj in obj) elif isinstance(obj, (datetime.datetime, datetime.date)): return obj.isoformat() elif isinstance(obj, decimal.Decimal): @@ -377,15 +340,12 @@ def sanitize_for_serialization(self, obj): # and attributes which value is not None. # Convert attribute name to json key in # model definition for request. 
- if hasattr(obj, 'to_dict') and callable(getattr(obj, 'to_dict')): + if hasattr(obj, "to_dict") and callable(getattr(obj, "to_dict")): obj_dict = obj.to_dict() else: obj_dict = obj.__dict__ - return { - key: self.sanitize_for_serialization(val) - for key, val in obj_dict.items() - } + return {key: self.sanitize_for_serialization(val) for key, val in obj_dict.items()} def deserialize(self, response_text: str, response_type: str, content_type: Optional[str]): """Deserializes response into an object. @@ -404,18 +364,15 @@ def deserialize(self, response_text: str, response_type: str, content_type: Opti data = json.loads(response_text) except ValueError: data = response_text - elif re.match(r'^application/(json|[\w!#$&.+-^_]+\+json)\s*(;|$)', content_type, re.IGNORECASE): + elif re.match(r"^application/(json|[\w!#$&.+-^_]+\+json)\s*(;|$)", content_type, re.IGNORECASE): if response_text == "": data = "" else: data = json.loads(response_text) - elif re.match(r'^text\/[a-z.+-]+\s*(;|$)', content_type, re.IGNORECASE): + elif re.match(r"^text\/[a-z.+-]+\s*(;|$)", content_type, re.IGNORECASE): data = response_text else: - raise ApiException( - status=0, - reason="Unsupported content type: {0}".format(content_type) - ) + raise ApiException(status=0, reason="Unsupported content type: {0}".format(content_type)) return self.__deserialize(data, response_type) @@ -431,19 +388,17 @@ def __deserialize(self, data, klass): return None if isinstance(klass, str): - if klass.startswith('List['): - m = re.match(r'List\[(.*)]', klass) + if klass.startswith("List["): + m = re.match(r"List\[(.*)]", klass) assert m is not None, "Malformed List type definition" sub_kls = m.group(1) - return [self.__deserialize(sub_data, sub_kls) - for sub_data in data] + return [self.__deserialize(sub_data, sub_kls) for sub_data in data] - if klass.startswith('Dict['): - m = re.match(r'Dict\[([^,]*), (.*)]', klass) + if klass.startswith("Dict["): + m = re.match(r"Dict\[([^,]*), (.*)]", klass) assert m is not 
None, "Malformed Dict type definition" sub_kls = m.group(2) - return {k: self.__deserialize(v, sub_kls) - for k, v in data.items()} + return {k: self.__deserialize(v, sub_kls) for k, v in data.items()} # convert str to class if klass in self.NATIVE_TYPES_MAPPING: @@ -479,19 +434,18 @@ def parameters_to_tuples(self, params, collection_formats): for k, v in params.items() if isinstance(params, dict) else params: if k in collection_formats: collection_format = collection_formats[k] - if collection_format == 'multi': + if collection_format == "multi": new_params.extend((k, value) for value in v) else: - if collection_format == 'ssv': - delimiter = ' ' - elif collection_format == 'tsv': - delimiter = '\t' - elif collection_format == 'pipes': - delimiter = '|' + if collection_format == "ssv": + delimiter = " " + elif collection_format == "tsv": + delimiter = "\t" + elif collection_format == "pipes": + delimiter = "|" else: # csv is the default - delimiter = ',' - new_params.append( - (k, delimiter.join(str(value) for value in v))) + delimiter = "," + new_params.append((k, delimiter.join(str(value) for value in v))) else: new_params.append((k, v)) return new_params @@ -516,20 +470,18 @@ def parameters_to_url_query(self, params, collection_formats): if k in collection_formats: collection_format = collection_formats[k] - if collection_format == 'multi': + if collection_format == "multi": new_params.extend((k, str(value)) for value in v) else: - if collection_format == 'ssv': - delimiter = ' ' - elif collection_format == 'tsv': - delimiter = '\t' - elif collection_format == 'pipes': - delimiter = '|' + if collection_format == "ssv": + delimiter = " " + elif collection_format == "tsv": + delimiter = "\t" + elif collection_format == "pipes": + delimiter = "|" else: # csv is the default - delimiter = ',' - new_params.append( - (k, delimiter.join(quote(str(value)) for value in v)) - ) + delimiter = "," + new_params.append((k, delimiter.join(quote(str(value)) for value in v))) 
else: new_params.append((k, quote(str(v)))) @@ -547,7 +499,7 @@ def files_parameters( params = [] for k, v in files.items(): if isinstance(v, str): - with open(v, 'rb') as f: + with open(v, "rb") as f: filename = os.path.basename(f.name) filedata = f.read() elif isinstance(v, bytes): @@ -561,13 +513,8 @@ def files_parameters( continue else: raise ValueError("Unsupported file value") - mimetype = ( - mimetypes.guess_type(filename)[0] - or 'application/octet-stream' - ) - params.append( - tuple([k, tuple([filename, filedata, mimetype])]) - ) + mimetype = mimetypes.guess_type(filename)[0] or "application/octet-stream" + params.append(tuple([k, tuple([filename, filedata, mimetype])])) return params def select_header_accept(self, accepts: List[str]) -> Optional[str]: @@ -580,7 +527,7 @@ def select_header_accept(self, accepts: List[str]) -> Optional[str]: return None for accept in accepts: - if re.search('json', accept, re.IGNORECASE): + if re.search("json", accept, re.IGNORECASE): return accept return accepts[0] @@ -595,20 +542,13 @@ def select_header_content_type(self, content_types): return None for content_type in content_types: - if re.search('json', content_type, re.IGNORECASE): + if re.search("json", content_type, re.IGNORECASE): return content_type return content_types[0] def update_params_for_auth( - self, - headers, - queries, - auth_settings, - resource_path, - method, - body, - request_auth=None + self, headers, queries, auth_settings, resource_path, method, body, request_auth=None ) -> None: """Updates header and query params based on authentication setting. 
@@ -626,36 +566,14 @@ def update_params_for_auth( return if request_auth: - self._apply_auth_params( - headers, - queries, - resource_path, - method, - body, - request_auth - ) + self._apply_auth_params(headers, queries, resource_path, method, body, request_auth) else: for auth in auth_settings: auth_setting = self.configuration.auth_settings().get(auth) if auth_setting: - self._apply_auth_params( - headers, - queries, - resource_path, - method, - body, - auth_setting - ) - - def _apply_auth_params( - self, - headers, - queries, - resource_path, - method, - body, - auth_setting - ) -> None: + self._apply_auth_params(headers, queries, resource_path, method, body, auth_setting) + + def _apply_auth_params(self, headers, queries, resource_path, method, body, auth_setting) -> None: """Updates the request parameters based on a single auth_setting :param headers: Header parameters dict to be updated. @@ -666,17 +584,15 @@ def _apply_auth_params( The object type is the return value of sanitize_for_serialization(). 
:param auth_setting: auth settings for the endpoint """ - if auth_setting['in'] == 'cookie': - headers['Cookie'] = auth_setting['value'] - elif auth_setting['in'] == 'header': - if auth_setting['type'] != 'http-signature': - headers[auth_setting['key']] = auth_setting['value'] - elif auth_setting['in'] == 'query': - queries.append((auth_setting['key'], auth_setting['value'])) + if auth_setting["in"] == "cookie": + headers["Cookie"] = auth_setting["value"] + elif auth_setting["in"] == "header": + if auth_setting["type"] != "http-signature": + headers[auth_setting["key"]] = auth_setting["value"] + elif auth_setting["in"] == "query": + queries.append((auth_setting["key"], auth_setting["value"])) else: - raise ApiValueError( - 'Authentication token must be in `query` or `header`' - ) + raise ApiValueError("Authentication token must be in `query` or `header`") def __deserialize_file(self, response): """Deserializes body to file @@ -696,10 +612,7 @@ def __deserialize_file(self, response): content_disposition = response.getheader("Content-Disposition") if content_disposition: - m = re.search( - r'filename=[\'"]?([^\'"\s]+)[\'"]?', - content_disposition - ) + m = re.search(r'filename=[\'"]?([^\'"\s]+)[\'"]?', content_disposition) assert m is not None, "Unexpected 'content-disposition' header value" filename = m.group(1) path = os.path.join(os.path.dirname(path), filename) @@ -742,10 +655,7 @@ def __deserialize_date(self, string): except ImportError: return string except ValueError: - raise rest.ApiException( - status=0, - reason="Failed to parse `{0}` as date object".format(string) - ) + raise rest.ApiException(status=0, reason="Failed to parse `{0}` as date object".format(string)) def __deserialize_datetime(self, string): """Deserializes string to datetime. 
@@ -760,13 +670,7 @@ def __deserialize_datetime(self, string): except ImportError: return string except ValueError: - raise rest.ApiException( - status=0, - reason=( - "Failed to parse `{0}` as datetime object" - .format(string) - ) - ) + raise rest.ApiException(status=0, reason=("Failed to parse `{0}` as datetime object".format(string))) def __deserialize_enum(self, data, klass): """Deserializes primitive type to enum. @@ -778,13 +682,7 @@ def __deserialize_enum(self, data, klass): try: return klass(data) except ValueError: - raise rest.ApiException( - status=0, - reason=( - "Failed to parse `{0}` as `{1}`" - .format(data, klass) - ) - ) + raise rest.ApiException(status=0, reason=("Failed to parse `{0}` as `{1}`".format(data, klass))) def __deserialize_model(self, data, klass): """Deserializes list or dict to model. diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_response.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_response.py index 9bc7c11..1ce1372 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_response.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_response.py @@ -6,6 +6,7 @@ T = TypeVar("T") + class ApiResponse(BaseModel, Generic[T]): """ API response object @@ -16,6 +17,4 @@ class ApiResponse(BaseModel, Generic[T]): data: T = Field(description="Deserialized data given the data type") raw_data: StrictBytes = Field(description="Raw data (HTTP response body)") - model_config = { - "arbitrary_types_allowed": True - } + model_config = {"arbitrary_types_allowed": True} diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/configuration.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/configuration.py index 0b76ea2..2e80369 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/configuration.py +++ 
b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/configuration.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - extractor-api-lib +extractor-api-lib - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. """ # noqa: E501 @@ -23,11 +23,19 @@ import http.client as httplib JSON_SCHEMA_VALIDATION_KEYWORDS = { - 'multipleOf', 'maximum', 'exclusiveMaximum', - 'minimum', 'exclusiveMinimum', 'maxLength', - 'minLength', 'pattern', 'maxItems', 'minItems' + "multipleOf", + "maximum", + "exclusiveMaximum", + "minimum", + "exclusiveMinimum", + "maxLength", + "minLength", + "pattern", + "maxItems", + "minItems", } + class Configuration: """This class contains various settings of the API client. 
@@ -63,20 +71,25 @@ class Configuration: _default = None - def __init__(self, host=None, - api_key=None, api_key_prefix=None, - username=None, password=None, - access_token=None, - server_index=None, server_variables=None, - server_operation_index=None, server_operation_variables=None, - ignore_operation_servers=False, - ssl_ca_cert=None, - retries=None, - *, - debug: Optional[bool] = None - ) -> None: - """Constructor - """ + def __init__( + self, + host=None, + api_key=None, + api_key_prefix=None, + username=None, + password=None, + access_token=None, + server_index=None, + server_variables=None, + server_operation_index=None, + server_operation_variables=None, + ignore_operation_servers=False, + ssl_ca_cert=None, + retries=None, + *, + debug: Optional[bool] = None + ) -> None: + """Constructor""" self._base_path = "http://localhost" if host is None else host """Default Base url """ @@ -122,7 +135,7 @@ def __init__(self, host=None, """ self.logger["package_logger"] = logging.getLogger("admin_api_lib.extractor_api_client.openapi_client") self.logger["urllib3_logger"] = logging.getLogger("urllib3") - self.logger_format = '%(asctime)s %(levelname)s %(message)s' + self.logger_format = "%(asctime)s %(levelname)s %(message)s" """Log format """ self.logger_stream_handler = None @@ -177,7 +190,7 @@ def __init__(self, host=None, self.proxy_headers = None """Proxy headers """ - self.safe_chars_for_path_param = '' + self.safe_chars_for_path_param = "" """Safe chars for path_param """ self.retries = retries @@ -203,7 +216,7 @@ def __deepcopy__(self, memo): result = cls.__new__(cls) memo[id(self)] = result for k, v in self.__dict__.items(): - if k not in ('logger', 'logger_file_handler'): + if k not in ("logger", "logger_file_handler"): setattr(result, k, copy.deepcopy(v, memo)) # shallow copy of loggers result.logger = copy.copy(self.logger) @@ -363,9 +376,7 @@ def get_basic_auth_token(self): password = "" if self.password is not None: password = self.password - return 
urllib3.util.make_headers( - basic_auth=username + ':' + password - ).get('authorization') + return urllib3.util.make_headers(basic_auth=username + ":" + password).get("authorization") def auth_settings(self): """Gets Auth Settings dict for api client. @@ -380,12 +391,13 @@ def to_debug_report(self): :return: The report for debugging. """ - return "Python SDK Debug Report:\n"\ - "OS: {env}\n"\ - "Python Version: {pyversion}\n"\ - "Version of the API: 1.0.0\n"\ - "SDK Package Version: 1.0.0".\ - format(env=sys.platform, pyversion=sys.version) + return ( + "Python SDK Debug Report:\n" + "OS: {env}\n" + "Python Version: {pyversion}\n" + "Version of the API: 1.0.0\n" + "SDK Package Version: 1.0.0".format(env=sys.platform, pyversion=sys.version) + ) def get_host_settings(self): """Gets an array of host settings @@ -394,8 +406,8 @@ def get_host_settings(self): """ return [ { - 'url': "", - 'description': "No description provided", + "url": "", + "description": "No description provided", } ] @@ -417,22 +429,20 @@ def get_host_from_settings(self, index, variables=None, servers=None): except IndexError: raise ValueError( "Invalid index {0} when selecting the host settings. " - "Must be less than {1}".format(index, len(servers))) + "Must be less than {1}".format(index, len(servers)) + ) - url = server['url'] + url = server["url"] # go through variables and replace placeholders - for variable_name, variable in server.get('variables', {}).items(): - used_value = variables.get( - variable_name, variable['default_value']) + for variable_name, variable in server.get("variables", {}).items(): + used_value = variables.get(variable_name, variable["default_value"]) - if 'enum_values' in variable \ - and used_value not in variable['enum_values']: + if "enum_values" in variable and used_value not in variable["enum_values"]: raise ValueError( "The variable `{0}` in the host URL has invalid value " - "{1}. 
Must be {2}.".format( - variable_name, variables[variable_name], - variable['enum_values'])) + "{1}. Must be {2}.".format(variable_name, variables[variable_name], variable["enum_values"]) + ) url = url.replace("{" + variable_name + "}", used_value) diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/exceptions.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/exceptions.py index a5adf00..5dbd4b0 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/exceptions.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/exceptions.py @@ -1,27 +1,27 @@ # coding: utf-8 """ - extractor-api-lib +extractor-api-lib - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. 
""" # noqa: E501 from typing import Any, Optional from typing_extensions import Self + class OpenApiException(Exception): """The base exception class for all OpenAPIExceptions""" class ApiTypeError(OpenApiException, TypeError): - def __init__(self, msg, path_to_item=None, valid_classes=None, - key_type=None) -> None: - """ Raises an exception for TypeErrors + def __init__(self, msg, path_to_item=None, valid_classes=None, key_type=None) -> None: + """Raises an exception for TypeErrors Args: msg (str): the exception message @@ -104,9 +104,9 @@ def __init__(self, msg, path_to_item=None) -> None: class ApiException(OpenApiException): def __init__( - self, - status=None, - reason=None, + self, + status=None, + reason=None, http_resp=None, *, body: Optional[str] = None, @@ -125,17 +125,17 @@ def __init__( self.reason = http_resp.reason if self.body is None: try: - self.body = http_resp.data.decode('utf-8') + self.body = http_resp.data.decode("utf-8") except Exception: pass self.headers = http_resp.getheaders() @classmethod def from_response( - cls, - *, - http_resp, - body: Optional[str], + cls, + *, + http_resp, + body: Optional[str], data: Optional[Any], ) -> Self: if http_resp.status == 400: @@ -156,11 +156,9 @@ def from_response( def __str__(self): """Custom error messages for exception""" - error_message = "({0})\n"\ - "Reason: {1}\n".format(self.status, self.reason) + error_message = "({0})\n" "Reason: {1}\n".format(self.status, self.reason) if self.headers: - error_message += "HTTP response headers: {0}\n".format( - self.headers) + error_message += "HTTP response headers: {0}\n".format(self.headers) if self.data or self.body: error_message += "HTTP response body: {0}\n".format(self.data or self.body) diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/__init__.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/__init__.py index 022896f..e0ef19f 100644 --- 
a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/__init__.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/__init__.py @@ -2,19 +2,18 @@ # flake8: noqa """ - extractor-api-lib +extractor-api-lib - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. """ # noqa: E501 # import models into model package from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ContentType -from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ExtractionRequest from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import InformationPiece from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/content_type.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/content_type.py index b797b12..cd0f9c7 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/content_type.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/content_type.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - extractor-api-lib +extractor-api-lib - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI 
Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. """ # noqa: E501 @@ -19,20 +19,16 @@ class ContentType(str, Enum): - """ - - """ + """ """ """ allowed enum values """ - IMAGE = 'IMAGE' - TABLE = 'TABLE' - TEXT = 'TEXT' + IMAGE = "IMAGE" + TABLE = "TABLE" + TEXT = "TEXT" @classmethod def from_json(cls, json_str: str) -> Self: """Create an instance of ContentType from a JSON string""" return cls(json.loads(json_str)) - - diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py index db65003..4f9f9af 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - extractor-api-lib +extractor-api-lib - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. 
""" # noqa: E501 @@ -23,10 +23,10 @@ from typing import Optional, Set from typing_extensions import Self + class ExtractionRequest(BaseModel): - """ - - """ # noqa: E501 + """ """ # noqa: E501 + file: Optional[Union[StrictBytes, StrictStr, Tuple[StrictStr, StrictBytes]]] = None type: StrictStr kwargs: Optional[List[KeyValuePair]] = None @@ -38,7 +38,6 @@ class ExtractionRequest(BaseModel): protected_namespaces=(), ) - def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) @@ -63,8 +62,7 @@ def to_dict(self) -> Dict[str, Any]: were set at model initialization. Other fields with value `None` are ignored. """ - excluded_fields: Set[str] = set([ - ]) + excluded_fields: Set[str] = set([]) _dict = self.model_dump( by_alias=True, @@ -77,7 +75,7 @@ def to_dict(self) -> Dict[str, Any]: for _item_kwargs in self.kwargs: if _item_kwargs: _items.append(_item_kwargs.to_dict()) - _dict['kwargs'] = _items + _dict["kwargs"] = _items return _dict @classmethod @@ -89,11 +87,15 @@ def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({ - "file": obj.get("file"), - "type": obj.get("type"), - "kwargs": [KeyValuePair.from_dict(_item) for _item in obj["kwargs"]] if obj.get("kwargs") is not None else None - }) + _obj = cls.model_validate( + { + "file": obj.get("file"), + "type": obj.get("type"), + "kwargs": ( + [KeyValuePair.from_dict(_item) for _item in obj["kwargs"]] + if obj.get("kwargs") is not None + else None + ), + } + ) return _obj - - diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/information_piece.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/information_piece.py index 95a0fdb..a428183 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/information_piece.py +++ 
b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/information_piece.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - extractor-api-lib +extractor-api-lib - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. """ # noqa: E501 @@ -24,10 +24,12 @@ from typing import Optional, Set from typing_extensions import Self + class InformationPiece(BaseModel): """ A piece of information that has been extracted. - """ # noqa: E501 + """ # noqa: E501 + metadata: List[KeyValuePair] page_content: StrictStr type: ContentType @@ -39,7 +41,6 @@ class InformationPiece(BaseModel): protected_namespaces=(), ) - def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) @@ -64,8 +65,7 @@ def to_dict(self) -> Dict[str, Any]: were set at model initialization. Other fields with value `None` are ignored. 
""" - excluded_fields: Set[str] = set([ - ]) + excluded_fields: Set[str] = set([]) _dict = self.model_dump( by_alias=True, @@ -78,7 +78,7 @@ def to_dict(self) -> Dict[str, Any]: for _item_metadata in self.metadata: if _item_metadata: _items.append(_item_metadata.to_dict()) - _dict['metadata'] = _items + _dict["metadata"] = _items return _dict @classmethod @@ -90,11 +90,15 @@ def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({ - "metadata": [KeyValuePair.from_dict(_item) for _item in obj["metadata"]] if obj.get("metadata") is not None else None, - "page_content": obj.get("page_content"), - "type": obj.get("type") - }) + _obj = cls.model_validate( + { + "metadata": ( + [KeyValuePair.from_dict(_item) for _item in obj["metadata"]] + if obj.get("metadata") is not None + else None + ), + "page_content": obj.get("page_content"), + "type": obj.get("type"), + } + ) return _obj - - diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/key_value_pair.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/key_value_pair.py index 553288b..2a77b65 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/key_value_pair.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/key_value_pair.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - extractor-api-lib +extractor-api-lib - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. 
+Do not edit the class manually. """ # noqa: E501 @@ -22,10 +22,10 @@ from typing import Optional, Set from typing_extensions import Self + class KeyValuePair(BaseModel): - """ - - """ # noqa: E501 + """ """ # noqa: E501 + key: Optional[Any] = None value: Optional[Any] = None __properties: ClassVar[List[str]] = ["key", "value"] @@ -36,7 +36,6 @@ class KeyValuePair(BaseModel): protected_namespaces=(), ) - def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) @@ -61,8 +60,7 @@ def to_dict(self) -> Dict[str, Any]: were set at model initialization. Other fields with value `None` are ignored. """ - excluded_fields: Set[str] = set([ - ]) + excluded_fields: Set[str] = set([]) _dict = self.model_dump( by_alias=True, @@ -72,12 +70,12 @@ def to_dict(self) -> Dict[str, Any]: # set to None if key (nullable) is None # and model_fields_set contains the field if self.key is None and "key" in self.model_fields_set: - _dict['key'] = None + _dict["key"] = None # set to None if value (nullable) is None # and model_fields_set contains the field if self.value is None and "value" in self.model_fields_set: - _dict['value'] = None + _dict["value"] = None return _dict @@ -90,10 +88,5 @@ def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({ - "key": obj.get("key"), - "value": obj.get("value") - }) + _obj = cls.model_validate({"key": obj.get("key"), "value": obj.get("value")}) return _obj - - diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/rest.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/rest.py index 32b1c3a..60fc660 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/rest.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/rest.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - 
extractor-api-lib +extractor-api-lib - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. """ # noqa: E501 @@ -78,22 +78,19 @@ def __init__(self, configuration) -> None: "key_file": configuration.key_file, } if configuration.assert_hostname is not None: - pool_args['assert_hostname'] = ( - configuration.assert_hostname - ) + pool_args["assert_hostname"] = configuration.assert_hostname if configuration.retries is not None: - pool_args['retries'] = configuration.retries + pool_args["retries"] = configuration.retries if configuration.tls_server_name: - pool_args['server_hostname'] = configuration.tls_server_name - + pool_args["server_hostname"] = configuration.tls_server_name if configuration.socket_options is not None: - pool_args['socket_options'] = configuration.socket_options + pool_args["socket_options"] = configuration.socket_options if configuration.connection_pool_maxsize is not None: - pool_args['maxsize'] = configuration.connection_pool_maxsize + pool_args["maxsize"] = configuration.connection_pool_maxsize # https pool manager self.pool_manager: urllib3.PoolManager @@ -101,6 +98,7 @@ def __init__(self, configuration) -> None: if configuration.proxy: if is_socks_proxy_url(configuration.proxy): from urllib3.contrib.socks import SOCKSProxyManager + pool_args["proxy_url"] = configuration.proxy pool_args["headers"] = configuration.proxy_headers self.pool_manager = SOCKSProxyManager(**pool_args) @@ -111,15 +109,7 @@ def __init__(self, configuration) -> None: else: self.pool_manager = urllib3.PoolManager(**pool_args) - def 
request( - self, - method, - url, - headers=None, - body=None, - post_params=None, - _request_timeout=None - ): + def request(self, method, url, headers=None, body=None, post_params=None, _request_timeout=None): """Perform requests. :param method: http request method @@ -135,20 +125,10 @@ def request( (connection, read) timeouts. """ method = method.upper() - assert method in [ - 'GET', - 'HEAD', - 'DELETE', - 'POST', - 'PUT', - 'PATCH', - 'OPTIONS' - ] + assert method in ["GET", "HEAD", "DELETE", "POST", "PUT", "PATCH", "OPTIONS"] if post_params and body: - raise ApiValueError( - "body parameter cannot be used with post_params parameter." - ) + raise ApiValueError("body parameter cannot be used with post_params parameter.") post_params = post_params or {} headers = headers or {} @@ -157,37 +137,23 @@ def request( if _request_timeout: if isinstance(_request_timeout, (int, float)): timeout = urllib3.Timeout(total=_request_timeout) - elif ( - isinstance(_request_timeout, tuple) - and len(_request_timeout) == 2 - ): - timeout = urllib3.Timeout( - connect=_request_timeout[0], - read=_request_timeout[1] - ) + elif isinstance(_request_timeout, tuple) and len(_request_timeout) == 2: + timeout = urllib3.Timeout(connect=_request_timeout[0], read=_request_timeout[1]) try: # For `POST`, `PUT`, `PATCH`, `OPTIONS`, `DELETE` - if method in ['POST', 'PUT', 'PATCH', 'OPTIONS', 'DELETE']: + if method in ["POST", "PUT", "PATCH", "OPTIONS", "DELETE"]: # no content type provided or payload is json - content_type = headers.get('Content-Type') - if ( - not content_type - or re.search('json', content_type, re.IGNORECASE) - ): + content_type = headers.get("Content-Type") + if not content_type or re.search("json", content_type, re.IGNORECASE): request_body = None if body is not None: request_body = json.dumps(body) r = self.pool_manager.request( - method, - url, - body=request_body, - timeout=timeout, - headers=headers, - preload_content=False + method, url, body=request_body, 
timeout=timeout, headers=headers, preload_content=False ) - elif content_type == 'application/x-www-form-urlencoded': + elif content_type == "application/x-www-form-urlencoded": r = self.pool_manager.request( method, url, @@ -195,15 +161,15 @@ def request( encode_multipart=False, timeout=timeout, headers=headers, - preload_content=False + preload_content=False, ) - elif content_type == 'multipart/form-data': + elif content_type == "multipart/form-data": # must del headers['Content-Type'], or the correct # Content-Type which generated by urllib3 will be # overwritten. - del headers['Content-Type'] + del headers["Content-Type"] # Ensures that dict objects are serialized - post_params = [(a, json.dumps(b)) if isinstance(b, dict) else (a,b) for a, b in post_params] + post_params = [(a, json.dumps(b)) if isinstance(b, dict) else (a, b) for a, b in post_params] r = self.pool_manager.request( method, url, @@ -211,29 +177,20 @@ def request( encode_multipart=True, timeout=timeout, headers=headers, - preload_content=False + preload_content=False, ) # Pass a `string` parameter directly in the body to support # other content types than JSON when `body` argument is # provided in serialized form. 
elif isinstance(body, str) or isinstance(body, bytes): r = self.pool_manager.request( - method, - url, - body=body, - timeout=timeout, - headers=headers, - preload_content=False + method, url, body=body, timeout=timeout, headers=headers, preload_content=False ) - elif headers['Content-Type'].startswith('text/') and isinstance(body, bool): + elif headers["Content-Type"].startswith("text/") and isinstance(body, bool): request_body = "true" if body else "false" r = self.pool_manager.request( - method, - url, - body=request_body, - preload_content=False, - timeout=timeout, - headers=headers) + method, url, body=request_body, preload_content=False, timeout=timeout, headers=headers + ) else: # Cannot generate the request from given parameters msg = """Cannot prepare a request message for provided @@ -243,12 +200,7 @@ def request( # For `GET`, `HEAD` else: r = self.pool_manager.request( - method, - url, - fields={}, - timeout=timeout, - headers=headers, - preload_content=False + method, url, fields={}, timeout=timeout, headers=headers, preload_content=False ) except urllib3.exceptions.SSLError as e: msg = "\n".join([type(e).__name__, str(e)]) diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_content_type.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_content_type.py index 9704fc8..5a78d9b 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_content_type.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_content_type.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - extractor-api-lib +extractor-api-lib - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The 
version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. """ # noqa: E501 @@ -16,6 +16,7 @@ from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ContentType + class TestContentType(unittest.TestCase): """ContentType unit test stubs""" @@ -29,5 +30,6 @@ def testContentType(self): """Test ContentType""" # inst = ContentType() -if __name__ == '__main__': + +if __name__ == "__main__": unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_request.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_request.py index fd48e16..2f8f1bd 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_request.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_request.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - extractor-api-lib +extractor-api-lib - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. 
""" # noqa: E501 @@ -16,6 +16,7 @@ from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ExtractionRequest + class TestExtractionRequest(unittest.TestCase): """ExtractionRequest unit test stubs""" @@ -27,9 +28,9 @@ def tearDown(self): def make_instance(self, include_optional) -> ExtractionRequest: """Test ExtractionRequest - include_optional is a boolean, when False only required - params are included, when True both required and - optional params are included """ + include_optional is a boolean, when False only required + params are included, when True both required and + optional params are included""" # uncomment below to create an instance of `ExtractionRequest` """ model = ExtractionRequest() @@ -52,5 +53,6 @@ def testExtractionRequest(self): # inst_req_only = self.make_instance(include_optional=False) # inst_req_and_optional = self.make_instance(include_optional=True) -if __name__ == '__main__': + +if __name__ == "__main__": unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extractor_api.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extractor_api.py index e76b68d..f39a507 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extractor_api.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extractor_api.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - extractor-api-lib +extractor-api-lib - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. 
+Do not edit the class manually. """ # noqa: E501 @@ -27,11 +27,9 @@ def tearDown(self) -> None: pass def test_extract_from_file_post(self) -> None: - """Test case for extract_from_file_post - - """ + """Test case for extract_from_file_post""" pass -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_information_piece.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_information_piece.py index 0661af0..479c858 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_information_piece.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_information_piece.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - extractor-api-lib +extractor-api-lib - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. 
""" # noqa: E501 @@ -16,6 +16,7 @@ from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import InformationPiece + class TestInformationPiece(unittest.TestCase): """InformationPiece unit test stubs""" @@ -27,9 +28,9 @@ def tearDown(self): def make_instance(self, include_optional) -> InformationPiece: """Test InformationPiece - include_optional is a boolean, when False only required - params are included, when True both required and - optional params are included """ + include_optional is a boolean, when False only required + params are included, when True both required and + optional params are included""" # uncomment below to create an instance of `InformationPiece` """ model = InformationPiece() @@ -56,5 +57,6 @@ def testInformationPiece(self): # inst_req_only = self.make_instance(include_optional=False) # inst_req_and_optional = self.make_instance(include_optional=True) -if __name__ == '__main__': + +if __name__ == "__main__": unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_key_value_pair.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_key_value_pair.py index 695ebb9..0ddc864 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_key_value_pair.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_key_value_pair.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - extractor-api-lib +extractor-api-lib - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. 
+Do not edit the class manually. """ # noqa: E501 @@ -16,6 +16,7 @@ from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair + class TestKeyValuePair(unittest.TestCase): """KeyValuePair unit test stubs""" @@ -27,9 +28,9 @@ def tearDown(self): def make_instance(self, include_optional) -> KeyValuePair: """Test KeyValuePair - include_optional is a boolean, when False only required - params are included, when True both required and - optional params are included """ + include_optional is a boolean, when False only required + params are included, when True both required and + optional params are included""" # uncomment below to create an instance of `KeyValuePair` """ model = KeyValuePair() @@ -48,5 +49,6 @@ def testKeyValuePair(self): # inst_req_only = self.make_instance(include_optional=False) # inst_req_and_optional = self.make_instance(include_optional=True) -if __name__ == '__main__': + +if __name__ == "__main__": unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/models/document_status.py b/admin-api-lib/src/admin_api_lib/models/document_status.py index d00dfce..fedce07 100644 --- a/admin-api-lib/src/admin_api_lib/models/document_status.py +++ b/admin-api-lib/src/admin_api_lib/models/document_status.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - admin-api-lib +admin-api-lib - The API is used for the communication between the admin frontend and the admin backend in the rag project. +The API is used for the communication between the admin frontend and the admin backend in the rag project. - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. 
""" # noqa: E501 @@ -18,20 +18,19 @@ import json - - from pydantic import BaseModel, ConfigDict, StrictStr from typing import Any, ClassVar, Dict, List from admin_api_lib.models.status import Status + try: from typing import Self except ImportError: from typing_extensions import Self + class DocumentStatus(BaseModel): - """ - - """ # noqa: E501 + """ """ # noqa: E501 + name: StrictStr status: Status __properties: ClassVar[List[str]] = ["name", "status"] @@ -42,7 +41,6 @@ class DocumentStatus(BaseModel): "protected_namespaces": (), } - def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) @@ -69,8 +67,7 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={ - }, + exclude={}, exclude_none=True, ) return _dict @@ -84,10 +81,5 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({ - "name": obj.get("name"), - "status": obj.get("status") - }) + _obj = cls.model_validate({"name": obj.get("name"), "status": obj.get("status")}) return _obj - - diff --git a/admin-api-lib/src/admin_api_lib/models/extra_models.py b/admin-api-lib/src/admin_api_lib/models/extra_models.py index a3a283f..f0588d2 100644 --- a/admin-api-lib/src/admin_api_lib/models/extra_models.py +++ b/admin-api-lib/src/admin_api_lib/models/extra_models.py @@ -2,6 +2,7 @@ from pydantic import BaseModel + class TokenModel(BaseModel): """Defines a token model.""" diff --git a/admin-api-lib/src/admin_api_lib/models/key_value_pair.py b/admin-api-lib/src/admin_api_lib/models/key_value_pair.py index 8419cfa..2d2fe5e 100644 --- a/admin-api-lib/src/admin_api_lib/models/key_value_pair.py +++ b/admin-api-lib/src/admin_api_lib/models/key_value_pair.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - admin-api-lib +admin-api-lib - The API is used for the communication between the admin frontend and the admin backend in the rag 
project. +The API is used for the communication between the admin frontend and the admin backend in the rag project. - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. """ # noqa: E501 @@ -18,19 +18,18 @@ import json - - from pydantic import BaseModel, ConfigDict from typing import Any, ClassVar, Dict, List, Optional + try: from typing import Self except ImportError: from typing_extensions import Self + class KeyValuePair(BaseModel): - """ - - """ # noqa: E501 + """ """ # noqa: E501 + key: Optional[Any] = None value: Optional[Any] = None __properties: ClassVar[List[str]] = ["key", "value"] @@ -41,7 +40,6 @@ class KeyValuePair(BaseModel): "protected_namespaces": (), } - def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) @@ -68,19 +66,18 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={ - }, + exclude={}, exclude_none=True, ) # set to None if key (nullable) is None # and model_fields_set contains the field if self.key is None and "key" in self.model_fields_set: - _dict['key'] = None + _dict["key"] = None # set to None if value (nullable) is None # and model_fields_set contains the field if self.value is None and "value" in self.model_fields_set: - _dict['value'] = None + _dict["value"] = None return _dict @@ -93,10 +90,5 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({ - "key": obj.get("key"), - "value": obj.get("value") - }) + _obj = cls.model_validate({"key": obj.get("key"), "value": obj.get("value")}) return _obj - - diff --git a/admin-api-lib/src/admin_api_lib/models/status.py 
b/admin-api-lib/src/admin_api_lib/models/status.py index 2e0de2c..5c7836f 100644 --- a/admin-api-lib/src/admin_api_lib/models/status.py +++ b/admin-api-lib/src/admin_api_lib/models/status.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - admin-api-lib +admin-api-lib - The API is used for the communication between the admin frontend and the admin backend in the rag project. +The API is used for the communication between the admin frontend and the admin backend in the rag project. - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. """ # noqa: E501 @@ -19,7 +19,6 @@ from enum import Enum - try: from typing import Self except ImportError: @@ -27,21 +26,17 @@ class Status(str, Enum): - """ - - """ + """ """ """ allowed enum values """ - UPLOADING = 'UPLOADING' - PROCESSING = 'PROCESSING' - READY = 'READY' - ERROR = 'ERROR' + UPLOADING = "UPLOADING" + PROCESSING = "PROCESSING" + READY = "READY" + ERROR = "ERROR" @classmethod def from_json(cls, json_str: str) -> Self: """Create an instance of Status from a JSON string""" return cls(json.loads(json_str)) - - diff --git a/admin-api-lib/src/admin_api_lib/models/upload_source.py b/admin-api-lib/src/admin_api_lib/models/upload_source.py index f76b987..e90690f 100644 --- a/admin-api-lib/src/admin_api_lib/models/upload_source.py +++ b/admin-api-lib/src/admin_api_lib/models/upload_source.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - admin-api-lib +admin-api-lib - The API is used for the communication between the admin frontend and the admin backend in the rag project. +The API is used for the communication between the admin frontend and the admin backend in the rag project. 
- The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. """ # noqa: E501 @@ -18,20 +18,19 @@ import json - - from pydantic import BaseModel, ConfigDict, StrictBytes, StrictStr from typing import Any, ClassVar, Dict, List, Optional, Tuple, Union from admin_api_lib.models.key_value_pair import KeyValuePair + try: from typing import Self except ImportError: from typing_extensions import Self + class UploadSource(BaseModel): - """ - - """ # noqa: E501 + """ """ # noqa: E501 + file: Optional[Union[StrictBytes, StrictStr, Tuple[StrictStr, StrictBytes]]] = None type: StrictStr kwargs: Optional[List[KeyValuePair]] = None @@ -43,7 +42,6 @@ class UploadSource(BaseModel): "protected_namespaces": (), } - def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) @@ -70,8 +68,7 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={ - }, + exclude={}, exclude_none=True, ) # override the default output from pydantic by calling `to_dict()` of each item in kwargs (list) @@ -80,7 +77,7 @@ def to_dict(self) -> Dict[str, Any]: for _item in self.kwargs: if _item: _items.append(_item.to_dict()) - _dict['kwargs'] = _items + _dict["kwargs"] = _items return _dict @classmethod @@ -92,11 +89,15 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({ - "file": obj.get("file"), - "type": obj.get("type"), - "kwargs": [KeyValuePair.from_dict(_item) for _item in obj.get("kwargs")] if obj.get("kwargs") is not None else None - }) + _obj = cls.model_validate( + { + "file": obj.get("file"), + "type": obj.get("type"), + "kwargs": ( + [KeyValuePair.from_dict(_item) for _item 
in obj.get("kwargs")] + if obj.get("kwargs") is not None + else None + ), + } + ) return _obj - - diff --git a/extractor-api-lib/openapi.yaml b/extractor-api-lib/openapi.yaml index d949eb7..262f11b 100644 --- a/extractor-api-lib/openapi.yaml +++ b/extractor-api-lib/openapi.yaml @@ -7,10 +7,10 @@ servers: paths: /extract: post: - operationId: extract_from_file_post + operationId: extract requestBody: content: - application/json: + multipart/form-data: schema: $ref: '#/components/schemas/extraction_request' required: true @@ -24,7 +24,7 @@ paths: type: array description: List of extracted information. "422": - description: Body is not a valid PDF. + description: Body is not a valid source. "500": description: Something somewhere went terribly wrong. tags: @@ -87,19 +87,19 @@ components: file: description: "" format: binary - title: file type: string type: description: "" - title: type type: string kwargs: description: "" items: $ref: '#/components/schemas/key_value_pair' - title: kwargs type: array + name: + description: "" + type: string required: + - name - type - title: extraction_request type: object diff --git a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py index 6246635..eee5ada 100644 --- a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py +++ b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py @@ -23,9 +23,11 @@ ) from extractor_api_lib.models.extra_models import TokenModel # noqa: F401 -from typing import Any, List -from extractor_api_lib.models.extraction_request import ExtractionRequest +from pydantic import StrictBytes, StrictStr +from fastapi import Request, Response, UploadFile +from typing import Any, List, Optional, Tuple, Union from extractor_api_lib.models.information_piece import InformationPiece +from extractor_api_lib.models.key_value_pair import KeyValuePair router = APIRouter() @@ -39,15 +41,18 @@ "/extract", responses={ 200: {"model": 
List[InformationPiece], "description": "List of extracted information."}, - 422: {"description": "Body is not a valid PDF."}, + 422: {"description": "Body is not a valid source."}, 500: {"description": "Something somewhere went terribly wrong."}, }, tags=["extractor"], response_model_by_alias=True, ) -async def extract_from_file_post( - extraction_request: ExtractionRequest = Body(None, description=""), +async def extract( + type: StrictStr = Form(None, description=""), + name: StrictStr = Form(None, description=""), + file: Optional[UploadFile] = Form(None, description=""), + kwargs: Optional[List[KeyValuePair]] = Form(None, description=""), ) -> List[InformationPiece]: if not BaseExtractorApi.subclasses: raise HTTPException(status_code=500, detail="Not implemented") - return await BaseExtractorApi.subclasses[0]().extract_from_file_post(extraction_request) + return await BaseExtractorApi.subclasses[0]().extract(type, name, file, kwargs) diff --git a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py index a0b1fb5..f7a7cf0 100644 --- a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py +++ b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py @@ -2,9 +2,11 @@ from typing import ClassVar, Dict, List, Tuple # noqa: F401 -from typing import Any, List -from extractor_api_lib.models.extraction_request import ExtractionRequest +from pydantic import StrictBytes, StrictStr +from typing import Any, List, Optional, Tuple, Union +from fastapi import Request, Response, UploadFile from extractor_api_lib.models.information_piece import InformationPiece +from extractor_api_lib.models.key_value_pair import KeyValuePair class BaseExtractorApi: @@ -13,8 +15,11 @@ class BaseExtractorApi: def __init_subclass__(cls, **kwargs): super().__init_subclass__(**kwargs) BaseExtractorApi.subclasses = BaseExtractorApi.subclasses + (cls,) - async def extract_from_file_post( + + 
async def extract( self, - extraction_request: ExtractionRequest, - ) -> List[InformationPiece]: - ... + type: StrictStr, + name: StrictStr, + file: Optional[UploadFile], + kwargs: Optional[List[KeyValuePair]], + ) -> List[InformationPiece]: ... diff --git a/extractor-api-lib/src/extractor_api_lib/models/content_type.py b/extractor-api-lib/src/extractor_api_lib/models/content_type.py index 195f424..ff7be41 100644 --- a/extractor-api-lib/src/extractor_api_lib/models/content_type.py +++ b/extractor-api-lib/src/extractor_api_lib/models/content_type.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - extractor-api-lib +extractor-api-lib - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. 
""" # noqa: E501 @@ -19,7 +19,6 @@ from enum import Enum - try: from typing import Self except ImportError: @@ -27,20 +26,16 @@ class ContentType(str, Enum): - """ - - """ + """ """ """ allowed enum values """ - IMAGE = 'IMAGE' - TABLE = 'TABLE' - TEXT = 'TEXT' + IMAGE = "IMAGE" + TABLE = "TABLE" + TEXT = "TEXT" @classmethod def from_json(cls, json_str: str) -> Self: """Create an instance of ContentType from a JSON string""" return cls(json.loads(json_str)) - - diff --git a/extractor-api-lib/src/extractor_api_lib/models/extra_models.py b/extractor-api-lib/src/extractor_api_lib/models/extra_models.py index a3a283f..f0588d2 100644 --- a/extractor-api-lib/src/extractor_api_lib/models/extra_models.py +++ b/extractor-api-lib/src/extractor_api_lib/models/extra_models.py @@ -2,6 +2,7 @@ from pydantic import BaseModel + class TokenModel(BaseModel): """Defines a token model.""" diff --git a/extractor-api-lib/src/extractor_api_lib/models/extraction_request.py b/extractor-api-lib/src/extractor_api_lib/models/extraction_request.py index 437442f..8917378 100644 --- a/extractor-api-lib/src/extractor_api_lib/models/extraction_request.py +++ b/extractor-api-lib/src/extractor_api_lib/models/extraction_request.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - extractor-api-lib +extractor-api-lib - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. 
""" # noqa: E501 @@ -18,20 +18,19 @@ import json - - from pydantic import BaseModel, ConfigDict, StrictBytes, StrictStr from typing import Any, ClassVar, Dict, List, Optional, Tuple, Union from extractor_api_lib.models.key_value_pair import KeyValuePair + try: from typing import Self except ImportError: from typing_extensions import Self + class ExtractionRequest(BaseModel): - """ - - """ # noqa: E501 + """ """ # noqa: E501 + file: Optional[Union[StrictBytes, StrictStr, Tuple[StrictStr, StrictBytes]]] = None type: StrictStr kwargs: Optional[List[KeyValuePair]] = None @@ -43,7 +42,6 @@ class ExtractionRequest(BaseModel): "protected_namespaces": (), } - def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) @@ -70,8 +68,7 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={ - }, + exclude={}, exclude_none=True, ) # override the default output from pydantic by calling `to_dict()` of each item in kwargs (list) @@ -80,7 +77,7 @@ def to_dict(self) -> Dict[str, Any]: for _item in self.kwargs: if _item: _items.append(_item.to_dict()) - _dict['kwargs'] = _items + _dict["kwargs"] = _items return _dict @classmethod @@ -92,11 +89,15 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({ - "file": obj.get("file"), - "type": obj.get("type"), - "kwargs": [KeyValuePair.from_dict(_item) for _item in obj.get("kwargs")] if obj.get("kwargs") is not None else None - }) + _obj = cls.model_validate( + { + "file": obj.get("file"), + "type": obj.get("type"), + "kwargs": ( + [KeyValuePair.from_dict(_item) for _item in obj.get("kwargs")] + if obj.get("kwargs") is not None + else None + ), + } + ) return _obj - - diff --git a/extractor-api-lib/src/extractor_api_lib/models/information_piece.py b/extractor-api-lib/src/extractor_api_lib/models/information_piece.py index 98261ff..8890a13 
100644 --- a/extractor-api-lib/src/extractor_api_lib/models/information_piece.py +++ b/extractor-api-lib/src/extractor_api_lib/models/information_piece.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - extractor-api-lib +extractor-api-lib - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. """ # noqa: E501 @@ -18,21 +18,22 @@ import json - - from pydantic import BaseModel, ConfigDict, StrictStr from typing import Any, ClassVar, Dict, List from extractor_api_lib.models.content_type import ContentType from extractor_api_lib.models.key_value_pair import KeyValuePair + try: from typing import Self except ImportError: from typing_extensions import Self + class InformationPiece(BaseModel): """ A piece of information that has been extracted. 
- """ # noqa: E501 + """ # noqa: E501 + metadata: List[KeyValuePair] page_content: StrictStr type: ContentType @@ -44,7 +45,6 @@ class InformationPiece(BaseModel): "protected_namespaces": (), } - def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) @@ -71,8 +71,7 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={ - }, + exclude={}, exclude_none=True, ) # override the default output from pydantic by calling `to_dict()` of each item in metadata (list) @@ -81,7 +80,7 @@ def to_dict(self) -> Dict[str, Any]: for _item in self.metadata: if _item: _items.append(_item.to_dict()) - _dict['metadata'] = _items + _dict["metadata"] = _items return _dict @classmethod @@ -93,11 +92,15 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({ - "metadata": [KeyValuePair.from_dict(_item) for _item in obj.get("metadata")] if obj.get("metadata") is not None else None, - "page_content": obj.get("page_content"), - "type": obj.get("type") - }) + _obj = cls.model_validate( + { + "metadata": ( + [KeyValuePair.from_dict(_item) for _item in obj.get("metadata")] + if obj.get("metadata") is not None + else None + ), + "page_content": obj.get("page_content"), + "type": obj.get("type"), + } + ) return _obj - - diff --git a/extractor-api-lib/src/extractor_api_lib/models/key_value_pair.py b/extractor-api-lib/src/extractor_api_lib/models/key_value_pair.py index 0cf865e..f751313 100644 --- a/extractor-api-lib/src/extractor_api_lib/models/key_value_pair.py +++ b/extractor-api-lib/src/extractor_api_lib/models/key_value_pair.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - extractor-api-lib +extractor-api-lib - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) +No description provided (generated by Openapi Generator 
https://github.com/openapitools/openapi-generator) - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. """ # noqa: E501 @@ -18,19 +18,18 @@ import json - - from pydantic import BaseModel, ConfigDict from typing import Any, ClassVar, Dict, List, Optional + try: from typing import Self except ImportError: from typing_extensions import Self + class KeyValuePair(BaseModel): - """ - - """ # noqa: E501 + """ """ # noqa: E501 + key: Optional[Any] = None value: Optional[Any] = None __properties: ClassVar[List[str]] = ["key", "value"] @@ -41,7 +40,6 @@ class KeyValuePair(BaseModel): "protected_namespaces": (), } - def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) @@ -68,19 +66,18 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={ - }, + exclude={}, exclude_none=True, ) # set to None if key (nullable) is None # and model_fields_set contains the field if self.key is None and "key" in self.model_fields_set: - _dict['key'] = None + _dict["key"] = None # set to None if value (nullable) is None # and model_fields_set contains the field if self.value is None and "value" in self.model_fields_set: - _dict['value'] = None + _dict["value"] = None return _dict @@ -93,10 +90,5 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({ - "key": obj.get("key"), - "value": obj.get("value") - }) + _obj = cls.model_validate({"key": obj.get("key"), "value": obj.get("value")}) return _obj - - diff --git a/rag-core-api/src/rag_core_api/apis/rag_api.py b/rag-core-api/src/rag_core_api/apis/rag_api.py index 425f48c..64597dd 100644 --- 
a/rag-core-api/src/rag_core_api/apis/rag_api.py +++ b/rag-core-api/src/rag_core_api/apis/rag_api.py @@ -12,7 +12,7 @@ from fastapi import ( # noqa: F401 APIRouter, - BackgroundTasks, + BackgroundTasks, Body, Cookie, Depends, @@ -21,7 +21,7 @@ HTTPException, Path, Query, - Request, + Request, Response, Security, status, @@ -57,6 +57,7 @@ async def _disconnected(request: Request) -> None: except CancelledError: break + @router.post( "/chat/{session_id}", responses={ @@ -69,7 +70,9 @@ async def _disconnected(request: Request) -> None: async def chat( request: Request, session_id: StrictStr = Path(..., description=""), - chat_request: Annotated[ChatRequest, Field(description="Chat with RAG.")] = Body(None, description="Chat with RAG."), + chat_request: Annotated[ChatRequest, Field(description="Chat with RAG.")] = Body( + None, description="Chat with RAG." + ), ) -> ChatResponse | None: """ Asynchronously handles the chat endpoint for the RAG API. @@ -125,8 +128,7 @@ async def chat( tags=["rag"], response_model_by_alias=True, ) -async def evaluate( -) -> None: +async def evaluate() -> None: """ Asynchronously evaluate the RAG. 
@@ -167,7 +169,7 @@ async def remove_information_piece( Returns ------- None - """ + """ if not BaseRagApi.subclasses: raise HTTPException(status_code=500, detail="Not implemented") return await BaseRagApi.subclasses[0]().remove_information_piece(delete_request) @@ -200,7 +202,7 @@ async def upload_information_piece( Returns ------- None - """ + """ if not BaseRagApi.subclasses: raise HTTPException(status_code=500, detail="Not implemented") return await BaseRagApi.subclasses[0]().upload_information_piece(information_piece) diff --git a/rag-core-api/src/rag_core_api/apis/rag_api_base.py b/rag-core-api/src/rag_core_api/apis/rag_api_base.py index 70d1406..0b53f4b 100644 --- a/rag-core-api/src/rag_core_api/apis/rag_api_base.py +++ b/rag-core-api/src/rag_core_api/apis/rag_api_base.py @@ -24,11 +24,13 @@ class BaseRagApi: subclasses : ClassVar[Tuple] A tuple that holds all subclasses of BaseRagApi. """ + subclasses: ClassVar[Tuple] = () def __init_subclass__(cls, **kwargs): super().__init_subclass__(**kwargs) BaseRagApi.subclasses = BaseRagApi.subclasses + (cls,) + async def chat( self, session_id: StrictStr, @@ -52,7 +54,6 @@ async def chat( The chat response if the chat task completes successfully, otherwise None. """ - async def evaluate( self, ) -> None: @@ -64,7 +65,6 @@ async def evaluate( None """ - async def remove_information_piece( self, delete_request: DeleteRequest, @@ -84,7 +84,6 @@ async def remove_information_piece( None """ - async def upload_information_piece( self, information_piece: List[InformationPiece], diff --git a/rag-core-api/src/rag_core_api/models/chat_history.py b/rag-core-api/src/rag_core_api/models/chat_history.py index 71e2e8c..9087afe 100644 --- a/rag-core-api/src/rag_core_api/models/chat_history.py +++ b/rag-core-api/src/rag_core_api/models/chat_history.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - RAG SIT x Stackit +RAG SIT x Stackit - The perfect rag solution. +The perfect rag solution. 
- The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. """ # noqa: E501 @@ -18,20 +18,19 @@ import json - - from pydantic import BaseModel, ConfigDict from typing import Any, ClassVar, Dict, List from rag_core_api.models.chat_history_message import ChatHistoryMessage + try: from typing import Self except ImportError: from typing_extensions import Self + class ChatHistory(BaseModel): - """ - - """ # noqa: E501 + """ """ # noqa: E501 + messages: List[ChatHistoryMessage] __properties: ClassVar[List[str]] = ["messages"] @@ -41,7 +40,6 @@ class ChatHistory(BaseModel): "protected_namespaces": (), } - def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) @@ -68,8 +66,7 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={ - }, + exclude={}, exclude_none=True, ) # override the default output from pydantic by calling `to_dict()` of each item in messages (list) @@ -78,7 +75,7 @@ def to_dict(self) -> Dict[str, Any]: for _item in self.messages: if _item: _items.append(_item.to_dict()) - _dict['messages'] = _items + _dict["messages"] = _items return _dict @classmethod @@ -90,9 +87,13 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({ - "messages": [ChatHistoryMessage.from_dict(_item) for _item in obj.get("messages")] if obj.get("messages") is not None else None - }) + _obj = cls.model_validate( + { + "messages": ( + [ChatHistoryMessage.from_dict(_item) for _item in obj.get("messages")] + if obj.get("messages") is not None + else None + ) + } + ) return _obj - - diff --git a/rag-core-api/src/rag_core_api/models/chat_history_message.py 
b/rag-core-api/src/rag_core_api/models/chat_history_message.py index 59da140..c9d782b 100644 --- a/rag-core-api/src/rag_core_api/models/chat_history_message.py +++ b/rag-core-api/src/rag_core_api/models/chat_history_message.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - RAG SIT x Stackit +RAG SIT x Stackit - The perfect rag solution. +The perfect rag solution. - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. """ # noqa: E501 @@ -18,20 +18,19 @@ import json - - from pydantic import BaseModel, ConfigDict, StrictStr from typing import Any, ClassVar, Dict, List from rag_core_api.models.chat_role import ChatRole + try: from typing import Self except ImportError: from typing_extensions import Self + class ChatHistoryMessage(BaseModel): - """ - - """ # noqa: E501 + """ """ # noqa: E501 + role: ChatRole message: StrictStr __properties: ClassVar[List[str]] = ["role", "message"] @@ -42,7 +41,6 @@ class ChatHistoryMessage(BaseModel): "protected_namespaces": (), } - def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) @@ -69,8 +67,7 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={ - }, + exclude={}, exclude_none=True, ) return _dict @@ -84,10 +81,5 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({ - "role": obj.get("role"), - "message": obj.get("message") - }) + _obj = cls.model_validate({"role": obj.get("role"), "message": obj.get("message")}) return _obj - - diff --git a/rag-core-api/src/rag_core_api/models/chat_request.py b/rag-core-api/src/rag_core_api/models/chat_request.py index 9e28631..66090ef 100644 --- 
a/rag-core-api/src/rag_core_api/models/chat_request.py +++ b/rag-core-api/src/rag_core_api/models/chat_request.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - RAG SIT x Stackit +RAG SIT x Stackit - The perfect rag solution. +The perfect rag solution. - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. """ # noqa: E501 @@ -18,20 +18,19 @@ import json - - from pydantic import BaseModel, ConfigDict, StrictStr from typing import Any, ClassVar, Dict, List, Optional from rag_core_api.models.chat_history import ChatHistory + try: from typing import Self except ImportError: from typing_extensions import Self + class ChatRequest(BaseModel): - """ - - """ # noqa: E501 + """ """ # noqa: E501 + history: Optional[ChatHistory] = None message: StrictStr __properties: ClassVar[List[str]] = ["history", "message"] @@ -42,7 +41,6 @@ class ChatRequest(BaseModel): "protected_namespaces": (), } - def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) @@ -69,13 +67,12 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={ - }, + exclude={}, exclude_none=True, ) # override the default output from pydantic by calling `to_dict()` of history if self.history: - _dict['history'] = self.history.to_dict() + _dict["history"] = self.history.to_dict() return _dict @classmethod @@ -87,10 +84,10 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({ - "history": ChatHistory.from_dict(obj.get("history")) if obj.get("history") is not None else None, - "message": obj.get("message") - }) + _obj = cls.model_validate( + { + "history": ChatHistory.from_dict(obj.get("history")) 
if obj.get("history") is not None else None, + "message": obj.get("message"), + } + ) return _obj - - diff --git a/rag-core-api/src/rag_core_api/models/chat_response.py b/rag-core-api/src/rag_core_api/models/chat_response.py index 6a8daad..ba8c6b1 100644 --- a/rag-core-api/src/rag_core_api/models/chat_response.py +++ b/rag-core-api/src/rag_core_api/models/chat_response.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - RAG SIT x Stackit +RAG SIT x Stackit - The perfect rag solution. +The perfect rag solution. - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. """ # noqa: E501 @@ -18,20 +18,19 @@ import json - - from pydantic import BaseModel, ConfigDict, Field, StrictStr from typing import Any, ClassVar, Dict, List from rag_core_api.models.information_piece import InformationPiece + try: from typing import Self except ImportError: from typing_extensions import Self + class ChatResponse(BaseModel): - """ - - """ # noqa: E501 + """ """ # noqa: E501 + answer: StrictStr finish_reason: StrictStr = Field(description=" ") citations: List[InformationPiece] @@ -43,7 +42,6 @@ class ChatResponse(BaseModel): "protected_namespaces": (), } - def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) @@ -70,8 +68,7 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={ - }, + exclude={}, exclude_none=True, ) # override the default output from pydantic by calling `to_dict()` of each item in citations (list) @@ -80,7 +77,7 @@ def to_dict(self) -> Dict[str, Any]: for _item in self.citations: if _item: _items.append(_item.to_dict()) - _dict['citations'] = _items + _dict["citations"] = _items return _dict @classmethod @@ -92,11 +89,15 @@ 
def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({ - "answer": obj.get("answer"), - "finish_reason": obj.get("finish_reason"), - "citations": [InformationPiece.from_dict(_item) for _item in obj.get("citations")] if obj.get("citations") is not None else None - }) + _obj = cls.model_validate( + { + "answer": obj.get("answer"), + "finish_reason": obj.get("finish_reason"), + "citations": ( + [InformationPiece.from_dict(_item) for _item in obj.get("citations")] + if obj.get("citations") is not None + else None + ), + } + ) return _obj - - diff --git a/rag-core-api/src/rag_core_api/models/chat_role.py b/rag-core-api/src/rag_core_api/models/chat_role.py index d0bef70..7e1c88d 100644 --- a/rag-core-api/src/rag_core_api/models/chat_role.py +++ b/rag-core-api/src/rag_core_api/models/chat_role.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - RAG SIT x Stackit +RAG SIT x Stackit - The perfect rag solution. +The perfect rag solution. - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. 
""" # noqa: E501 @@ -19,7 +19,6 @@ from enum import Enum - try: from typing import Self except ImportError: @@ -27,19 +26,15 @@ class ChatRole(str, Enum): - """ - - """ + """ """ """ allowed enum values """ - USER = 'user' - ASSISTANT = 'assistant' + USER = "user" + ASSISTANT = "assistant" @classmethod def from_json(cls, json_str: str) -> Self: """Create an instance of ChatRole from a JSON string""" return cls(json.loads(json_str)) - - diff --git a/rag-core-api/src/rag_core_api/models/content_type.py b/rag-core-api/src/rag_core_api/models/content_type.py index df72d7d..7f4d874 100644 --- a/rag-core-api/src/rag_core_api/models/content_type.py +++ b/rag-core-api/src/rag_core_api/models/content_type.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - RAG SIT x Stackit +RAG SIT x Stackit - The perfect rag solution. +The perfect rag solution. - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. 
""" # noqa: E501 @@ -19,7 +19,6 @@ from enum import Enum - try: from typing import Self except ImportError: @@ -27,21 +26,17 @@ class ContentType(str, Enum): - """ - - """ + """ """ """ allowed enum values """ - TEXT = 'TEXT' - IMAGE = 'IMAGE' - TABLE = 'TABLE' - SUMMARY = 'SUMMARY' + TEXT = "TEXT" + IMAGE = "IMAGE" + TABLE = "TABLE" + SUMMARY = "SUMMARY" @classmethod def from_json(cls, json_str: str) -> Self: """Create an instance of ContentType from a JSON string""" return cls(json.loads(json_str)) - - diff --git a/rag-core-api/src/rag_core_api/models/delete_request.py b/rag-core-api/src/rag_core_api/models/delete_request.py index 2c3592c..8b40339 100644 --- a/rag-core-api/src/rag_core_api/models/delete_request.py +++ b/rag-core-api/src/rag_core_api/models/delete_request.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - RAG SIT x Stackit +RAG SIT x Stackit - The perfect rag solution. +The perfect rag solution. - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. 
""" # noqa: E501 @@ -18,20 +18,19 @@ import json - - from pydantic import BaseModel, ConfigDict from typing import Any, ClassVar, Dict, List, Optional from rag_core_api.models.key_value_pair import KeyValuePair + try: from typing import Self except ImportError: from typing_extensions import Self + class DeleteRequest(BaseModel): - """ - - """ # noqa: E501 + """ """ # noqa: E501 + metadata: Optional[List[KeyValuePair]] = None __properties: ClassVar[List[str]] = ["metadata"] @@ -41,7 +40,6 @@ class DeleteRequest(BaseModel): "protected_namespaces": (), } - def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) @@ -68,8 +66,7 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={ - }, + exclude={}, exclude_none=True, ) # override the default output from pydantic by calling `to_dict()` of each item in metadata (list) @@ -78,7 +75,7 @@ def to_dict(self) -> Dict[str, Any]: for _item in self.metadata: if _item: _items.append(_item.to_dict()) - _dict['metadata'] = _items + _dict["metadata"] = _items return _dict @classmethod @@ -90,9 +87,13 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({ - "metadata": [KeyValuePair.from_dict(_item) for _item in obj.get("metadata")] if obj.get("metadata") is not None else None - }) + _obj = cls.model_validate( + { + "metadata": ( + [KeyValuePair.from_dict(_item) for _item in obj.get("metadata")] + if obj.get("metadata") is not None + else None + ) + } + ) return _obj - - diff --git a/rag-core-api/src/rag_core_api/models/extra_models.py b/rag-core-api/src/rag_core_api/models/extra_models.py index a3a283f..f0588d2 100644 --- a/rag-core-api/src/rag_core_api/models/extra_models.py +++ b/rag-core-api/src/rag_core_api/models/extra_models.py @@ -2,6 +2,7 @@ from pydantic import BaseModel + class TokenModel(BaseModel): """Defines a token 
model.""" diff --git a/rag-core-api/src/rag_core_api/models/information_piece.py b/rag-core-api/src/rag_core_api/models/information_piece.py index 28d5115..dfe8a42 100644 --- a/rag-core-api/src/rag_core_api/models/information_piece.py +++ b/rag-core-api/src/rag_core_api/models/information_piece.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - RAG SIT x Stackit +RAG SIT x Stackit - The perfect rag solution. +The perfect rag solution. - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. """ # noqa: E501 @@ -18,22 +18,25 @@ import json - - from pydantic import BaseModel, ConfigDict, Field, StrictStr from typing import Any, ClassVar, Dict, List from rag_core_api.models.content_type import ContentType from rag_core_api.models.key_value_pair import KeyValuePair + try: from typing import Self except ImportError: from typing_extensions import Self + class InformationPiece(BaseModel): """ Uploading a json with chunks and metadata. - """ # noqa: E501 - metadata: List[KeyValuePair] = Field(description="The metadata of the documents that are stored in the vectordatabase.") + """ # noqa: E501 + + metadata: List[KeyValuePair] = Field( + description="The metadata of the documents that are stored in the vectordatabase." 
+ ) page_content: StrictStr = Field(description="The content of the document") type: ContentType __properties: ClassVar[List[str]] = ["metadata", "page_content", "type"] @@ -44,7 +47,6 @@ class InformationPiece(BaseModel): "protected_namespaces": (), } - def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) @@ -71,8 +73,7 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={ - }, + exclude={}, exclude_none=True, ) # override the default output from pydantic by calling `to_dict()` of each item in metadata (list) @@ -81,7 +82,7 @@ def to_dict(self) -> Dict[str, Any]: for _item in self.metadata: if _item: _items.append(_item.to_dict()) - _dict['metadata'] = _items + _dict["metadata"] = _items return _dict @classmethod @@ -93,11 +94,15 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({ - "metadata": [KeyValuePair.from_dict(_item) for _item in obj.get("metadata")] if obj.get("metadata") is not None else None, - "page_content": obj.get("page_content"), - "type": obj.get("type") - }) + _obj = cls.model_validate( + { + "metadata": ( + [KeyValuePair.from_dict(_item) for _item in obj.get("metadata")] + if obj.get("metadata") is not None + else None + ), + "page_content": obj.get("page_content"), + "type": obj.get("type"), + } + ) return _obj - - diff --git a/rag-core-api/src/rag_core_api/models/key_value_pair.py b/rag-core-api/src/rag_core_api/models/key_value_pair.py index b9654c3..3079959 100644 --- a/rag-core-api/src/rag_core_api/models/key_value_pair.py +++ b/rag-core-api/src/rag_core_api/models/key_value_pair.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - RAG SIT x Stackit +RAG SIT x Stackit - The perfect rag solution. +The perfect rag solution. 
- The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. """ # noqa: E501 @@ -18,19 +18,20 @@ import json - - from pydantic import BaseModel, ConfigDict, Field, StrictStr from typing import Any, ClassVar, Dict, List + try: from typing import Self except ImportError: from typing_extensions import Self + class KeyValuePair(BaseModel): """ The key value pair. - """ # noqa: E501 + """ # noqa: E501 + key: StrictStr value: StrictStr = Field(description=" ") __properties: ClassVar[List[str]] = ["key", "value"] @@ -41,7 +42,6 @@ class KeyValuePair(BaseModel): "protected_namespaces": (), } - def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) @@ -68,8 +68,7 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={ - }, + exclude={}, exclude_none=True, ) return _dict @@ -83,10 +82,5 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({ - "key": obj.get("key"), - "value": obj.get("value") - }) + _obj = cls.model_validate({"key": obj.get("key"), "value": obj.get("value")}) return _obj - - From 1a7b9d700d72f8f92c69f3ecac95af0a442e9ce7 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Fri, 9 May 2025 15:40:25 +0200 Subject: [PATCH 03/43] switch to one uploader for all types --- .../api_endpoints/source_uploader.py | 14 +- .../src/admin_api_lib/apis/admin_api.py | 3 +- .../src/admin_api_lib/apis/admin_api_base.py | 1 - .../src/admin_api_lib/dependency_container.py | 22 +-- .../src/admin_api_lib/impl/admin_api.py | 56 +++----- .../api_endpoints/default_source_uploader.py | 125 ++++++++++++++++++ 6 files changed, 158 insertions(+), 63 deletions(-) 
create mode 100644 admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py b/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py index 2cfbf2f..0c9b73e 100644 --- a/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py @@ -1,8 +1,10 @@ -from dataclasses import Field -from typing_extensions import Annotated from abc import ABC, abstractmethod +from typing import Optional -from admin_api_lib.models.upload_source import UploadSource +from pydantic import StrictStr +from fastapi import UploadFile + +from admin_api_lib.models.key_value_pair import KeyValuePair class SourceUploader(ABC): @@ -10,5 +12,9 @@ class SourceUploader(ABC): @abstractmethod async def upload_source( self, - upload_source: Annotated[UploadSource, Field(description="The source to upload.")], + base_url: str, + type: StrictStr, + name: StrictStr, + file: Optional[UploadFile], + kwargs: Optional[list[KeyValuePair]], ) -> None: ... diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api.py b/admin-api-lib/src/admin_api_lib/apis/admin_api.py index 81d55f5..ccaed84 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api.py @@ -99,6 +99,7 @@ async def document_reference_id_get( raise HTTPException(status_code=500, detail="Not implemented") return await BaseAdminApi.subclasses[0]().document_reference_id_get(identification) + @router.get( "/all_documents_status", responses={ @@ -116,7 +117,7 @@ async def get_all_documents_status() -> List[DocumentStatus]: ------- list[DocumentStatus] A list containing the status of all documents. 
- """ + """ if not BaseAdminApi.subclasses: raise HTTPException(status_code=500, detail="Not implemented") return await BaseAdminApi.subclasses[0]().get_all_documents_status() diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py index 34bce77..48e22dc 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py @@ -52,7 +52,6 @@ async def document_reference_id_get( The response object containing the document reference details. """ - async def get_all_documents_status( self, ) -> list[DocumentStatus]: diff --git a/admin-api-lib/src/admin_api_lib/dependency_container.py b/admin-api-lib/src/admin_api_lib/dependency_container.py index 9079a47..4ca3b57 100644 --- a/admin-api-lib/src/admin_api_lib/dependency_container.py +++ b/admin-api-lib/src/admin_api_lib/dependency_container.py @@ -1,5 +1,6 @@ """Module for the DependencyContainer class.""" +from admin_api_lib.impl.api_endpoints.default_source_uploader import DefaultSourceUploader from dependency_injector.containers import DeclarativeContainer from dependency_injector.providers import ( # noqa: WOT001 Configuration, @@ -164,23 +165,12 @@ class DependencyContainer(DeclarativeContainer): DefaultDocumentDeleter, rag_api=rag_api, file_service=file_service, key_value_store=key_value_store ) documents_status_retriever = Singleton(DefaultDocumentsStatusRetriever, key_value_store=key_value_store) - confluence_loader = Singleton( - DefaultConfluenceLoader, - extractor_api=document_extractor, - rag_api=rag_api, - key_value_store=key_value_store, - settings=confluence_settings, - information_enhancer=information_enhancer, - information_mapper=information_mapper, - chunker=chunker, - document_deleter=document_deleter, - settings_mapper=confluence_settings_mapper, - ) + document_reference_retriever = Singleton(DefaultDocumentReferenceRetriever, file_service=file_service) - document_uploader = 
Singleton( - DefaultDocumentUploader, - document_extractor=document_extractor, - file_service=file_service, + + source_uploader = Singleton( + DefaultSourceUploader, + extractor_api=document_extractor, rag_api=rag_api, information_enhancer=information_enhancer, information_mapper=information_mapper, diff --git a/admin-api-lib/src/admin_api_lib/impl/admin_api.py b/admin-api-lib/src/admin_api_lib/impl/admin_api.py index 9c24eba..25745c5 100644 --- a/admin-api-lib/src/admin_api_lib/impl/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/impl/admin_api.py @@ -1,7 +1,14 @@ """Module containing the implementation of the Admin API.""" +from dataclasses import Field import logging +from typing import List, Optional +from typing_extensions import Annotated +from pydantic import Field, StrictBytes, StrictStr +from admin_api_lib.api_endpoints.source_uploader import SourceUploader +from admin_api_lib.models.key_value_pair import KeyValuePair +from admin_api_lib.models.upload_source import UploadSource from dependency_injector.wiring import Provide, inject from fastapi import Depends, Request, Response, UploadFile @@ -85,24 +92,16 @@ async def get_all_documents_status( return await document_status_retriever.aget_all_documents_status() @inject - async def load_confluence_post( + async def upload_source( self, - confluence_loader: ConfluenceLoader = Depends(Provide[DependencyContainer.confluence_loader]), + type: StrictStr, + name: StrictStr, + file: Optional[UploadFile], + kwargs: Optional[List[KeyValuePair]], + request: Request, + source_uploader: SourceUploader = Depends(Provide[DependencyContainer.source_uploader]), ) -> None: - """ - Asynchronously loads a Confluence space using the provided ConfluenceLoader. - - Parameters - ---------- - confluence_loader : ConfluenceLoader - The ConfluenceLoader instance to use for loading the post. This is injected by dependency injection - (default is Depends(Provide[DependencyContainer.confluence_loader])). 
- - Returns - ------- - None - """ - await confluence_loader.aload_from_confluence() + await source_uploader.upload_source(str(request.base_url), type, name, file, kwargs) @inject async def document_reference_id_get( @@ -129,28 +128,3 @@ async def document_reference_id_get( The document in binary form. """ return await document_reference_retriever.adocument_reference_id_get(identification) - - @inject - async def upload_documents_post( - self, - body: UploadFile, - request: Request, - document_uploader: DocumentUploader = Depends(Provide[DependencyContainer.document_uploader]), - ) -> None: - """ - Handle the POST request to upload documents. - - Parameters - ---------- - body : UploadFile - The file to be uploaded. - request : Request - The request object containing metadata about the request. - document_uploader : DocumentUploader, optional - The document uploader dependency, by default provided by DependencyContainer. - - Returns - ------- - None - """ - await document_uploader.aupload_documents_post(body, request) diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py new file mode 100644 index 0000000..1b2f31c --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py @@ -0,0 +1,125 @@ +from fastapi import HTTPException +import logging +from typing import Optional +from threading import Thread +import urllib.parse + +from pydantic import StrictStr +from fastapi import UploadFile, status +from langchain_core.documents import Document +from asyncio import run + +from admin_api_lib.models.key_value_pair import KeyValuePair +from admin_api_lib.rag_backend_client.openapi_client.api.rag_api import RagApi +from admin_api_lib.impl.mapper.informationpiece2document import InformationPiece2Document +from admin_api_lib.api_endpoints.document_deleter import DocumentDeleter +from admin_api_lib.api_endpoints.source_uploader
import SourceUploader +from admin_api_lib.chunker.chunker import Chunker +from admin_api_lib.models.status import Status +from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi +from admin_api_lib.impl.key_db.file_status_key_value_store import FileStatusKeyValueStore +from admin_api_lib.information_enhancer.information_enhancer import InformationEnhancer +from admin_api_lib.utils.utils import sanitize_document_name + +logger = logging.getLogger(__name__) + + +class DefaultSourceUploader(SourceUploader): + + def __init__( + self, + extractor_api: ExtractorApi, + key_value_store: FileStatusKeyValueStore, + information_enhancer: InformationEnhancer, + chunker: Chunker, + document_deleter: DocumentDeleter, + rag_api: RagApi, + information_mapper: InformationPiece2Document, + ): + self._extractor_api = extractor_api + self._rag_api = rag_api + self._key_value_store = key_value_store + self._information_mapper = information_mapper + self._information_enhancer = information_enhancer + self._chunker = chunker + self._document_deleter = document_deleter + self._background_threads = [] + + async def upload_source( + self, + base_url: str, + type: StrictStr, + name: StrictStr, + file: Optional[UploadFile], + kwargs: Optional[list[KeyValuePair]], + ) -> None: + self._background_threads = [t for t in self._background_threads if t.is_alive()] + source_name = f"{type}:{sanitize_document_name(name)}" + try: + # TODO: check if document already in processing state + self._key_value_store.upsert( + source_name, Status.PROCESSING + ) # TODO: change to pipeline with timeout to error status + thread = Thread( + target=lambda: run(self._handle_source_upload(source_name, base_url, type, name, file, kwargs)) + ) + thread.start() + self._background_threads.append(thread) + except ValueError as e: + self._key_value_store.upsert(source_name, Status.ERROR) + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(e)) + except Exception as 
 e: + self._key_value_store.upsert(source_name, Status.ERROR) + logger.error("Error while uploading %s = %s", source_name, str(e)) + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e)) + + async def _handle_source_upload( + self, + source_name: str, + base_url: str, + type: StrictStr, + name: StrictStr, + file: Optional[UploadFile], + kwargs: Optional[list[KeyValuePair]], + ): + try: + information_pieces = self._extractor_api.extract(type, name, file, kwargs) + if not information_pieces: + self._key_value_store.upsert(source_name, Status.ERROR) + logger.error("No information pieces found in the document: %s", source_name) + documents = [self._information_mapper.extractor_information_piece2document(x) for x in information_pieces] + + chunked_documents = self._chunker.chunk(documents) + self._add_file_url(type, file, base_url, chunked_documents) + + enhanced_documents = await self._information_enhancer.ainvoke(chunked_documents) + rag_information_pieces = [ + self._information_mapper.document2rag_information_piece(doc) for doc in enhanced_documents + ] + + # Replace old document + await self._document_deleter.adelete_document(source_name) + self._rag_api.upload_information_piece(rag_information_pieces) + self._key_value_store.upsert(source_name, Status.READY) + logger.info("File uploaded successfully: %s", source_name) + except Exception as e: + self._key_value_store.upsert(source_name, Status.ERROR) + logger.error("Error while uploading %s = %s", source_name, str(e)) + + def _add_file_url( + self, type: StrictStr, file: Optional[UploadFile], base_url: str, chunked_documents: list[Document] + ): + if type != "file": + return + + document_url = f"{base_url.rstrip('/')}/document_reference/{urllib.parse.quote_plus(file.filename)}" + for idx, chunk in enumerate(chunked_documents): + if chunk.metadata["id"] in chunk.metadata["related"]: + chunk.metadata["related"].remove(chunk.metadata["id"]) + chunk.metadata.update( + { + "chunk": idx, 
"chunk_length": len(chunk.page_content), + "document_url": document_url, + } + ) From 7e4a9d0f793e47a52b1ffa3225ea503ad788b459 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Tue, 13 May 2025 15:06:49 +0200 Subject: [PATCH 04/43] extractor mostly working --- .../{confluence_extractor.py => extractor.py} | 19 ++-- .../api_endpoints/file_extractor.py | 26 ------ .../extractor_api_lib/apis/extractor_api.py | 14 +-- .../extractor_api_lib/dependency_container.py | 28 +++--- .../__init__.py | 0 .../extractors/information_extractor.py | 43 +++++++++ .../information_file_extractor.py} | 11 +-- .../impl/api_endpoints/default_extractor.py | 68 ++++++++++++++ .../api_endpoints/default_file_extractor.py | 65 -------------- .../impl/document_parser/general_extractor.py | 66 -------------- .../impl/extractor_api_impl.py | 48 ++++------ .../extractors}/__init__.py | 0 .../confluence_extractor.py} | 41 +++++---- .../file_extractors}/__init__.py | 0 .../file_extractors}/ms_docs_extractor.py | 24 ++--- .../file_extractors}/pdf_extractor.py | 18 ++-- .../file_extractors}/xml_extractor.py | 20 +++-- .../impl/extractors/general_file_extractor.py | 90 +++++++++++++++++++ .../impl/file_services/__init__.py | 0 .../internal2external_information_piece.py | 4 +- .../impl/types/extractor_types.py | 9 ++ ...piece.py => internal_information_piece.py} | 2 +- 22 files changed, 328 insertions(+), 268 deletions(-) rename extractor-api-lib/src/extractor_api_lib/api_endpoints/{confluence_extractor.py => extractor.py} (57%) delete mode 100644 extractor-api-lib/src/extractor_api_lib/api_endpoints/file_extractor.py rename extractor-api-lib/src/extractor_api_lib/{document_parser => extractors}/__init__.py (100%) create mode 100644 extractor-api-lib/src/extractor_api_lib/extractors/information_extractor.py rename extractor-api-lib/src/extractor_api_lib/{document_parser/information_extractor.py => extractors/information_file_extractor.py} (78%) create mode 100644 
extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/default_extractor.py delete mode 100644 extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/default_file_extractor.py delete mode 100644 extractor-api-lib/src/extractor_api_lib/impl/document_parser/general_extractor.py rename extractor-api-lib/src/extractor_api_lib/{file_services => impl/extractors}/__init__.py (100%) rename extractor-api-lib/src/extractor_api_lib/impl/{api_endpoints/default_confluence_extractor.py => extractors/confluence_extractor.py} (58%) rename extractor-api-lib/src/extractor_api_lib/impl/{document_parser => extractors/file_extractors}/__init__.py (100%) rename extractor-api-lib/src/extractor_api_lib/impl/{document_parser => extractors/file_extractors}/ms_docs_extractor.py (89%) rename extractor-api-lib/src/extractor_api_lib/impl/{document_parser => extractors/file_extractors}/pdf_extractor.py (94%) rename extractor-api-lib/src/extractor_api_lib/impl/{document_parser => extractors/file_extractors}/xml_extractor.py (83%) create mode 100644 extractor-api-lib/src/extractor_api_lib/impl/extractors/general_file_extractor.py delete mode 100644 extractor-api-lib/src/extractor_api_lib/impl/file_services/__init__.py create mode 100644 extractor-api-lib/src/extractor_api_lib/impl/types/extractor_types.py rename extractor-api-lib/src/extractor_api_lib/models/dataclasses/{information_piece.py => internal_information_piece.py} (92%) diff --git a/extractor-api-lib/src/extractor_api_lib/api_endpoints/confluence_extractor.py b/extractor-api-lib/src/extractor_api_lib/api_endpoints/extractor.py similarity index 57% rename from extractor-api-lib/src/extractor_api_lib/api_endpoints/confluence_extractor.py rename to extractor-api-lib/src/extractor_api_lib/api_endpoints/extractor.py index d1aae80..c3f254b 100644 --- a/extractor-api-lib/src/extractor_api_lib/api_endpoints/confluence_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/api_endpoints/extractor.py @@ -1,16 +1,23 @@ -"""Module for 
the ConfluenceExtractor abstract base class.""" - from abc import ABC, abstractmethod +from typing import Optional + +from pydantic import StrictStr +from fastapi import UploadFile -from extractor_api_lib.models.confluence_parameters import ConfluenceParameters from extractor_api_lib.models.information_piece import InformationPiece +from extractor_api_lib.models.key_value_pair import KeyValuePair -class ConfluenceExtractor(ABC): - """Abstract base class for extract_from_confluence endpoint.""" +class Extractor(ABC): @abstractmethod - async def aextract_from_confluence(self, confluence_parameters: ConfluenceParameters) -> list[InformationPiece]: + async def aextract_information( + self, + type: StrictStr, + name: StrictStr, + file: Optional[UploadFile], + kwargs: Optional[list[KeyValuePair]], + ) -> list[InformationPiece]: """ Extract information from confluence, using the given confluence parameters. diff --git a/extractor-api-lib/src/extractor_api_lib/api_endpoints/file_extractor.py b/extractor-api-lib/src/extractor_api_lib/api_endpoints/file_extractor.py deleted file mode 100644 index 523f159..0000000 --- a/extractor-api-lib/src/extractor_api_lib/api_endpoints/file_extractor.py +++ /dev/null @@ -1,26 +0,0 @@ -"""Module for the FileExtractor abstract base class.""" - -from abc import ABC, abstractmethod - -from extractor_api_lib.models.extraction_request import ExtractionRequest -from extractor_api_lib.models.information_piece import InformationPiece - - -class FileExtractor(ABC): - """Abstract base class for extract_information endpoint.""" - - @abstractmethod - async def aextract_information(self, extraction_request: ExtractionRequest) -> list[InformationPiece]: - """ - Extract information of a document, given by the extraction_request. - - Parameters - ---------- - extraction_request : ExtractionRequest - The request containing the details of the document to be processed for information extraction. 
- - Returns - ------- - list[InformationPiece] - A list of extracted information pieces from the document. - """ diff --git a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py index eee5ada..0cbdc2b 100644 --- a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py +++ b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py @@ -1,11 +1,11 @@ # coding: utf-8 -from typing import Dict, List # noqa: F401 +from typing import Annotated, Dict, List # noqa: F401 import importlib import pkgutil from extractor_api_lib.apis.extractor_api_base import BaseExtractorApi -import openapi_server.impl +import extractor_api_lib.impl from fastapi import ( # noqa: F401 APIRouter, @@ -32,7 +32,7 @@ router = APIRouter() -ns_pkg = openapi_server.impl +ns_pkg = extractor_api_lib.impl for _, name, _ in pkgutil.iter_modules(ns_pkg.__path__, ns_pkg.__name__ + "."): importlib.import_module(name) @@ -48,10 +48,10 @@ response_model_by_alias=True, ) async def extract( - type: StrictStr = Form(None, description=""), - name: StrictStr = Form(None, description=""), - file: Optional[UploadFile] = Form(None, description=""), - kwargs: Optional[List[KeyValuePair]] = Form(None, description=""), + type: Annotated[str, Form()], + name: Annotated[str, Form()], + file: Optional[UploadFile] = None, + kwargs: Optional[Annotated[List[KeyValuePair], Form()]]=None, ) -> List[InformationPiece]: if not BaseExtractorApi.subclasses: raise HTTPException(status_code=500, detail="Not implemented") diff --git a/extractor-api-lib/src/extractor_api_lib/dependency_container.py b/extractor-api-lib/src/extractor_api_lib/dependency_container.py index e3bcaf1..2c5c53f 100644 --- a/extractor-api-lib/src/extractor_api_lib/dependency_container.py +++ b/extractor-api-lib/src/extractor_api_lib/dependency_container.py @@ -3,16 +3,12 @@ from dependency_injector.containers import DeclarativeContainer from dependency_injector.providers import 
List, Singleton # noqa: WOT001 -from extractor_api_lib.impl.api_endpoints.default_confluence_extractor import ( - DefaultConfluenceExtractor, -) -from extractor_api_lib.impl.api_endpoints.default_file_extractor import ( - DefaultFileExtractor, -) -from extractor_api_lib.impl.document_parser.general_extractor import GeneralExtractor -from extractor_api_lib.impl.document_parser.ms_docs_extractor import MSDocsExtractor -from extractor_api_lib.impl.document_parser.pdf_extractor import PDFExtractor -from extractor_api_lib.impl.document_parser.xml_extractor import XMLExtractor +from extractor_api_lib.impl.api_endpoints.default_extractor import DefaultExtractor +from extractor_api_lib.impl.extractors.confluence_extractor import ConfluenceExtractor +from extractor_api_lib.impl.extractors.file_extractors.ms_docs_extractor import MSDocsExtractor +from extractor_api_lib.impl.extractors.file_extractors.pdf_extractor import PDFExtractor +from extractor_api_lib.impl.extractors.file_extractors.xml_extractor import XMLExtractor +from extractor_api_lib.impl.extractors.general_file_extractor import GeneralFileExtractor from extractor_api_lib.impl.file_services.s3_service import S3Service from extractor_api_lib.impl.mapper.confluence_langchain_document2information_piece import ( ConfluenceLangchainDocument2InformationPiece, @@ -40,11 +36,13 @@ class DependencyContainer(DeclarativeContainer): intern2external = Singleton(Internal2ExternalInformationPiece) langchain_document2information_piece = Singleton(ConfluenceLangchainDocument2InformationPiece) - all_extractors = List(pdf_extractor, ms_docs_extractor, xml_extractor) + file_extractors = List(pdf_extractor, ms_docs_extractor, xml_extractor) - general_extractor = Singleton(GeneralExtractor, file_service, all_extractors) + general_file_extractor = Singleton(GeneralFileExtractor, file_service, file_extractors) + confluence_extractor = Singleton(ConfluenceExtractor, mapper=langchain_document2information_piece) - file_extractor = 
Singleton( - DefaultFileExtractor, information_extractor=general_extractor, file_service=file_service, mapper=intern2external + default_extractor = Singleton( + DefaultExtractor, + mapper=intern2external, + available_extractors=List(general_file_extractor, confluence_extractor), ) - confluence_extractor = Singleton(DefaultConfluenceExtractor, mapper=langchain_document2information_piece) diff --git a/extractor-api-lib/src/extractor_api_lib/document_parser/__init__.py b/extractor-api-lib/src/extractor_api_lib/extractors/__init__.py similarity index 100% rename from extractor-api-lib/src/extractor_api_lib/document_parser/__init__.py rename to extractor-api-lib/src/extractor_api_lib/extractors/__init__.py diff --git a/extractor-api-lib/src/extractor_api_lib/extractors/information_extractor.py b/extractor-api-lib/src/extractor_api_lib/extractors/information_extractor.py new file mode 100644 index 0000000..eeaadf1 --- /dev/null +++ b/extractor-api-lib/src/extractor_api_lib/extractors/information_extractor.py @@ -0,0 +1,43 @@ +"""Module for the Base class for Information extractors.""" + +from abc import ABC, abstractmethod +from typing import Optional + + +from fastapi import UploadFile +from pydantic import StrictStr + +from extractor_api_lib.impl.types.extractor_types import ExtractorTypes +from extractor_api_lib.models.information_piece import InformationPiece +from extractor_api_lib.models.key_value_pair import KeyValuePair +from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece + + +class InformationExtractor(ABC): + """Base class for Information extractors.""" + + @property + @abstractmethod + def extractor_type(self) -> ExtractorTypes: ... + + @abstractmethod + async def aextract_content( + self, + type: StrictStr, + name: StrictStr, + file: Optional[UploadFile], + kwargs: Optional[list[KeyValuePair]], + ) -> list[InternalInformationPiece]: + """ + Extract content from given file. 
+ + Parameters + ---------- + file_path : Path + Path to the file the information should be extracted from. + + Returns + ------- + list[InformationPiece] + The extracted information. + """ diff --git a/extractor-api-lib/src/extractor_api_lib/document_parser/information_extractor.py b/extractor-api-lib/src/extractor_api_lib/extractors/information_file_extractor.py similarity index 78% rename from extractor-api-lib/src/extractor_api_lib/document_parser/information_extractor.py rename to extractor-api-lib/src/extractor_api_lib/extractors/information_file_extractor.py index 0c3c4ce..8b54f1c 100644 --- a/extractor-api-lib/src/extractor_api_lib/document_parser/information_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/extractors/information_file_extractor.py @@ -3,13 +3,14 @@ from abc import ABC, abstractmethod from pathlib import Path -from extractor_api_lib.file_services.file_service import FileService +from extractor_api_lib.models.information_piece import InformationPiece from extractor_api_lib.impl.types.file_type import FileType -from extractor_api_lib.models.dataclasses.information_piece import InformationPiece +from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece +from extractor_api_lib.file_services.file_service import FileService -class InformationExtractor(ABC): - """Base class for Information extractors.""" +class InformationFileExtractor(ABC): + """Base class for Information file extractors.""" def __init__(self, file_service: FileService): """Initialize the InformationExtractor. @@ -34,7 +35,7 @@ def compatible_file_types(self) -> list[FileType]: """ @abstractmethod - def extract_content(self, file_path: Path) -> list[InformationPiece]: + async def aextract_content(self, file_path: Path) -> list[InternalInformationPiece]: """ Extract content from given file. 
diff --git a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/default_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/default_extractor.py new file mode 100644 index 0000000..b485c1e --- /dev/null +++ b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/default_extractor.py @@ -0,0 +1,68 @@ +"""Module for the DefaultFileExtractor class.""" + +import logging +from typing import Optional + +from pydantic import StrictStr +from fastapi import UploadFile + +from extractor_api_lib.extractors.information_extractor import InformationExtractor +from extractor_api_lib.models.information_piece import InformationPiece +from extractor_api_lib.models.key_value_pair import KeyValuePair +from extractor_api_lib.impl.mapper.internal2external_information_piece import Internal2ExternalInformationPiece +from extractor_api_lib.api_endpoints.extractor import Extractor +from extractor_api_lib.impl.mapper.internal2external_information_piece import Internal2ExternalInformationPiece +from extractor_api_lib.models.information_piece import InformationPiece +from extractor_api_lib.models.key_value_pair import KeyValuePair +from extractor_api_lib.impl.types.extractor_types import ExtractorTypes +from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece + + +logger = logging.getLogger(__name__) + + +class DefaultExtractor(Extractor): + """A class to extract information from documents using available extractors. + + This class serves as a general extractor that utilizes a list of available + information extractors to extract content from documents. It determines the + appropriate extractor based on the file type of the document. + """ + + def __init__(self, available_extractors: list[InformationExtractor], mapper: Internal2ExternalInformationPiece): + """ + Initialize the GeneralExtractor. 
+ + Parameters + ---------- + available_extractors : list of InformationExtractor + A list of available information extractors to be used by the GeneralExtractor. + """ + self._mapper = mapper + self._available_extractors = available_extractors + + async def aextract_information( + self, + type: StrictStr, + name: StrictStr, + file: Optional[UploadFile], + kwargs: Optional[list[KeyValuePair]], + ) -> list[InformationPiece]: + """ + Extract content from given file. + + Parameters + ---------- + file_path : Path + Path to the file the information should be extracted from. + + Returns + ------- + list[InformationPiece] + The extracted information. + """ + correct_extractors = [x for x in self._available_extractors if type == x.extractor_type] + if not correct_extractors: + raise ValueError(f"No extractor found for type {type}") + results = await correct_extractors[-1].aextract_content(type, name, file, kwargs) + return [self._mapper.map_internal_to_external(x) for x in results if x.page_content is not None] diff --git a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/default_file_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/default_file_extractor.py deleted file mode 100644 index 787997b..0000000 --- a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/default_file_extractor.py +++ /dev/null @@ -1,65 +0,0 @@ -"""Module for the DefaultFileExtractor class.""" - -import tempfile -from pathlib import Path - -from extractor_api_lib.api_endpoints.file_extractor import FileExtractor -from extractor_api_lib.document_parser.information_extractor import InformationExtractor -from extractor_api_lib.file_services.file_service import FileService -from extractor_api_lib.impl.mapper.internal2external_information_piece import ( - Internal2ExternalInformationPiece, -) -from extractor_api_lib.models.extraction_request import ExtractionRequest -from extractor_api_lib.models.information_piece import InformationPiece - - -class 
DefaultFileExtractor(FileExtractor): - """Default implementation of the FileExtractor interface.""" - - def __init__( - self, - information_extractor: InformationExtractor, - file_service: FileService, - mapper: Internal2ExternalInformationPiece, - ): - """ - Initialize the DefaultFileExtractor. - - Parameters - ---------- - information_extractor : InformationExtractor - An instance of InformationExtractor to extract information from files. - file_service : FileService - An instance of FileService to handle file operations. - mapper : Internal2ExternalInformationPiece - An instance of Internal2ExternalInformationPiece to map internal information to external format. - """ - self.information_extractor = information_extractor - self.file_service = file_service - self.mapper = mapper - - async def aextract_information( - self, - extraction_request: ExtractionRequest, - ) -> list[InformationPiece]: - """ - Extract information from a document specified in the extraction request. - - Parameters - ---------- - extraction_request : ExtractionRequest - The request containing details about the document to be extracted, including its path on S3. - - Returns - ------- - list[InformationPiece] - A list of extracted information pieces from the document, where each piece contains non-null page content. 
- """ - with tempfile.TemporaryDirectory() as temp_dir: - temp_file_path = Path(temp_dir) / extraction_request.path_on_s3 - - with open(temp_file_path, "wb") as temp_file: - self.file_service.download_file(extraction_request.path_on_s3, temp_file) - - results = self.information_extractor.extract_content(temp_file_path) - return [self.mapper.map_internal_to_external(x) for x in results if x.page_content is not None] diff --git a/extractor-api-lib/src/extractor_api_lib/impl/document_parser/general_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/document_parser/general_extractor.py deleted file mode 100644 index 05946bf..0000000 --- a/extractor-api-lib/src/extractor_api_lib/impl/document_parser/general_extractor.py +++ /dev/null @@ -1,66 +0,0 @@ -"""Module for the GeneralExtractor class.""" - -from pathlib import Path - -from extractor_api_lib.document_parser.information_extractor import InformationExtractor -from extractor_api_lib.file_services.file_service import FileService -from extractor_api_lib.impl.types.file_type import FileType -from extractor_api_lib.models.dataclasses.information_piece import InformationPiece - - -class GeneralExtractor(InformationExtractor): - """A class to extract information from documents using available extractors. - - This class serves as a general extractor that utilizes a list of available - information extractors to extract content from documents. It determines the - appropriate extractor based on the file type of the document. - """ - - def __init__(self, file_service: FileService, available_extractors: list[InformationExtractor]): - """ - Initialize the GeneralExtractor. - - Parameters - ---------- - file_service : FileService - An instance of FileService to handle file operations. - available_extractors : list of InformationExtractor - A list of available information extractors to be used by the GeneralExtractor. 
- """ - super().__init__(file_service=file_service) - - self._available_extractors = available_extractors - - @property - def compatible_file_types(self) -> list[FileType]: - """ - List of compatible file types for the document parser. - - Returns - ------- - list[FileType] - A list containing the compatible file types. By default, it returns a list with FileType.NONE. - """ - return [FileType.NONE] - - def extract_content(self, file_path: Path) -> list[InformationPiece]: - """ - Extract content from given file. - - Parameters - ---------- - file_path : Path - Path to the file the information should be extracted from. - - Returns - ------- - list[InformationPiece] - The extracted information. - """ - file_type = str(file_path).split(".")[-1].upper() - correct_extractors = [ - x for x in self._available_extractors if file_type in [y.value for y in x.compatible_file_types] - ] - if not correct_extractors: - raise ValueError(f"No extractor found for file-ending {file_type}") - return correct_extractors[-1].extract_content(file_path) diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py b/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py index d4a3760..bfe9393 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py @@ -1,14 +1,16 @@ """Module for the implementation of the ExtractorApi interface.""" from dependency_injector.wiring import Provide, inject -from fastapi import Depends +from extractor_api_lib.api_endpoints.extractor import Extractor +from fastapi import Depends, UploadFile + +from pydantic import StrictStr +from typing import Optional +from extractor_api_lib.models.information_piece import InformationPiece +from extractor_api_lib.models.key_value_pair import KeyValuePair -from extractor_api_lib.api_endpoints.confluence_extractor import ConfluenceExtractor -from extractor_api_lib.api_endpoints.file_extractor import 
FileExtractor from extractor_api_lib.apis.extractor_api_base import BaseExtractorApi from extractor_api_lib.dependency_container import DependencyContainer -from extractor_api_lib.models.confluence_parameters import ConfluenceParameters -from extractor_api_lib.models.extraction_request import ExtractionRequest from extractor_api_lib.models.information_piece import InformationPiece @@ -16,13 +18,16 @@ class ExtractorApiImpl(BaseExtractorApi): """Implementation of the ExtractorApi interface.""" @inject - async def extract_from_file_post( + async def extract( self, - extraction_request: ExtractionRequest, - file_extractor: FileExtractor = Depends(Provide[DependencyContainer.file_extractor]), + type: StrictStr, + name: StrictStr, + file: Optional[UploadFile], + kwargs: Optional[list[KeyValuePair]], + extractor: Extractor = Depends(Provide[DependencyContainer.default_extractor]), ) -> list[InformationPiece]: """ - Extract information from a file based on the provided extraction request. + Extract information from a source. Parameters ---------- @@ -36,27 +41,4 @@ async def extract_from_file_post( list[InformationPiece] A list of extracted information pieces. """ - return await file_extractor.aextract_information(extraction_request) - - @inject - async def extract_from_confluence_post( - self, - confluence_parameters: ConfluenceParameters, - confluence_extractor: ConfluenceExtractor = Depends(Provide[DependencyContainer.confluence_extractor]), - ) -> list[InformationPiece]: - """ - Extract information from Confluence asynchronously. - - Parameters - ---------- - confluence_parameters : ConfluenceParameters - Parameters required to extract information from Confluence. - confluence_extractor : ConfluenceExtractor, optional - The Confluence extractor instance (default is provided by DependencyContainer). - - Returns - ------- - list[InformationPiece] - A list of extracted information pieces from the configured Confluence space. 
- """ - return await confluence_extractor.aextract_from_confluence(confluence_parameters) + return await extractor.aextract_information(type, name, file, kwargs) diff --git a/extractor-api-lib/src/extractor_api_lib/file_services/__init__.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/__init__.py similarity index 100% rename from extractor-api-lib/src/extractor_api_lib/file_services/__init__.py rename to extractor-api-lib/src/extractor_api_lib/impl/extractors/__init__.py diff --git a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/default_confluence_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py similarity index 58% rename from extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/default_confluence_extractor.py rename to extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py index b752f6c..1f7c666 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/default_confluence_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py @@ -1,20 +1,24 @@ """Module for the DefaultConfluenceExtractor class.""" +from typing import Optional + +from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece +from pydantic import StrictStr from langchain_community.document_loaders import ConfluenceLoader +from fastapi import UploadFile -from extractor_api_lib.api_endpoints.confluence_extractor import ConfluenceExtractor +from extractor_api_lib.impl.types.extractor_types import ExtractorTypes +from extractor_api_lib.models.information_piece import InformationPiece +from extractor_api_lib.models.key_value_pair import KeyValuePair +from extractor_api_lib.extractors.information_extractor import InformationExtractor from extractor_api_lib.impl.mapper.confluence_langchain_document2information_piece import ( ConfluenceLangchainDocument2InformationPiece, ) -from 
extractor_api_lib.models.confluence_parameters import ConfluenceParameters -from extractor_api_lib.models.information_piece import InformationPiece -class DefaultConfluenceExtractor(ConfluenceExtractor): +class ConfluenceExtractor(InformationExtractor): """Default implementation of the FileExtractor interface.""" - MIN_PAGE_CONTENT_LENGTH = 10 - def __init__( self, mapper: ConfluenceLangchainDocument2InformationPiece, @@ -30,7 +34,17 @@ def __init__( """ self.mapper = mapper - async def aextract_from_confluence(self, confluence_parameters: ConfluenceParameters) -> list[InformationPiece]: + @property + def extractor_type(self) -> ExtractorTypes: + return ExtractorTypes.CONFLUENCE + + async def aextract_content( + self, + type: StrictStr, + name: StrictStr, + file: Optional[UploadFile], + kwargs: Optional[list[KeyValuePair]], + ) -> list[InternalInformationPiece]: """ Asynchronously extracts information pieces from Confluence. @@ -41,17 +55,14 @@ async def aextract_from_confluence(self, confluence_parameters: ConfluenceParame Returns ------- - list[InformationPiece] + list[InternalInformationPiece] A list of information pieces extracted from Confluence. 
""" - self.mapper.confluence_parameters = confluence_parameters - confluence_kwargs = {} - for ckwargs in confluence_parameters.confluence_kwargs: - confluence_kwargs[ckwargs.key] = ckwargs.value - confluence_loader_parameters = confluence_parameters.model_dump() - confluence_loader_parameters["confluence_kwargs"] = confluence_kwargs + # Convert list of key value pairs to dict + confluence_loader_parameters = {x.key: x.value for x in kwargs} # Drop the document_name parameter as it is not used by the ConfluenceLoader - confluence_loader_parameters.pop("document_name", None) + if "document_name" in confluence_loader_parameters: + confluence_loader_parameters.pop("document_name", None) document_loader = ConfluenceLoader(**confluence_loader_parameters) documents = document_loader.load() return [self.mapper.map_document2informationpiece(x) for x in documents] diff --git a/extractor-api-lib/src/extractor_api_lib/impl/document_parser/__init__.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/__init__.py similarity index 100% rename from extractor-api-lib/src/extractor_api_lib/impl/document_parser/__init__.py rename to extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/__init__.py diff --git a/extractor-api-lib/src/extractor_api_lib/impl/document_parser/ms_docs_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/ms_docs_extractor.py similarity index 89% rename from extractor-api-lib/src/extractor_api_lib/impl/document_parser/ms_docs_extractor.py rename to extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/ms_docs_extractor.py index 8bb23ca..cb04681 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/document_parser/ms_docs_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/ms_docs_extractor.py @@ -6,22 +6,26 @@ from typing import Any, Optional import pandas as pd + from unstructured.documents.elements import Element from 
unstructured.partition.docx import partition_docx from unstructured.partition.pptx import partition_pptx -from extractor_api_lib.document_parser.information_extractor import InformationExtractor + + from extractor_api_lib.file_services.file_service import FileService +from extractor_api_lib.models.information_piece import InformationPiece +from extractor_api_lib.extractors.information_file_extractor import InformationFileExtractor from extractor_api_lib.impl.types.content_type import ContentType from extractor_api_lib.impl.types.file_type import FileType from extractor_api_lib.impl.utils.utils import hash_datetime -from extractor_api_lib.models.dataclasses.information_piece import InformationPiece +from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece from extractor_api_lib.table_converter.dataframe_converter import DataframeConverter logger = logging.getLogger(__name__) -class MSDocsExtractor(InformationExtractor): +class MSDocsExtractor(InformationFileExtractor): """Extractor for Microsoft Documents (DOCX and PPTX) using unstructured library.""" def __init__(self, file_service: FileService, dataframe_converter: DataframeConverter): @@ -50,7 +54,7 @@ def compatible_file_types(self) -> list[FileType]: """ return [FileType.DOCX, FileType.PPTX] - def extract_content(self, file_path: Path) -> list[InformationPiece]: + async def aextract_content(self, file_path: Path) -> list[InternalInformationPiece]: """ Extract content from a given file based on its extension. 
@@ -92,8 +96,8 @@ def extract_content(self, file_path: Path) -> list[InformationPiece]: return self._process_elements(elements, file_path.name) - def _process_elements(self, elements: list[Element], document_name: str) -> list[InformationPiece]: - processed_elements: list[InformationPiece] = [] + def _process_elements(self, elements: list[Element], document_name: str) -> list[InternalInformationPiece]: + processed_elements: list[InternalInformationPiece] = [] page_content_lines: list[tuple[str, str]] = [] current_page: int = 1 old_page: int = 1 @@ -118,7 +122,7 @@ def _process_element( self, el: Element, page_content_lines: list[tuple[str, str]], - processed_elements: list[InformationPiece], + processed_elements: list[InternalInformationPiece], document_name: str, current_page: int, ) -> None: @@ -154,7 +158,7 @@ def _process_table(self, el: Element, page_content_lines: list[tuple[str, str]]) def _create_text_piece( self, document_name: str, page: int, page_content_lines: list[tuple[str, str]] - ) -> InformationPiece: + ) -> InternalInformationPiece: content = "\n".join([content for _, content in page_content_lines]) return self._create_information_piece(document_name, page, content, ContentType.TEXT) @@ -165,7 +169,7 @@ def _create_information_piece( content: str, content_type: ContentType, additional_meta: Optional[dict[str, Any]] = None, - ) -> InformationPiece: + ) -> InternalInformationPiece: metadata = { "document": document_name, "page": page, @@ -174,7 +178,7 @@ def _create_information_piece( } if additional_meta: metadata.update(additional_meta) - return InformationPiece( + return InternalInformationPiece( type=content_type, metadata=metadata, page_content=content, diff --git a/extractor-api-lib/src/extractor_api_lib/impl/document_parser/pdf_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/pdf_extractor.py similarity index 94% rename from extractor-api-lib/src/extractor_api_lib/impl/document_parser/pdf_extractor.py 
rename to extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/pdf_extractor.py index beaee14..01eb6bf 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/document_parser/pdf_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/pdf_extractor.py @@ -14,19 +14,21 @@ from pdf2image import convert_from_path from pdfplumber.page import Page -from extractor_api_lib.document_parser.information_extractor import InformationExtractor -from extractor_api_lib.file_services.file_service import FileService + from extractor_api_lib.impl.settings.pdf_extractor_settings import PDFExtractorSettings from extractor_api_lib.impl.types.content_type import ContentType from extractor_api_lib.impl.types.file_type import FileType from extractor_api_lib.impl.utils.utils import hash_datetime -from extractor_api_lib.models.dataclasses.information_piece import InformationPiece +from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece from extractor_api_lib.table_converter.dataframe_converter import DataframeConverter +from extractor_api_lib.models.information_piece import InformationPiece +from extractor_api_lib.file_services.file_service import FileService +from extractor_api_lib.extractors.information_file_extractor import InformationFileExtractor logger = logging.getLogger(__name__) -class PDFExtractor(InformationExtractor): +class PDFExtractor(InformationFileExtractor): """PDFExtractor is a class responsible for extracting information from PDF files. 
It converts PDF pages to images, identifies table/figure coordinates, and extracts @@ -86,7 +88,7 @@ def _create_information_piece( content_type: ContentType, information_id: str, additional_meta: Optional[dict] = None, - ) -> InformationPiece: + ) -> InternalInformationPiece: metadata = { "document": document_name, "page": page, @@ -96,13 +98,13 @@ def _create_information_piece( } if additional_meta: metadata = metadata | additional_meta - return InformationPiece( + return InternalInformationPiece( type=content_type, metadata=metadata, page_content=content, ) - def extract_content(self, file_path: Path) -> list[InformationPiece]: + async def aextract_content(self, file_path: Path) -> list[InternalInformationPiece]: """Extract content from given file. Parameters @@ -147,7 +149,7 @@ def _extract_tabluar_data( document_name: str, text_x_tolerance: int = 1, text_y_tolerance: int = 1, - ) -> list[InformationPiece]: + ) -> list[InternalInformationPiece]: return_value = [] pdfplumber_tables = page.find_tables() table_strings = [] diff --git a/extractor-api-lib/src/extractor_api_lib/impl/document_parser/xml_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/xml_extractor.py similarity index 83% rename from extractor-api-lib/src/extractor_api_lib/impl/document_parser/xml_extractor.py rename to extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/xml_extractor.py index 3478cab..2a9d21c 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/document_parser/xml_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/xml_extractor.py @@ -5,20 +5,22 @@ from pathlib import Path from typing import Any, Optional + from unstructured.documents.elements import Element from unstructured.partition.xml import partition_xml -from extractor_api_lib.document_parser.information_extractor import InformationExtractor from extractor_api_lib.file_services.file_service import FileService +from 
extractor_api_lib.extractors.information_file_extractor import InformationFileExtractor +from extractor_api_lib.models.information_piece import InformationPiece from extractor_api_lib.impl.types.content_type import ContentType from extractor_api_lib.impl.types.file_type import FileType from extractor_api_lib.impl.utils.utils import hash_datetime -from extractor_api_lib.models.dataclasses.information_piece import InformationPiece +from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece logger = logging.getLogger(__name__) -class XMLExtractor(InformationExtractor): +class XMLExtractor(InformationFileExtractor): """Extractor for XML documents using unstructured library.""" def __init__(self, file_service: FileService): @@ -43,7 +45,7 @@ def compatible_file_types(self) -> list[FileType]: """ return [FileType.XML] - def extract_content(self, file_path: Path) -> list[InformationPiece]: + async def aextract_content(self, file_path: Path) -> list[InternalInformationPiece]: """ Extract content from an XML file and processes the elements. 
@@ -60,8 +62,8 @@ def extract_content(self, file_path: Path) -> list[InformationPiece]: elements = partition_xml(filename=file_path.as_posix(), xml_keep_tags=False) return self._process_elements(elements, file_path.name) - def _process_elements(self, elements: list[Element], document_name: str) -> list[InformationPiece]: - processed_elements: list[InformationPiece] = [] + def _process_elements(self, elements: list[Element], document_name: str) -> list[InternalInformationPiece]: + processed_elements: list[InternalInformationPiece] = [] content_lines: list[tuple[str, str]] = [] for el in elements: @@ -86,7 +88,7 @@ def _sanitize_text(self, text: str) -> str: text = re.sub(r"\s+", " ", text) return text.strip() - def _create_text_piece(self, document_name: str, content_lines: list[tuple[str, str]]) -> InformationPiece: + def _create_text_piece(self, document_name: str, content_lines: list[tuple[str, str]]) -> InternalInformationPiece: content = "\n".join([content for _, content in content_lines]) return self._create_information_piece(document_name, content, ContentType.TEXT) @@ -96,7 +98,7 @@ def _create_information_piece( content: str, content_type: ContentType, additional_meta: Optional[dict[str, Any]] = None, - ) -> InformationPiece: + ) -> InternalInformationPiece: metadata = { "document": document_name, "id": hash_datetime(), @@ -104,7 +106,7 @@ def _create_information_piece( } if additional_meta: metadata.update(additional_meta) - return InformationPiece( + return InternalInformationPiece( type=content_type, metadata=metadata, page_content=content, diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractors/general_file_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/general_file_extractor.py new file mode 100644 index 0000000..dfb7031 --- /dev/null +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractors/general_file_extractor.py @@ -0,0 +1,90 @@ +"""Module for the GeneralExtractor class.""" + +import logging +from pathlib 
import Path +import tempfile +import traceback +from typing import Any, List, Optional + + +from pydantic import StrictStr +from fastapi import UploadFile + +from extractor_api_lib.file_services.file_service import FileService +from extractor_api_lib.extractors.information_file_extractor import InformationFileExtractor +from extractor_api_lib.extractors.information_extractor import InformationExtractor +from extractor_api_lib.impl.types.extractor_types import ExtractorTypes +from extractor_api_lib.models.information_piece import InformationPiece +from extractor_api_lib.models.key_value_pair import KeyValuePair +from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece + +logger = logging.getLogger(__name__) + + +class GeneralFileExtractor(InformationExtractor): + """A class to extract information from documents using available extractors. + + This class serves as a general extractor that utilizes a list of available + information extractors to extract content from documents. It determines the + appropriate extractor based on the file type of the document. + """ + + def __init__(self, file_service: FileService, available_extractors: list[InformationFileExtractor]): + """ + Initialize the GeneralFileExtractor. + + Parameters + ---------- + file_service : FileService + An instance of FileService to handle file operations. + available_extractors : list of InformationExtractor + A list of available information extractors to be used by the GeneralFileExtractor. + """ + self._file_service=file_service + self._available_extractors = available_extractors + + @property + def extractor_type(self) -> ExtractorTypes: + return ExtractorTypes.FILE + + async def aextract_content( + self, + type: StrictStr, + name: StrictStr, + file: Optional[UploadFile], + kwargs: Optional[List[KeyValuePair]], + ) -> list[InternalInformationPiece]: + """ + Extract content from given file.
+ + Parameters + ---------- + file_path : Path + Path to the file the information should be extracted from. + + Returns + ------- + list[InformationPiece] + The extracted information. + """ + # save file on s3 + content = await file.read() + filename = file.filename + try: + with tempfile.TemporaryDirectory() as temp_dir: + temp_file_path = Path(temp_dir) / filename + with open(temp_file_path, "wb") as temp_file: + logger.debug("Temporary file created at %s.", temp_file_path) + temp_file.write(content) + logger.debug("Temp file created and content written.") + self._file_service.upload_file(temp_file_path, filename) + file_type = str(temp_file_path).split(".")[-1].upper() + correct_extractors = [ + x for x in self._available_extractors if file_type in [y.value for y in x.compatible_file_types] + ] + if not correct_extractors: + raise ValueError(f"No extractor found for file-ending {file_type}") + return await correct_extractors[-1].aextract_content(temp_file_path) + except Exception as e: + logger.error("Error during document parsing: %s %s", e, traceback.format_exc()) + raise e diff --git a/extractor-api-lib/src/extractor_api_lib/impl/file_services/__init__.py b/extractor-api-lib/src/extractor_api_lib/impl/file_services/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/extractor-api-lib/src/extractor_api_lib/impl/mapper/internal2external_information_piece.py b/extractor-api-lib/src/extractor_api_lib/impl/mapper/internal2external_information_piece.py index a4da430..11f57b4 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/mapper/internal2external_information_piece.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/mapper/internal2external_information_piece.py @@ -2,8 +2,8 @@ from extractor_api_lib.impl.types.content_type import ContentType as InternalContentType from extractor_api_lib.models.content_type import ContentType as ExternalContentType -from extractor_api_lib.models.dataclasses.information_piece import ( - 
InformationPiece as InternalInformationPiece, +from extractor_api_lib.models.dataclasses.internal_information_piece import ( + InternalInformationPiece as InternalInformationPiece, ) from extractor_api_lib.models.information_piece import ( InformationPiece as ExternalInformationPiece, diff --git a/extractor-api-lib/src/extractor_api_lib/impl/types/extractor_types.py b/extractor-api-lib/src/extractor_api_lib/impl/types/extractor_types.py new file mode 100644 index 0000000..8a9a403 --- /dev/null +++ b/extractor-api-lib/src/extractor_api_lib/impl/types/extractor_types.py @@ -0,0 +1,9 @@ +from enum import StrEnum + + +class ExtractorTypes(StrEnum): + """Enum describing the type of information source.""" + + FILE = "file" + CONFLUENCE = "confluence" + NONE = "None" diff --git a/extractor-api-lib/src/extractor_api_lib/models/dataclasses/information_piece.py b/extractor-api-lib/src/extractor_api_lib/models/dataclasses/internal_information_piece.py similarity index 92% rename from extractor-api-lib/src/extractor_api_lib/models/dataclasses/information_piece.py rename to extractor-api-lib/src/extractor_api_lib/models/dataclasses/internal_information_piece.py index 7bd609a..f0699e4 100644 --- a/extractor-api-lib/src/extractor_api_lib/models/dataclasses/information_piece.py +++ b/extractor-api-lib/src/extractor_api_lib/models/dataclasses/internal_information_piece.py @@ -6,7 +6,7 @@ @dataclasses.dataclass -class InformationPiece: +class InternalInformationPiece: """Dataclass holding the information found in a document.""" type: ContentType # noqa: A003 # type of the information From b32d7c3e56302faa88115b96c511d678177b51b8 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Wed, 14 May 2025 15:39:00 +0200 Subject: [PATCH 05/43] it works --- admin-api-lib/poetry.lock | 2 +- admin-api-lib/pyproject.toml | 1 + .../api_endpoints/confluence_loader.py | 19 - .../api_endpoints/document_uploader.py | 26 - .../src/admin_api_lib/apis/admin_api.py | 15 +- 
.../src/admin_api_lib/apis/admin_api_base.py | 1 + .../src/admin_api_lib/dependency_container.py | 29 +- .../extractor_api_client.py | 50 ++ .../{openapi_client => }/models/__init__.py | 6 +- .../models/content_type.py | 0 .../models/information_piece.py | 4 +- .../models/key_value_pair.py | 0 .../openapi_client/__init__.py | 36 - .../openapi_client/api/__init__.py | 4 - .../openapi_client/api/extractor_api.py | 323 -------- .../openapi_client/api_client.py | 695 ------------------ .../openapi_client/api_response.py | 20 - .../openapi_client/configuration.py | 460 ------------ .../openapi_client/exceptions.py | 197 ----- .../models/confluence_parameters.py | 137 ---- .../models/extraction_request.py | 101 --- .../openapi_client/rest.py | 209 ------ .../openapi_client/test/__init__.py | 0 .../openapi_client/test/test_content_type.py | 35 - .../test/test_extraction_request.py | 58 -- .../openapi_client/test/test_extractor_api.py | 35 - .../test/test_information_piece.py | 62 -- .../test/test_key_value_pair.py | 54 -- .../admin_api_lib/file_services/__init__.py | 0 .../src/admin_api_lib/impl/admin_api.py | 5 +- .../default_confluence_loader.py | 195 ----- .../default_document_uploader.py | 192 ----- .../api_endpoints/default_source_uploader.py | 35 +- .../impl/mapper/confluence_settings_mapper.py | 36 - .../impl/mapper/informationpiece2document.py | 4 +- .../impl/settings/confluence_settings.py | 170 ----- extractor-api-lib/openapi.yaml | 3 +- .../extractor_api_lib/apis/extractor_api.py | 2 +- .../extractors/information_file_extractor.py | 6 +- .../file_extractors/ms_docs_extractor.py | 6 +- .../file_extractors/pdf_extractor.py | 8 +- .../file_extractors/xml_extractor.py | 4 +- .../impl/extractors/general_file_extractor.py | 4 +- rag-core-api/src/rag_core_api/apis/rag_api.py | 5 + 44 files changed, 125 insertions(+), 3129 deletions(-) delete mode 100644 admin-api-lib/src/admin_api_lib/api_endpoints/confluence_loader.py delete mode 100644 
admin-api-lib/src/admin_api_lib/api_endpoints/document_uploader.py create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/extractor_api_client.py rename admin-api-lib/src/admin_api_lib/extractor_api_client/{openapi_client => }/models/__init__.py (54%) rename admin-api-lib/src/admin_api_lib/extractor_api_client/{openapi_client => }/models/content_type.py (100%) rename admin-api-lib/src/admin_api_lib/extractor_api_client/{openapi_client => }/models/information_piece.py (94%) rename admin-api-lib/src/admin_api_lib/extractor_api_client/{openapi_client => }/models/key_value_pair.py (100%) delete mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/__init__.py delete mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/__init__.py delete mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py delete mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_client.py delete mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_response.py delete mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/configuration.py delete mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/exceptions.py delete mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/confluence_parameters.py delete mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py delete mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/rest.py delete mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/__init__.py delete mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_content_type.py delete mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_request.py delete 
mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extractor_api.py delete mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_information_piece.py delete mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_key_value_pair.py delete mode 100644 admin-api-lib/src/admin_api_lib/file_services/__init__.py delete mode 100644 admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_confluence_loader.py delete mode 100644 admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_document_uploader.py delete mode 100644 admin-api-lib/src/admin_api_lib/impl/mapper/confluence_settings_mapper.py delete mode 100644 admin-api-lib/src/admin_api_lib/impl/settings/confluence_settings.py diff --git a/admin-api-lib/poetry.lock b/admin-api-lib/poetry.lock index 671adcc..bd12f09 100644 --- a/admin-api-lib/poetry.lock +++ b/admin-api-lib/poetry.lock @@ -3693,4 +3693,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.1" python-versions = "^3.13" -content-hash = "99eff6a6ab91512602e8e3094b71bdba096ccf58746d47afd92dff99b24da487" +content-hash = "f34effb5fa2b12b05da69ca28c62764dc2017a2a2a9336b5265428005004e7ec" diff --git a/admin-api-lib/pyproject.toml b/admin-api-lib/pyproject.toml index ec0de57..d7a995f 100644 --- a/admin-api-lib/pyproject.toml +++ b/admin-api-lib/pyproject.toml @@ -107,6 +107,7 @@ langfuse = "^2.60.4" redis = "^6.0.0" pyyaml = "^6.0.2" python-multipart = "^0.0.20" +requests-toolbelt = "^1.0.0" [tool.pytest.ini_options] log_cli = 1 diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/confluence_loader.py b/admin-api-lib/src/admin_api_lib/api_endpoints/confluence_loader.py deleted file mode 100644 index 06d79be..0000000 --- a/admin-api-lib/src/admin_api_lib/api_endpoints/confluence_loader.py +++ /dev/null @@ -1,19 +0,0 @@ -"""Module for ConfluenceLoader abstract base class.""" - -from abc import ABC, abstractmethod - - -class 
ConfluenceLoader(ABC): - """Abstract base class for the confluence loader endpoint.""" - - @abstractmethod - async def aload_from_confluence(self) -> None: - """ - Load data from Confluence asynchronously. - - This method should be implemented to load data asynchronously from Confluence. - - Returns - ------- - None - """ diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/document_uploader.py b/admin-api-lib/src/admin_api_lib/api_endpoints/document_uploader.py deleted file mode 100644 index 9a3e70b..0000000 --- a/admin-api-lib/src/admin_api_lib/api_endpoints/document_uploader.py +++ /dev/null @@ -1,26 +0,0 @@ -"""Module for the DocumentUploader abstract base class.""" - -from abc import ABC, abstractmethod - -from fastapi import Request, UploadFile - - -class DocumentUploader(ABC): - """Abstract base class for document upload endpoint.""" - - @abstractmethod - async def aupload_documents_post(self, body: UploadFile, request: Request) -> None: - """ - Upload documents asynchronously, currently supported formats are: PDF, DOCX, XML, PPTX. - - Parameters - ---------- - body : UploadFile - The uploaded file. - request : Request - The request object. 
- - Returns - ------- - None - """ diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api.py b/admin-api-lib/src/admin_api_lib/apis/admin_api.py index ccaed84..9d32286 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api.py @@ -1,11 +1,11 @@ # coding: utf-8 -from typing import Dict, List # noqa: F401 +from typing import Dict, List, Annotated # noqa: F401 import importlib import pkgutil from admin_api_lib.apis.admin_api_base import BaseAdminApi -from fastapi import APIRouter, Path, Request, Response, UploadFile # noqa: F401 +from fastapi import APIRouter, Path, Request, Response, UploadFile, Form # noqa: F401 import admin_api_lib.impl @@ -135,12 +135,13 @@ async def get_all_documents_status() -> List[DocumentStatus]: response_model_by_alias=True, ) async def upload_source( - type: StrictStr = Form(None, description=""), - name: StrictStr = Form(None, description=""), - file: Optional[UploadFile] = Form(None, description=""), - kwargs: Optional[List[KeyValuePair]] = Form(None, description=""), + request: Request, + type: Annotated[str, Form()], + name: Annotated[str, Form()], + file: Optional[UploadFile] = None, + kwargs: Optional[Annotated[List[KeyValuePair], Form()]] = None, ) -> None: """Uploads user selected sources.""" if not BaseAdminApi.subclasses: raise HTTPException(status_code=500, detail="Not implemented") - return await BaseAdminApi.subclasses[0]().upload_source(type, name, file, kwargs) + return await BaseAdminApi.subclasses[0]().upload_source(type, name, file, kwargs, request) diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py index 48e22dc..8aebb8b 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py @@ -70,6 +70,7 @@ async def upload_source( name: StrictStr, file: Optional[UploadFile], kwargs: Optional[List[KeyValuePair]], + request: 
Request, ) -> None: """Uploads user selected sources.""" ... diff --git a/admin-api-lib/src/admin_api_lib/dependency_container.py b/admin-api-lib/src/admin_api_lib/dependency_container.py index 4ca3b57..93b3ab2 100644 --- a/admin-api-lib/src/admin_api_lib/dependency_container.py +++ b/admin-api-lib/src/admin_api_lib/dependency_container.py @@ -1,6 +1,5 @@ """Module for the DependencyContainer class.""" -from admin_api_lib.impl.api_endpoints.default_source_uploader import DefaultSourceUploader from dependency_injector.containers import DeclarativeContainer from dependency_injector.providers import ( # noqa: WOT001 Configuration, @@ -12,25 +11,15 @@ from langchain_community.llms import Ollama, VLLMOpenAI from langfuse import Langfuse -from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ( - ExtractorApi, -) -from admin_api_lib.extractor_api_client.openapi_client.api_client import ApiClient -from admin_api_lib.extractor_api_client.openapi_client.configuration import ( - Configuration as ExtractorConfiguration, -) -from admin_api_lib.impl.api_endpoints.default_confluence_loader import ( - DefaultConfluenceLoader, -) +from admin_api_lib.extractor_api_client.extractor_api_client import ExtractorApiClient +from admin_api_lib.impl.api_endpoints.default_source_uploader import DefaultSourceUploader from admin_api_lib.impl.api_endpoints.default_document_deleter import ( DefaultDocumentDeleter, ) from admin_api_lib.impl.api_endpoints.default_document_reference_retriever import ( DefaultDocumentReferenceRetriever, ) -from admin_api_lib.impl.api_endpoints.default_document_uploader import ( - DefaultDocumentUploader, -) + from admin_api_lib.impl.api_endpoints.default_documents_status_retriever import ( DefaultDocumentsStatusRetriever, ) @@ -43,14 +32,10 @@ from admin_api_lib.impl.key_db.file_status_key_value_store import ( FileStatusKeyValueStore, ) -from admin_api_lib.impl.mapper.confluence_settings_mapper import ( - ConfluenceSettingsMapper, -) from 
admin_api_lib.impl.mapper.informationpiece2document import ( InformationPiece2Document, ) from admin_api_lib.impl.settings.chunker_settings import ChunkerSettings -from admin_api_lib.impl.settings.confluence_settings import ConfluenceSettings from admin_api_lib.impl.settings.document_extractor_settings import ( DocumentExtractorSettings, ) @@ -93,7 +78,6 @@ class DependencyContainer(DeclarativeContainer): rag_api_settings = RAGAPISettings() key_value_store_settings = KeyValueSettings() summarizer_settings = SummarizerSettings() - confluence_settings = ConfluenceSettings() key_value_store = Singleton(FileStatusKeyValueStore, key_value_store_settings) file_service = Singleton(S3Service, s3_settings=s3_settings) @@ -103,16 +87,13 @@ class DependencyContainer(DeclarativeContainer): ) chunker = Singleton(TextChunker, text_splitter) - extractor_api_configuration = Singleton(ExtractorConfiguration, host=document_extractor_settings.host) - document_extractor_api_client = Singleton(ApiClient, extractor_api_configuration) - document_extractor = Singleton(ExtractorApi, document_extractor_api_client) + document_extractor = Singleton(ExtractorApiClient, document_extractor_settings.host) rag_api_configuration = Singleton(RagConfiguration, host=rag_api_settings.host) rag_api_client = Singleton(RagApiClient, configuration=rag_api_configuration) rag_api = Singleton(RagApi, rag_api_client) information_mapper = Singleton(InformationPiece2Document) - confluence_settings_mapper = Singleton(ConfluenceSettingsMapper) large_language_model = Selector( class_selector_config.llm_type, @@ -165,7 +146,7 @@ class DependencyContainer(DeclarativeContainer): DefaultDocumentDeleter, rag_api=rag_api, file_service=file_service, key_value_store=key_value_store ) documents_status_retriever = Singleton(DefaultDocumentsStatusRetriever, key_value_store=key_value_store) - + document_reference_retriever = Singleton(DefaultDocumentReferenceRetriever, file_service=file_service) source_uploader = Singleton( 
diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/extractor_api_client.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/extractor_api_client.py new file mode 100644 index 0000000..78ccbf7 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/extractor_api_client.py @@ -0,0 +1,50 @@ +import requests +from admin_api_lib.extractor_api_client.models.information_piece import InformationPiece +from requests_toolbelt.multipart import MultipartEncoder + + +class ExtractorApiClient: + def __init__(self, base_url): + """ + Initialize the client with the base URL of the API. + + Args: + base_url (str): The base URL of the API. + """ + self.base_url = base_url + + def extract(self, type, name, file, kwargs=None): + """ + Send an extraction request to the API. + + Args: + file (str): The path to the file to extract from. + name (str): The name of the extraction request. + type (str): The type of extraction to perform. + kwargs (list): A list of key-value pairs to pass as additional arguments. + + Returns: + list: A list of extracted information pieces. 
+ """ + with open(file, "rb") as openfile: + url = self.base_url + "/extract" + encoder = MultipartEncoder( + fields={ + "file": (file, openfile, "application/octet-stream"), + "name": name, + "type": type, + } + ) + if kwargs: + for pair in kwargs: + encoder.add_field(pair["key"], pair["value"]) + response = requests.post(url, headers={"Content-Type": encoder.content_type}, data=encoder) + if response.status_code == 200: + response_json = response.json() + return [InformationPiece.from_dict(x) for x in response_json] + elif response.status_code == 422: + raise ValueError("Invalid source") + elif response.status_code == 500: + raise Exception("Internal server error") + else: + raise Exception("Unknown error") diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/__init__.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/models/__init__.py similarity index 54% rename from admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/__init__.py rename to admin-api-lib/src/admin_api_lib/extractor_api_client/models/__init__.py index e0ef19f..53560b6 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/__init__.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/models/__init__.py @@ -14,6 +14,6 @@ # import models into model package -from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ContentType -from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import InformationPiece -from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair +from admin_api_lib.extractor_api_client.models.content_type import ContentType +from admin_api_lib.extractor_api_client.models.information_piece import InformationPiece +from admin_api_lib.extractor_api_client.models.key_value_pair import KeyValuePair diff --git 
a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/content_type.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/models/content_type.py similarity index 100% rename from admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/content_type.py rename to admin-api-lib/src/admin_api_lib/extractor_api_client/models/content_type.py diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/information_piece.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/models/information_piece.py similarity index 94% rename from admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/information_piece.py rename to admin-api-lib/src/admin_api_lib/extractor_api_client/models/information_piece.py index a428183..99c3ee2 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/information_piece.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/models/information_piece.py @@ -19,8 +19,8 @@ from pydantic import BaseModel, ConfigDict, StrictStr from typing import Any, ClassVar, Dict, List -from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ContentType -from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair +from admin_api_lib.extractor_api_client.models.content_type import ContentType +from admin_api_lib.extractor_api_client.models.key_value_pair import KeyValuePair from typing import Optional, Set from typing_extensions import Self diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/key_value_pair.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/models/key_value_pair.py similarity index 100% rename from admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/key_value_pair.py rename to admin-api-lib/src/admin_api_lib/extractor_api_client/models/key_value_pair.py diff --git 
a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/__init__.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/__init__.py deleted file mode 100644 index f43e4e9..0000000 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/__init__.py +++ /dev/null @@ -1,36 +0,0 @@ -# coding: utf-8 - -# flake8: noqa - -""" -extractor-api-lib - -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) - -Do not edit the class manually. -""" # noqa: E501 - - -__version__ = "1.0.0" - -# import apis into sdk package -from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi - -# import ApiClient -from admin_api_lib.extractor_api_client.openapi_client.api_response import ApiResponse -from admin_api_lib.extractor_api_client.openapi_client.api_client import ApiClient -from admin_api_lib.extractor_api_client.openapi_client.configuration import Configuration -from admin_api_lib.extractor_api_client.openapi_client.exceptions import OpenApiException -from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiTypeError -from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiValueError -from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiKeyError -from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiAttributeError -from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiException - -# import models into sdk package -from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ContentType -from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import InformationPiece -from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair diff --git 
a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/__init__.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/__init__.py deleted file mode 100644 index c95ce65..0000000 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# flake8: noqa - -# import apis into api package -from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py deleted file mode 100644 index 1a862d3..0000000 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py +++ /dev/null @@ -1,323 +0,0 @@ -# coding: utf-8 - -""" -extractor-api-lib - -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) - -Do not edit the class manually. 
-""" # noqa: E501 - -import warnings -from pydantic import validate_call, Field, StrictFloat, StrictStr, StrictInt -from typing import Any, Dict, List, Optional, Tuple, Union -from typing_extensions import Annotated - -from pydantic import StrictBytes, StrictStr -from typing import List, Optional, Tuple, Union -from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import InformationPiece -from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair - -from admin_api_lib.extractor_api_client.openapi_client.api_client import ApiClient, RequestSerialized -from admin_api_lib.extractor_api_client.openapi_client.api_response import ApiResponse -from admin_api_lib.extractor_api_client.openapi_client.rest import RESTResponseType - - -class ExtractorApi: - """NOTE: This class is auto generated by OpenAPI Generator - Ref: https://openapi-generator.tech - - Do not edit the class manually. - """ - - def __init__(self, api_client=None) -> None: - if api_client is None: - api_client = ApiClient.get_default() - self.api_client = api_client - - @validate_call - def extract( - self, - type: StrictStr, - name: StrictStr, - file: Optional[Union[StrictBytes, StrictStr, Tuple[StrictStr, StrictBytes]]] = None, - kwargs: Optional[List[KeyValuePair]] = None, - _request_timeout: Union[ - None, - Annotated[StrictFloat, Field(gt=0)], - Tuple[Annotated[StrictFloat, Field(gt=0)], Annotated[StrictFloat, Field(gt=0)]], - ] = None, - _request_auth: Optional[Dict[StrictStr, Any]] = None, - _content_type: Optional[StrictStr] = None, - _headers: Optional[Dict[StrictStr, Any]] = None, - _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0, - ) -> List[InformationPiece]: - """extract - - - :param type: (required) - :type type: str - :param name: (required) - :type name: str - :param file: - :type file: bytearray - :param kwargs: - :type kwargs: List[KeyValuePair] - :param _request_timeout: timeout setting for this request. 
If one - number provided, it will be total request - timeout. It can also be a pair (tuple) of - (connection, read) timeouts. - :type _request_timeout: int, tuple(int, int), optional - :param _request_auth: set to override the auth_settings for an a single - request; this effectively ignores the - authentication in the spec for a single request. - :type _request_auth: dict, optional - :param _content_type: force content-type for the request. - :type _content_type: str, Optional - :param _headers: set to override the headers for a single - request; this effectively ignores the headers - in the spec for a single request. - :type _headers: dict, optional - :param _host_index: set to override the host_index for a single - request; this effectively ignores the host_index - in the spec for a single request. - :type _host_index: int, optional - :return: Returns the result object. - """ # noqa: E501 - - _param = self._extract_serialize( - type=type, - name=name, - file=file, - kwargs=kwargs, - _request_auth=_request_auth, - _content_type=_content_type, - _headers=_headers, - _host_index=_host_index, - ) - - _response_types_map: Dict[str, Optional[str]] = { - "200": "List[InformationPiece]", - "422": None, - "500": None, - } - response_data = self.api_client.call_api(*_param, _request_timeout=_request_timeout) - response_data.read() - return self.api_client.response_deserialize( - response_data=response_data, - response_types_map=_response_types_map, - ).data - - @validate_call - def extract_with_http_info( - self, - type: StrictStr, - name: StrictStr, - file: Optional[Union[StrictBytes, StrictStr, Tuple[StrictStr, StrictBytes]]] = None, - kwargs: Optional[List[KeyValuePair]] = None, - _request_timeout: Union[ - None, - Annotated[StrictFloat, Field(gt=0)], - Tuple[Annotated[StrictFloat, Field(gt=0)], Annotated[StrictFloat, Field(gt=0)]], - ] = None, - _request_auth: Optional[Dict[StrictStr, Any]] = None, - _content_type: Optional[StrictStr] = None, - _headers: 
Optional[Dict[StrictStr, Any]] = None, - _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0, - ) -> ApiResponse[List[InformationPiece]]: - """extract - - - :param type: (required) - :type type: str - :param name: (required) - :type name: str - :param file: - :type file: bytearray - :param kwargs: - :type kwargs: List[KeyValuePair] - :param _request_timeout: timeout setting for this request. If one - number provided, it will be total request - timeout. It can also be a pair (tuple) of - (connection, read) timeouts. - :type _request_timeout: int, tuple(int, int), optional - :param _request_auth: set to override the auth_settings for an a single - request; this effectively ignores the - authentication in the spec for a single request. - :type _request_auth: dict, optional - :param _content_type: force content-type for the request. - :type _content_type: str, Optional - :param _headers: set to override the headers for a single - request; this effectively ignores the headers - in the spec for a single request. - :type _headers: dict, optional - :param _host_index: set to override the host_index for a single - request; this effectively ignores the host_index - in the spec for a single request. - :type _host_index: int, optional - :return: Returns the result object. 
- """ # noqa: E501 - - _param = self._extract_serialize( - type=type, - name=name, - file=file, - kwargs=kwargs, - _request_auth=_request_auth, - _content_type=_content_type, - _headers=_headers, - _host_index=_host_index, - ) - - _response_types_map: Dict[str, Optional[str]] = { - "200": "List[InformationPiece]", - "422": None, - "500": None, - } - response_data = self.api_client.call_api(*_param, _request_timeout=_request_timeout) - response_data.read() - return self.api_client.response_deserialize( - response_data=response_data, - response_types_map=_response_types_map, - ) - - @validate_call - def extract_without_preload_content( - self, - type: StrictStr, - name: StrictStr, - file: Optional[Union[StrictBytes, StrictStr, Tuple[StrictStr, StrictBytes]]] = None, - kwargs: Optional[List[KeyValuePair]] = None, - _request_timeout: Union[ - None, - Annotated[StrictFloat, Field(gt=0)], - Tuple[Annotated[StrictFloat, Field(gt=0)], Annotated[StrictFloat, Field(gt=0)]], - ] = None, - _request_auth: Optional[Dict[StrictStr, Any]] = None, - _content_type: Optional[StrictStr] = None, - _headers: Optional[Dict[StrictStr, Any]] = None, - _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0, - ) -> RESTResponseType: - """extract - - - :param type: (required) - :type type: str - :param name: (required) - :type name: str - :param file: - :type file: bytearray - :param kwargs: - :type kwargs: List[KeyValuePair] - :param _request_timeout: timeout setting for this request. If one - number provided, it will be total request - timeout. It can also be a pair (tuple) of - (connection, read) timeouts. - :type _request_timeout: int, tuple(int, int), optional - :param _request_auth: set to override the auth_settings for an a single - request; this effectively ignores the - authentication in the spec for a single request. - :type _request_auth: dict, optional - :param _content_type: force content-type for the request. 
- :type _content_type: str, Optional - :param _headers: set to override the headers for a single - request; this effectively ignores the headers - in the spec for a single request. - :type _headers: dict, optional - :param _host_index: set to override the host_index for a single - request; this effectively ignores the host_index - in the spec for a single request. - :type _host_index: int, optional - :return: Returns the result object. - """ # noqa: E501 - - _param = self._extract_serialize( - type=type, - name=name, - file=file, - kwargs=kwargs, - _request_auth=_request_auth, - _content_type=_content_type, - _headers=_headers, - _host_index=_host_index, - ) - - _response_types_map: Dict[str, Optional[str]] = { - "200": "List[InformationPiece]", - "422": None, - "500": None, - } - response_data = self.api_client.call_api(*_param, _request_timeout=_request_timeout) - return response_data.response - - def _extract_serialize( - self, - type, - name, - file, - kwargs, - _request_auth, - _content_type, - _headers, - _host_index, - ) -> RequestSerialized: - - _host = None - - _collection_formats: Dict[str, str] = { - "kwargs": "csv", - } - - _path_params: Dict[str, str] = {} - _query_params: List[Tuple[str, str]] = [] - _header_params: Dict[str, Optional[str]] = _headers or {} - _form_params: List[Tuple[str, str]] = [] - _files: Dict[str, Union[str, bytes, List[str], List[bytes], List[Tuple[str, bytes]]]] = {} - _body_params: Optional[bytes] = None - - # process the path parameters - # process the query parameters - # process the header parameters - # process the form parameters - if file is not None: - _files["file"] = file - if type is not None: - _form_params.append(("type", type)) - if kwargs is not None: - _form_params.append(("kwargs", kwargs)) - if name is not None: - _form_params.append(("name", name)) - # process the body parameter - - # set the HTTP header `Accept` - if "Accept" not in _header_params: - _header_params["Accept"] = 
self.api_client.select_header_accept(["application/json"]) - - # set the HTTP header `Content-Type` - if _content_type: - _header_params["Content-Type"] = _content_type - else: - _default_content_type = self.api_client.select_header_content_type(["multipart/form-data"]) - if _default_content_type is not None: - _header_params["Content-Type"] = _default_content_type - - # authentication setting - _auth_settings: List[str] = [] - - return self.api_client.param_serialize( - method="POST", - resource_path="/extract", - path_params=_path_params, - query_params=_query_params, - header_params=_header_params, - body=_body_params, - post_params=_form_params, - files=_files, - auth_settings=_auth_settings, - collection_formats=_collection_formats, - _host=_host, - _request_auth=_request_auth, - ) diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_client.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_client.py deleted file mode 100644 index ba8f5d2..0000000 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_client.py +++ /dev/null @@ -1,695 +0,0 @@ -# coding: utf-8 - -""" -extractor-api-lib - -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) - -Do not edit the class manually. 
-""" # noqa: E501 - - -import datetime -from dateutil.parser import parse -from enum import Enum -import decimal -import json -import mimetypes -import os -import re -import tempfile - -from urllib.parse import quote -from typing import Tuple, Optional, List, Dict, Union -from pydantic import SecretStr - -from admin_api_lib.extractor_api_client.openapi_client.configuration import Configuration -from admin_api_lib.extractor_api_client.openapi_client.api_response import ApiResponse, T as ApiResponseT -import admin_api_lib.extractor_api_client.openapi_client.models -from admin_api_lib.extractor_api_client.openapi_client import rest -from admin_api_lib.extractor_api_client.openapi_client.exceptions import ( - ApiValueError, - ApiException, - BadRequestException, - UnauthorizedException, - ForbiddenException, - NotFoundException, - ServiceException, -) - -RequestSerialized = Tuple[str, str, Dict[str, str], Optional[str], List[str]] - - -class ApiClient: - """Generic API client for OpenAPI client library builds. - - OpenAPI generic API client. This client handles the client- - server communication, and is invariant across implementations. Specifics of - the methods and models for each application are generated from the OpenAPI - templates. - - :param configuration: .Configuration object for this client - :param header_name: a header to pass when making calls to the API. - :param header_value: a header value to pass when making calls to - the API. - :param cookie: a cookie to include in the header when making calls - to the API - """ - - PRIMITIVE_TYPES = (float, bool, bytes, str, int) - NATIVE_TYPES_MAPPING = { - "int": int, - "long": int, # TODO remove as only py3 is supported? 
- "float": float, - "str": str, - "bool": bool, - "date": datetime.date, - "datetime": datetime.datetime, - "decimal": decimal.Decimal, - "object": object, - } - _pool = None - - def __init__(self, configuration=None, header_name=None, header_value=None, cookie=None) -> None: - # use default configuration if none is provided - if configuration is None: - configuration = Configuration.get_default() - self.configuration = configuration - - self.rest_client = rest.RESTClientObject(configuration) - self.default_headers = {} - if header_name is not None: - self.default_headers[header_name] = header_value - self.cookie = cookie - # Set default User-Agent. - self.user_agent = "OpenAPI-Generator/1.0.0/python" - self.client_side_validation = configuration.client_side_validation - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, traceback): - pass - - @property - def user_agent(self): - """User agent for this API client""" - return self.default_headers["User-Agent"] - - @user_agent.setter - def user_agent(self, value): - self.default_headers["User-Agent"] = value - - def set_default_header(self, header_name, header_value): - self.default_headers[header_name] = header_value - - _default = None - - @classmethod - def get_default(cls): - """Return new instance of ApiClient. - - This method returns newly created, based on default constructor, - object of ApiClient class or returns a copy of default - ApiClient. - - :return: The ApiClient object. - """ - if cls._default is None: - cls._default = ApiClient() - return cls._default - - @classmethod - def set_default(cls, default): - """Set default instance of ApiClient. - - It stores default ApiClient. - - :param default: object of ApiClient. 
- """ - cls._default = default - - def param_serialize( - self, - method, - resource_path, - path_params=None, - query_params=None, - header_params=None, - body=None, - post_params=None, - files=None, - auth_settings=None, - collection_formats=None, - _host=None, - _request_auth=None, - ) -> RequestSerialized: - """Builds the HTTP request params needed by the request. - :param method: Method to call. - :param resource_path: Path to method endpoint. - :param path_params: Path parameters in the url. - :param query_params: Query parameters in the url. - :param header_params: Header parameters to be - placed in the request header. - :param body: Request body. - :param post_params dict: Request post form parameters, - for `application/x-www-form-urlencoded`, `multipart/form-data`. - :param auth_settings list: Auth Settings names for the request. - :param files dict: key -> filename, value -> filepath, - for `multipart/form-data`. - :param collection_formats: dict of collection formats for path, query, - header, and post parameters. - :param _request_auth: set to override the auth_settings for an a single - request; this effectively ignores the authentication - in the spec for a single request. 
- :return: tuple of form (path, http_method, query_params, header_params, - body, post_params, files) - """ - - config = self.configuration - - # header parameters - header_params = header_params or {} - header_params.update(self.default_headers) - if self.cookie: - header_params["Cookie"] = self.cookie - if header_params: - header_params = self.sanitize_for_serialization(header_params) - header_params = dict(self.parameters_to_tuples(header_params, collection_formats)) - - # path parameters - if path_params: - path_params = self.sanitize_for_serialization(path_params) - path_params = self.parameters_to_tuples(path_params, collection_formats) - for k, v in path_params: - # specified safe chars, encode everything - resource_path = resource_path.replace("{%s}" % k, quote(str(v), safe=config.safe_chars_for_path_param)) - - # post parameters - if post_params or files: - post_params = post_params if post_params else [] - post_params = self.sanitize_for_serialization(post_params) - post_params = self.parameters_to_tuples(post_params, collection_formats) - if files: - post_params.extend(self.files_parameters(files)) - - # auth setting - self.update_params_for_auth( - header_params, query_params, auth_settings, resource_path, method, body, request_auth=_request_auth - ) - - # body - if body: - body = self.sanitize_for_serialization(body) - - # request url - if _host is None or self.configuration.ignore_operation_servers: - url = self.configuration.host + resource_path - else: - # use server/host defined in path or operation instead - url = _host + resource_path - - # query parameters - if query_params: - query_params = self.sanitize_for_serialization(query_params) - url_query = self.parameters_to_url_query(query_params, collection_formats) - url += "?" 
+ url_query - - return method, url, header_params, body, post_params - - def call_api( - self, method, url, header_params=None, body=None, post_params=None, _request_timeout=None - ) -> rest.RESTResponse: - """Makes the HTTP request (synchronous) - :param method: Method to call. - :param url: Path to method endpoint. - :param header_params: Header parameters to be - placed in the request header. - :param body: Request body. - :param post_params dict: Request post form parameters, - for `application/x-www-form-urlencoded`, `multipart/form-data`. - :param _request_timeout: timeout setting for this request. - :return: RESTResponse - """ - - try: - # perform request and return response - response_data = self.rest_client.request( - method, - url, - headers=header_params, - body=body, - post_params=post_params, - _request_timeout=_request_timeout, - ) - - except ApiException as e: - raise e - - return response_data - - def response_deserialize( - self, response_data: rest.RESTResponse, response_types_map: Optional[Dict[str, ApiResponseT]] = None - ) -> ApiResponse[ApiResponseT]: - """Deserializes response into an object. - :param response_data: RESTResponse object to be deserialized. - :param response_types_map: dict of response types. - :return: ApiResponse - """ - - msg = "RESTResponse.read() must be called before passing it to response_deserialize()" - assert response_data.data is not None, msg - - response_type = response_types_map.get(str(response_data.status), None) - if not response_type and isinstance(response_data.status, int) and 100 <= response_data.status <= 599: - # if not found, look for '1XX', '2XX', etc. 
- response_type = response_types_map.get(str(response_data.status)[0] + "XX", None) - - # deserialize response data - response_text = None - return_data = None - try: - if response_type == "bytearray": - return_data = response_data.data - elif response_type == "file": - return_data = self.__deserialize_file(response_data) - elif response_type is not None: - match = None - content_type = response_data.getheader("content-type") - if content_type is not None: - match = re.search(r"charset=([a-zA-Z\-\d]+)[\s;]?", content_type) - encoding = match.group(1) if match else "utf-8" - response_text = response_data.data.decode(encoding) - return_data = self.deserialize(response_text, response_type, content_type) - finally: - if not 200 <= response_data.status <= 299: - raise ApiException.from_response( - http_resp=response_data, - body=response_text, - data=return_data, - ) - - return ApiResponse( - status_code=response_data.status, - data=return_data, - headers=response_data.getheaders(), - raw_data=response_data.data, - ) - - def sanitize_for_serialization(self, obj): - """Builds a JSON POST object. - - If obj is None, return None. - If obj is SecretStr, return obj.get_secret_value() - If obj is str, int, long, float, bool, return directly. - If obj is datetime.datetime, datetime.date - convert to string in iso8601 format. - If obj is decimal.Decimal return string representation. - If obj is list, sanitize each element in the list. - If obj is dict, return the dict. - If obj is OpenAPI model, return the properties dict. - - :param obj: The data to serialize. - :return: The serialized form of data. 
- """ - if obj is None: - return None - elif isinstance(obj, Enum): - return obj.value - elif isinstance(obj, SecretStr): - return obj.get_secret_value() - elif isinstance(obj, self.PRIMITIVE_TYPES): - return obj - elif isinstance(obj, list): - return [self.sanitize_for_serialization(sub_obj) for sub_obj in obj] - elif isinstance(obj, tuple): - return tuple(self.sanitize_for_serialization(sub_obj) for sub_obj in obj) - elif isinstance(obj, (datetime.datetime, datetime.date)): - return obj.isoformat() - elif isinstance(obj, decimal.Decimal): - return str(obj) - - elif isinstance(obj, dict): - obj_dict = obj - else: - # Convert model obj to dict except - # attributes `openapi_types`, `attribute_map` - # and attributes which value is not None. - # Convert attribute name to json key in - # model definition for request. - if hasattr(obj, "to_dict") and callable(getattr(obj, "to_dict")): - obj_dict = obj.to_dict() - else: - obj_dict = obj.__dict__ - - return {key: self.sanitize_for_serialization(val) for key, val in obj_dict.items()} - - def deserialize(self, response_text: str, response_type: str, content_type: Optional[str]): - """Deserializes response into an object. - - :param response: RESTResponse object to be deserialized. - :param response_type: class literal for - deserialized object, or string of class name. - :param content_type: content type of response. - - :return: deserialized object. 
- """ - - # fetch data from response object - if content_type is None: - try: - data = json.loads(response_text) - except ValueError: - data = response_text - elif re.match(r"^application/(json|[\w!#$&.+-^_]+\+json)\s*(;|$)", content_type, re.IGNORECASE): - if response_text == "": - data = "" - else: - data = json.loads(response_text) - elif re.match(r"^text\/[a-z.+-]+\s*(;|$)", content_type, re.IGNORECASE): - data = response_text - else: - raise ApiException(status=0, reason="Unsupported content type: {0}".format(content_type)) - - return self.__deserialize(data, response_type) - - def __deserialize(self, data, klass): - """Deserializes dict, list, str into an object. - - :param data: dict, list or str. - :param klass: class literal, or string of class name. - - :return: object. - """ - if data is None: - return None - - if isinstance(klass, str): - if klass.startswith("List["): - m = re.match(r"List\[(.*)]", klass) - assert m is not None, "Malformed List type definition" - sub_kls = m.group(1) - return [self.__deserialize(sub_data, sub_kls) for sub_data in data] - - if klass.startswith("Dict["): - m = re.match(r"Dict\[([^,]*), (.*)]", klass) - assert m is not None, "Malformed Dict type definition" - sub_kls = m.group(2) - return {k: self.__deserialize(v, sub_kls) for k, v in data.items()} - - # convert str to class - if klass in self.NATIVE_TYPES_MAPPING: - klass = self.NATIVE_TYPES_MAPPING[klass] - else: - klass = getattr(admin_api_lib.extractor_api_client.openapi_client.models, klass) - - if klass in self.PRIMITIVE_TYPES: - return self.__deserialize_primitive(data, klass) - elif klass == object: - return self.__deserialize_object(data) - elif klass == datetime.date: - return self.__deserialize_date(data) - elif klass == datetime.datetime: - return self.__deserialize_datetime(data) - elif klass == decimal.Decimal: - return decimal.Decimal(data) - elif issubclass(klass, Enum): - return self.__deserialize_enum(data, klass) - else: - return 
self.__deserialize_model(data, klass) - - def parameters_to_tuples(self, params, collection_formats): - """Get parameters as list of tuples, formatting collections. - - :param params: Parameters as dict or list of two-tuples - :param dict collection_formats: Parameter collection formats - :return: Parameters as list of tuples, collections formatted - """ - new_params: List[Tuple[str, str]] = [] - if collection_formats is None: - collection_formats = {} - for k, v in params.items() if isinstance(params, dict) else params: - if k in collection_formats: - collection_format = collection_formats[k] - if collection_format == "multi": - new_params.extend((k, value) for value in v) - else: - if collection_format == "ssv": - delimiter = " " - elif collection_format == "tsv": - delimiter = "\t" - elif collection_format == "pipes": - delimiter = "|" - else: # csv is the default - delimiter = "," - new_params.append((k, delimiter.join(str(value) for value in v))) - else: - new_params.append((k, v)) - return new_params - - def parameters_to_url_query(self, params, collection_formats): - """Get parameters as list of tuples, formatting collections. - - :param params: Parameters as dict or list of two-tuples - :param dict collection_formats: Parameter collection formats - :return: URL query string (e.g. 
a=Hello%20World&b=123) - """ - new_params: List[Tuple[str, str]] = [] - if collection_formats is None: - collection_formats = {} - for k, v in params.items() if isinstance(params, dict) else params: - if isinstance(v, bool): - v = str(v).lower() - if isinstance(v, (int, float)): - v = str(v) - if isinstance(v, dict): - v = json.dumps(v) - - if k in collection_formats: - collection_format = collection_formats[k] - if collection_format == "multi": - new_params.extend((k, str(value)) for value in v) - else: - if collection_format == "ssv": - delimiter = " " - elif collection_format == "tsv": - delimiter = "\t" - elif collection_format == "pipes": - delimiter = "|" - else: # csv is the default - delimiter = "," - new_params.append((k, delimiter.join(quote(str(value)) for value in v))) - else: - new_params.append((k, quote(str(v)))) - - return "&".join(["=".join(map(str, item)) for item in new_params]) - - def files_parameters( - self, - files: Dict[str, Union[str, bytes, List[str], List[bytes], Tuple[str, bytes]]], - ): - """Builds form parameters. - - :param files: File parameters. - :return: Form parameters with files. - """ - params = [] - for k, v in files.items(): - if isinstance(v, str): - with open(v, "rb") as f: - filename = os.path.basename(f.name) - filedata = f.read() - elif isinstance(v, bytes): - filename = k - filedata = v - elif isinstance(v, tuple): - filename, filedata = v - elif isinstance(v, list): - for file_param in v: - params.extend(self.files_parameters({k: file_param})) - continue - else: - raise ValueError("Unsupported file value") - mimetype = mimetypes.guess_type(filename)[0] or "application/octet-stream" - params.append(tuple([k, tuple([filename, filedata, mimetype])])) - return params - - def select_header_accept(self, accepts: List[str]) -> Optional[str]: - """Returns `Accept` based on an array of accepts provided. - - :param accepts: List of headers. - :return: Accept (e.g. application/json). 
- """ - if not accepts: - return None - - for accept in accepts: - if re.search("json", accept, re.IGNORECASE): - return accept - - return accepts[0] - - def select_header_content_type(self, content_types): - """Returns `Content-Type` based on an array of content_types provided. - - :param content_types: List of content-types. - :return: Content-Type (e.g. application/json). - """ - if not content_types: - return None - - for content_type in content_types: - if re.search("json", content_type, re.IGNORECASE): - return content_type - - return content_types[0] - - def update_params_for_auth( - self, headers, queries, auth_settings, resource_path, method, body, request_auth=None - ) -> None: - """Updates header and query params based on authentication setting. - - :param headers: Header parameters dict to be updated. - :param queries: Query parameters tuple list to be updated. - :param auth_settings: Authentication setting identifiers list. - :resource_path: A string representation of the HTTP request resource path. - :method: A string representation of the HTTP request method. - :body: A object representing the body of the HTTP request. - The object type is the return value of sanitize_for_serialization(). - :param request_auth: if set, the provided settings will - override the token in the configuration. - """ - if not auth_settings: - return - - if request_auth: - self._apply_auth_params(headers, queries, resource_path, method, body, request_auth) - else: - for auth in auth_settings: - auth_setting = self.configuration.auth_settings().get(auth) - if auth_setting: - self._apply_auth_params(headers, queries, resource_path, method, body, auth_setting) - - def _apply_auth_params(self, headers, queries, resource_path, method, body, auth_setting) -> None: - """Updates the request parameters based on a single auth_setting - - :param headers: Header parameters dict to be updated. - :param queries: Query parameters tuple list to be updated. 
- :resource_path: A string representation of the HTTP request resource path. - :method: A string representation of the HTTP request method. - :body: A object representing the body of the HTTP request. - The object type is the return value of sanitize_for_serialization(). - :param auth_setting: auth settings for the endpoint - """ - if auth_setting["in"] == "cookie": - headers["Cookie"] = auth_setting["value"] - elif auth_setting["in"] == "header": - if auth_setting["type"] != "http-signature": - headers[auth_setting["key"]] = auth_setting["value"] - elif auth_setting["in"] == "query": - queries.append((auth_setting["key"], auth_setting["value"])) - else: - raise ApiValueError("Authentication token must be in `query` or `header`") - - def __deserialize_file(self, response): - """Deserializes body to file - - Saves response body into a file in a temporary folder, - using the filename from the `Content-Disposition` header if provided. - - handle file downloading - save response body into a tmp file and return the instance - - :param response: RESTResponse. - :return: file path. - """ - fd, path = tempfile.mkstemp(dir=self.configuration.temp_folder_path) - os.close(fd) - os.remove(path) - - content_disposition = response.getheader("Content-Disposition") - if content_disposition: - m = re.search(r'filename=[\'"]?([^\'"\s]+)[\'"]?', content_disposition) - assert m is not None, "Unexpected 'content-disposition' header value" - filename = m.group(1) - path = os.path.join(os.path.dirname(path), filename) - - with open(path, "wb") as f: - f.write(response.data) - - return path - - def __deserialize_primitive(self, data, klass): - """Deserializes string to primitive type. - - :param data: str. - :param klass: class literal. - - :return: int, long, float, str, bool. - """ - try: - return klass(data) - except UnicodeEncodeError: - return str(data) - except TypeError: - return data - - def __deserialize_object(self, value): - """Return an original value. - - :return: object. 
- """ - return value - - def __deserialize_date(self, string): - """Deserializes string to date. - - :param string: str. - :return: date. - """ - try: - return parse(string).date() - except ImportError: - return string - except ValueError: - raise rest.ApiException(status=0, reason="Failed to parse `{0}` as date object".format(string)) - - def __deserialize_datetime(self, string): - """Deserializes string to datetime. - - The string should be in iso8601 datetime format. - - :param string: str. - :return: datetime. - """ - try: - return parse(string) - except ImportError: - return string - except ValueError: - raise rest.ApiException(status=0, reason=("Failed to parse `{0}` as datetime object".format(string))) - - def __deserialize_enum(self, data, klass): - """Deserializes primitive type to enum. - - :param data: primitive type. - :param klass: class literal. - :return: enum value. - """ - try: - return klass(data) - except ValueError: - raise rest.ApiException(status=0, reason=("Failed to parse `{0}` as `{1}`".format(data, klass))) - - def __deserialize_model(self, data, klass): - """Deserializes list or dict to model. - - :param data: dict, list. - :param klass: class literal. - :return: model object. 
- """ - - return klass.from_dict(data) diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_response.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_response.py deleted file mode 100644 index 1ce1372..0000000 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_response.py +++ /dev/null @@ -1,20 +0,0 @@ -"""API response object.""" - -from __future__ import annotations -from typing import Optional, Generic, Mapping, TypeVar -from pydantic import Field, StrictInt, StrictBytes, BaseModel - -T = TypeVar("T") - - -class ApiResponse(BaseModel, Generic[T]): - """ - API response object - """ - - status_code: StrictInt = Field(description="HTTP status code") - headers: Optional[Mapping[str, str]] = Field(None, description="HTTP headers") - data: T = Field(description="Deserialized data given the data type") - raw_data: StrictBytes = Field(description="Raw data (HTTP response body)") - - model_config = {"arbitrary_types_allowed": True} diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/configuration.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/configuration.py deleted file mode 100644 index 2e80369..0000000 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/configuration.py +++ /dev/null @@ -1,460 +0,0 @@ -# coding: utf-8 - -""" -extractor-api-lib - -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) - -Do not edit the class manually. 
-""" # noqa: E501 - - -import copy -import logging -from logging import FileHandler -import multiprocessing -import sys -from typing import Optional -import urllib3 - -import http.client as httplib - -JSON_SCHEMA_VALIDATION_KEYWORDS = { - "multipleOf", - "maximum", - "exclusiveMaximum", - "minimum", - "exclusiveMinimum", - "maxLength", - "minLength", - "pattern", - "maxItems", - "minItems", -} - - -class Configuration: - """This class contains various settings of the API client. - - :param host: Base url. - :param ignore_operation_servers - Boolean to ignore operation servers for the API client. - Config will use `host` as the base url regardless of the operation servers. - :param api_key: Dict to store API key(s). - Each entry in the dict specifies an API key. - The dict key is the name of the security scheme in the OAS specification. - The dict value is the API key secret. - :param api_key_prefix: Dict to store API prefix (e.g. Bearer). - The dict key is the name of the security scheme in the OAS specification. - The dict value is an API key prefix when generating the auth data. - :param username: Username for HTTP basic authentication. - :param password: Password for HTTP basic authentication. - :param access_token: Access token. - :param server_index: Index to servers configuration. - :param server_variables: Mapping with string values to replace variables in - templated server configuration. The validation of enums is performed for - variables with defined enum values before. - :param server_operation_index: Mapping from operation ID to an index to server - configuration. - :param server_operation_variables: Mapping from operation ID to a mapping with - string values to replace variables in templated server configuration. - The validation of enums is performed for variables with defined enum - values before. - :param ssl_ca_cert: str - the path to a file of concatenated CA certificates - in PEM format. - :param retries: Number of retries for API requests. 
- - """ - - _default = None - - def __init__( - self, - host=None, - api_key=None, - api_key_prefix=None, - username=None, - password=None, - access_token=None, - server_index=None, - server_variables=None, - server_operation_index=None, - server_operation_variables=None, - ignore_operation_servers=False, - ssl_ca_cert=None, - retries=None, - *, - debug: Optional[bool] = None - ) -> None: - """Constructor""" - self._base_path = "http://localhost" if host is None else host - """Default Base url - """ - self.server_index = 0 if server_index is None and host is None else server_index - self.server_operation_index = server_operation_index or {} - """Default server index - """ - self.server_variables = server_variables or {} - self.server_operation_variables = server_operation_variables or {} - """Default server variables - """ - self.ignore_operation_servers = ignore_operation_servers - """Ignore operation servers - """ - self.temp_folder_path = None - """Temp file folder for downloading files - """ - # Authentication Settings - self.api_key = {} - if api_key: - self.api_key = api_key - """dict to store API key(s) - """ - self.api_key_prefix = {} - if api_key_prefix: - self.api_key_prefix = api_key_prefix - """dict to store API prefix (e.g. 
Bearer) - """ - self.refresh_api_key_hook = None - """function hook to refresh API key if expired - """ - self.username = username - """Username for HTTP basic authentication - """ - self.password = password - """Password for HTTP basic authentication - """ - self.access_token = access_token - """Access token - """ - self.logger = {} - """Logging Settings - """ - self.logger["package_logger"] = logging.getLogger("admin_api_lib.extractor_api_client.openapi_client") - self.logger["urllib3_logger"] = logging.getLogger("urllib3") - self.logger_format = "%(asctime)s %(levelname)s %(message)s" - """Log format - """ - self.logger_stream_handler = None - """Log stream handler - """ - self.logger_file_handler: Optional[FileHandler] = None - """Log file handler - """ - self.logger_file = None - """Debug file location - """ - if debug is not None: - self.debug = debug - else: - self.__debug = False - """Debug switch - """ - - self.verify_ssl = True - """SSL/TLS verification - Set this to false to skip verifying SSL certificate when calling API - from https server. - """ - self.ssl_ca_cert = ssl_ca_cert - """Set this to customize the certificate file to verify the peer. - """ - self.cert_file = None - """client certificate file - """ - self.key_file = None - """client key file - """ - self.assert_hostname = None - """Set this to True/False to enable/disable SSL hostname verification. - """ - self.tls_server_name = None - """SSL/TLS Server Name Indication (SNI) - Set this to the SNI value expected by the server. - """ - - self.connection_pool_maxsize = multiprocessing.cpu_count() * 5 - """urllib3 connection pool's maximum number of connections saved - per pool. urllib3 uses 1 connection as default value, but this is - not the best value when you are making a lot of possibly parallel - requests to the same host, which is often the case here. - cpu_count * 5 is used as default value to increase performance. 
- """ - - self.proxy: Optional[str] = None - """Proxy URL - """ - self.proxy_headers = None - """Proxy headers - """ - self.safe_chars_for_path_param = "" - """Safe chars for path_param - """ - self.retries = retries - """Adding retries to override urllib3 default value 3 - """ - # Enable client side validation - self.client_side_validation = True - - self.socket_options = None - """Options to pass down to the underlying urllib3 socket - """ - - self.datetime_format = "%Y-%m-%dT%H:%M:%S.%f%z" - """datetime format - """ - - self.date_format = "%Y-%m-%d" - """date format - """ - - def __deepcopy__(self, memo): - cls = self.__class__ - result = cls.__new__(cls) - memo[id(self)] = result - for k, v in self.__dict__.items(): - if k not in ("logger", "logger_file_handler"): - setattr(result, k, copy.deepcopy(v, memo)) - # shallow copy of loggers - result.logger = copy.copy(self.logger) - # use setters to configure loggers - result.logger_file = self.logger_file - result.debug = self.debug - return result - - def __setattr__(self, name, value): - object.__setattr__(self, name, value) - - @classmethod - def set_default(cls, default): - """Set default instance of configuration. - - It stores default configuration, which can be - returned by get_default_copy method. - - :param default: object of Configuration - """ - cls._default = default - - @classmethod - def get_default_copy(cls): - """Deprecated. Please use `get_default` instead. - - Deprecated. Please use `get_default` instead. - - :return: The configuration object. - """ - return cls.get_default() - - @classmethod - def get_default(cls): - """Return the default configuration. - - This method returns newly created, based on default constructor, - object of Configuration class or returns a copy of default - configuration. - - :return: The configuration object. - """ - if cls._default is None: - cls._default = Configuration() - return cls._default - - @property - def logger_file(self): - """The logger file. 
- - If the logger_file is None, then add stream handler and remove file - handler. Otherwise, add file handler and remove stream handler. - - :param value: The logger_file path. - :type: str - """ - return self.__logger_file - - @logger_file.setter - def logger_file(self, value): - """The logger file. - - If the logger_file is None, then add stream handler and remove file - handler. Otherwise, add file handler and remove stream handler. - - :param value: The logger_file path. - :type: str - """ - self.__logger_file = value - if self.__logger_file: - # If set logging file, - # then add file handler and remove stream handler. - self.logger_file_handler = logging.FileHandler(self.__logger_file) - self.logger_file_handler.setFormatter(self.logger_formatter) - for _, logger in self.logger.items(): - logger.addHandler(self.logger_file_handler) - - @property - def debug(self): - """Debug status - - :param value: The debug status, True or False. - :type: bool - """ - return self.__debug - - @debug.setter - def debug(self, value): - """Debug status - - :param value: The debug status, True or False. - :type: bool - """ - self.__debug = value - if self.__debug: - # if debug status is True, turn on debug logging - for _, logger in self.logger.items(): - logger.setLevel(logging.DEBUG) - # turn on httplib debug - httplib.HTTPConnection.debuglevel = 1 - else: - # if debug status is False, turn off debug logging, - # setting log level to default `logging.WARNING` - for _, logger in self.logger.items(): - logger.setLevel(logging.WARNING) - # turn off httplib debug - httplib.HTTPConnection.debuglevel = 0 - - @property - def logger_format(self): - """The logger format. - - The logger_formatter will be updated when sets logger_format. - - :param value: The format string. - :type: str - """ - return self.__logger_format - - @logger_format.setter - def logger_format(self, value): - """The logger format. - - The logger_formatter will be updated when sets logger_format. 
- - :param value: The format string. - :type: str - """ - self.__logger_format = value - self.logger_formatter = logging.Formatter(self.__logger_format) - - def get_api_key_with_prefix(self, identifier, alias=None): - """Gets API key (with prefix if set). - - :param identifier: The identifier of apiKey. - :param alias: The alternative identifier of apiKey. - :return: The token for api key authentication. - """ - if self.refresh_api_key_hook is not None: - self.refresh_api_key_hook(self) - key = self.api_key.get(identifier, self.api_key.get(alias) if alias is not None else None) - if key: - prefix = self.api_key_prefix.get(identifier) - if prefix: - return "%s %s" % (prefix, key) - else: - return key - - def get_basic_auth_token(self): - """Gets HTTP basic authentication header (string). - - :return: The token for basic HTTP authentication. - """ - username = "" - if self.username is not None: - username = self.username - password = "" - if self.password is not None: - password = self.password - return urllib3.util.make_headers(basic_auth=username + ":" + password).get("authorization") - - def auth_settings(self): - """Gets Auth Settings dict for api client. - - :return: The Auth Settings information dict. - """ - auth = {} - return auth - - def to_debug_report(self): - """Gets the essential information for debugging. - - :return: The report for debugging. 
- """ - return ( - "Python SDK Debug Report:\n" - "OS: {env}\n" - "Python Version: {pyversion}\n" - "Version of the API: 1.0.0\n" - "SDK Package Version: 1.0.0".format(env=sys.platform, pyversion=sys.version) - ) - - def get_host_settings(self): - """Gets an array of host settings - - :return: An array of host settings - """ - return [ - { - "url": "", - "description": "No description provided", - } - ] - - def get_host_from_settings(self, index, variables=None, servers=None): - """Gets host URL based on the index and variables - :param index: array index of the host settings - :param variables: hash of variable and the corresponding value - :param servers: an array of host settings or None - :return: URL based on host settings - """ - if index is None: - return self._base_path - - variables = {} if variables is None else variables - servers = self.get_host_settings() if servers is None else servers - - try: - server = servers[index] - except IndexError: - raise ValueError( - "Invalid index {0} when selecting the host settings. " - "Must be less than {1}".format(index, len(servers)) - ) - - url = server["url"] - - # go through variables and replace placeholders - for variable_name, variable in server.get("variables", {}).items(): - used_value = variables.get(variable_name, variable["default_value"]) - - if "enum_values" in variable and used_value not in variable["enum_values"]: - raise ValueError( - "The variable `{0}` in the host URL has invalid value " - "{1}. 
Must be {2}.".format(variable_name, variables[variable_name], variable["enum_values"]) - ) - - url = url.replace("{" + variable_name + "}", used_value) - - return url - - @property - def host(self): - """Return generated host.""" - return self.get_host_from_settings(self.server_index, variables=self.server_variables) - - @host.setter - def host(self, value): - """Fix base path.""" - self._base_path = value - self.server_index = None diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/exceptions.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/exceptions.py deleted file mode 100644 index 5dbd4b0..0000000 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/exceptions.py +++ /dev/null @@ -1,197 +0,0 @@ -# coding: utf-8 - -""" -extractor-api-lib - -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) - -Do not edit the class manually. 
-""" # noqa: E501 - -from typing import Any, Optional -from typing_extensions import Self - - -class OpenApiException(Exception): - """The base exception class for all OpenAPIExceptions""" - - -class ApiTypeError(OpenApiException, TypeError): - def __init__(self, msg, path_to_item=None, valid_classes=None, key_type=None) -> None: - """Raises an exception for TypeErrors - - Args: - msg (str): the exception message - - Keyword Args: - path_to_item (list): a list of keys an indices to get to the - current_item - None if unset - valid_classes (tuple): the primitive classes that current item - should be an instance of - None if unset - key_type (bool): False if our value is a value in a dict - True if it is a key in a dict - False if our item is an item in a list - None if unset - """ - self.path_to_item = path_to_item - self.valid_classes = valid_classes - self.key_type = key_type - full_msg = msg - if path_to_item: - full_msg = "{0} at {1}".format(msg, render_path(path_to_item)) - super(ApiTypeError, self).__init__(full_msg) - - -class ApiValueError(OpenApiException, ValueError): - def __init__(self, msg, path_to_item=None) -> None: - """ - Args: - msg (str): the exception message - - Keyword Args: - path_to_item (list) the path to the exception in the - received_data dict. None if unset - """ - - self.path_to_item = path_to_item - full_msg = msg - if path_to_item: - full_msg = "{0} at {1}".format(msg, render_path(path_to_item)) - super(ApiValueError, self).__init__(full_msg) - - -class ApiAttributeError(OpenApiException, AttributeError): - def __init__(self, msg, path_to_item=None) -> None: - """ - Raised when an attribute reference or assignment fails. 
- - Args: - msg (str): the exception message - - Keyword Args: - path_to_item (None/list) the path to the exception in the - received_data dict - """ - self.path_to_item = path_to_item - full_msg = msg - if path_to_item: - full_msg = "{0} at {1}".format(msg, render_path(path_to_item)) - super(ApiAttributeError, self).__init__(full_msg) - - -class ApiKeyError(OpenApiException, KeyError): - def __init__(self, msg, path_to_item=None) -> None: - """ - Args: - msg (str): the exception message - - Keyword Args: - path_to_item (None/list) the path to the exception in the - received_data dict - """ - self.path_to_item = path_to_item - full_msg = msg - if path_to_item: - full_msg = "{0} at {1}".format(msg, render_path(path_to_item)) - super(ApiKeyError, self).__init__(full_msg) - - -class ApiException(OpenApiException): - - def __init__( - self, - status=None, - reason=None, - http_resp=None, - *, - body: Optional[str] = None, - data: Optional[Any] = None, - ) -> None: - self.status = status - self.reason = reason - self.body = body - self.data = data - self.headers = None - - if http_resp: - if self.status is None: - self.status = http_resp.status - if self.reason is None: - self.reason = http_resp.reason - if self.body is None: - try: - self.body = http_resp.data.decode("utf-8") - except Exception: - pass - self.headers = http_resp.getheaders() - - @classmethod - def from_response( - cls, - *, - http_resp, - body: Optional[str], - data: Optional[Any], - ) -> Self: - if http_resp.status == 400: - raise BadRequestException(http_resp=http_resp, body=body, data=data) - - if http_resp.status == 401: - raise UnauthorizedException(http_resp=http_resp, body=body, data=data) - - if http_resp.status == 403: - raise ForbiddenException(http_resp=http_resp, body=body, data=data) - - if http_resp.status == 404: - raise NotFoundException(http_resp=http_resp, body=body, data=data) - - if 500 <= http_resp.status <= 599: - raise ServiceException(http_resp=http_resp, body=body, data=data) - 
raise ApiException(http_resp=http_resp, body=body, data=data) - - def __str__(self): - """Custom error messages for exception""" - error_message = "({0})\n" "Reason: {1}\n".format(self.status, self.reason) - if self.headers: - error_message += "HTTP response headers: {0}\n".format(self.headers) - - if self.data or self.body: - error_message += "HTTP response body: {0}\n".format(self.data or self.body) - - return error_message - - -class BadRequestException(ApiException): - pass - - -class NotFoundException(ApiException): - pass - - -class UnauthorizedException(ApiException): - pass - - -class ForbiddenException(ApiException): - pass - - -class ServiceException(ApiException): - pass - - -def render_path(path_to_item): - """Returns a string representation of a path""" - result = "" - for pth in path_to_item: - if isinstance(pth, int): - result += "[{0}]".format(pth) - else: - result += "['{0}']".format(pth) - return result diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/confluence_parameters.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/confluence_parameters.py deleted file mode 100644 index e24f0ad..0000000 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/confluence_parameters.py +++ /dev/null @@ -1,137 +0,0 @@ -# coding: utf-8 - -""" -extractor-api-lib - -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) - -Do not edit the class manually. 
-""" # noqa: E501 - - -from __future__ import annotations - -import json -import pprint -import re # noqa: F401 -from typing import Any, ClassVar, Dict, List, Optional, Set - -from pydantic import BaseModel, ConfigDict, Field, StrictBool, StrictStr -from typing import Any, ClassVar, Dict, List, Optional -from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair -from typing import Optional, Set -from typing_extensions import Self - - -class ConfluenceParameters(BaseModel): - """ """ # noqa: E501 - - url: StrictStr = Field(description="url of the confluence space.") - token: StrictStr = Field(description="api key to access confluence.") - space_key: StrictStr = Field(description="the space key of the confluence pages.") - include_attachments: Optional[StrictBool] = Field( - default=False, - description="whether to include file attachments (e.g., images, documents) in the parsed content. Default is `false`.", - ) - keep_markdown_format: Optional[StrictBool] = Field( - default=True, description="whether to preserve markdown formatting in the output. Default is `true`." - ) - keep_newlines: Optional[StrictBool] = Field( - default=True, - description="whether to retain newline characters in the output for better readability. Default is `true`.", - ) - document_name: StrictStr = Field( - description="The name that will be used to store the confluence db in the key value db and the vectordatabase (metadata.document)." 
- ) - confluence_kwargs: Optional[List[KeyValuePair]] = Field( - default=None, description="Additional kwargs like verify_ssl" - ) - __properties: ClassVar[List[str]] = [ - "url", - "token", - "space_key", - "include_attachments", - "keep_markdown_format", - "keep_newlines", - "document_name", - "confluence_kwargs", - ] - - model_config = ConfigDict( - populate_by_name=True, - validate_assignment=True, - protected_namespaces=(), - ) - - def to_str(self) -> str: - """Returns the string representation of the model using alias""" - return pprint.pformat(self.model_dump(by_alias=True)) - - def to_json(self) -> str: - """Returns the JSON representation of the model using alias""" - return self.model_dump_json(by_alias=True, exclude_unset=True) - - @classmethod - def from_json(cls, json_str: str) -> Optional[Self]: - """Create an instance of ConfluenceParameters from a JSON string""" - return cls.from_dict(json.loads(json_str)) - - def to_dict(self) -> Dict[str, Any]: - """Return the dictionary representation of the model using alias. - - This has the following differences from calling pydantic's - `self.model_dump(by_alias=True)`: - - * `None` is only added to the output dict for nullable fields that - were set at model initialization. Other fields with value `None` - are ignored. 
- """ - excluded_fields: Set[str] = set([]) - - _dict = self.model_dump( - by_alias=True, - exclude=excluded_fields, - exclude_none=True, - ) - # override the default output from pydantic by calling `to_dict()` of each item in confluence_kwargs (list) - _items = [] - if self.confluence_kwargs: - for _item_confluence_kwargs in self.confluence_kwargs: - if _item_confluence_kwargs: - _items.append(_item_confluence_kwargs.to_dict()) - _dict["confluence_kwargs"] = _items - return _dict - - @classmethod - def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]: - """Create an instance of ConfluenceParameters from a dict""" - if obj is None: - return None - - if not isinstance(obj, dict): - return cls.model_validate(obj) - - _obj = cls.model_validate( - { - "url": obj.get("url"), - "token": obj.get("token"), - "space_key": obj.get("space_key"), - "include_attachments": ( - obj.get("include_attachments") if obj.get("include_attachments") is not None else False - ), - "keep_markdown_format": ( - obj.get("keep_markdown_format") if obj.get("keep_markdown_format") is not None else True - ), - "keep_newlines": obj.get("keep_newlines") if obj.get("keep_newlines") is not None else True, - "document_name": obj.get("document_name"), - "confluence_kwargs": ( - [KeyValuePair.from_dict(_item) for _item in obj["confluence_kwargs"]] - if obj.get("confluence_kwargs") is not None - else None - ), - } - ) - return _obj diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py deleted file mode 100644 index 4f9f9af..0000000 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py +++ /dev/null @@ -1,101 +0,0 @@ -# coding: utf-8 - -""" -extractor-api-lib - -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - -The version of the 
OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) - -Do not edit the class manually. -""" # noqa: E501 - - -from __future__ import annotations -import pprint -import re # noqa: F401 -import json - -from pydantic import BaseModel, ConfigDict, StrictBytes, StrictStr -from typing import Any, ClassVar, Dict, List, Optional, Tuple, Union -from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair -from typing import Optional, Set -from typing_extensions import Self - - -class ExtractionRequest(BaseModel): - """ """ # noqa: E501 - - file: Optional[Union[StrictBytes, StrictStr, Tuple[StrictStr, StrictBytes]]] = None - type: StrictStr - kwargs: Optional[List[KeyValuePair]] = None - __properties: ClassVar[List[str]] = ["file", "type", "kwargs"] - - model_config = ConfigDict( - populate_by_name=True, - validate_assignment=True, - protected_namespaces=(), - ) - - def to_str(self) -> str: - """Returns the string representation of the model using alias""" - return pprint.pformat(self.model_dump(by_alias=True)) - - def to_json(self) -> str: - """Returns the JSON representation of the model using alias""" - # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead - return json.dumps(self.to_dict()) - - @classmethod - def from_json(cls, json_str: str) -> Optional[Self]: - """Create an instance of ExtractionRequest from a JSON string""" - return cls.from_dict(json.loads(json_str)) - - def to_dict(self) -> Dict[str, Any]: - """Return the dictionary representation of the model using alias. - - This has the following differences from calling pydantic's - `self.model_dump(by_alias=True)`: - - * `None` is only added to the output dict for nullable fields that - were set at model initialization. Other fields with value `None` - are ignored. 
- """ - excluded_fields: Set[str] = set([]) - - _dict = self.model_dump( - by_alias=True, - exclude=excluded_fields, - exclude_none=True, - ) - # override the default output from pydantic by calling `to_dict()` of each item in kwargs (list) - _items = [] - if self.kwargs: - for _item_kwargs in self.kwargs: - if _item_kwargs: - _items.append(_item_kwargs.to_dict()) - _dict["kwargs"] = _items - return _dict - - @classmethod - def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]: - """Create an instance of ExtractionRequest from a dict""" - if obj is None: - return None - - if not isinstance(obj, dict): - return cls.model_validate(obj) - - _obj = cls.model_validate( - { - "file": obj.get("file"), - "type": obj.get("type"), - "kwargs": ( - [KeyValuePair.from_dict(_item) for _item in obj["kwargs"]] - if obj.get("kwargs") is not None - else None - ), - } - ) - return _obj diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/rest.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/rest.py deleted file mode 100644 index 60fc660..0000000 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/rest.py +++ /dev/null @@ -1,209 +0,0 @@ -# coding: utf-8 - -""" -extractor-api-lib - -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) - -Do not edit the class manually. 
-""" # noqa: E501 - - -import io -import json -import re -import ssl - -import urllib3 - -from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiException, ApiValueError - -SUPPORTED_SOCKS_PROXIES = {"socks5", "socks5h", "socks4", "socks4a"} -RESTResponseType = urllib3.HTTPResponse - - -def is_socks_proxy_url(url): - if url is None: - return False - split_section = url.split("://") - if len(split_section) < 2: - return False - else: - return split_section[0].lower() in SUPPORTED_SOCKS_PROXIES - - -class RESTResponse(io.IOBase): - - def __init__(self, resp) -> None: - self.response = resp - self.status = resp.status - self.reason = resp.reason - self.data = None - - def read(self): - if self.data is None: - self.data = self.response.data - return self.data - - def getheaders(self): - """Returns a dictionary of the response headers.""" - return self.response.headers - - def getheader(self, name, default=None): - """Returns a given response header.""" - return self.response.headers.get(name, default) - - -class RESTClientObject: - - def __init__(self, configuration) -> None: - # urllib3.PoolManager will pass all kw parameters to connectionpool - # https://github.com/shazow/urllib3/blob/f9409436f83aeb79fbaf090181cd81b784f1b8ce/urllib3/poolmanager.py#L75 # noqa: E501 - # https://github.com/shazow/urllib3/blob/f9409436f83aeb79fbaf090181cd81b784f1b8ce/urllib3/connectionpool.py#L680 # noqa: E501 - # Custom SSL certificates and client certificates: http://urllib3.readthedocs.io/en/latest/advanced-usage.html # noqa: E501 - - # cert_reqs - if configuration.verify_ssl: - cert_reqs = ssl.CERT_REQUIRED - else: - cert_reqs = ssl.CERT_NONE - - pool_args = { - "cert_reqs": cert_reqs, - "ca_certs": configuration.ssl_ca_cert, - "cert_file": configuration.cert_file, - "key_file": configuration.key_file, - } - if configuration.assert_hostname is not None: - pool_args["assert_hostname"] = configuration.assert_hostname - - if configuration.retries is not None: - 
pool_args["retries"] = configuration.retries - - if configuration.tls_server_name: - pool_args["server_hostname"] = configuration.tls_server_name - - if configuration.socket_options is not None: - pool_args["socket_options"] = configuration.socket_options - - if configuration.connection_pool_maxsize is not None: - pool_args["maxsize"] = configuration.connection_pool_maxsize - - # https pool manager - self.pool_manager: urllib3.PoolManager - - if configuration.proxy: - if is_socks_proxy_url(configuration.proxy): - from urllib3.contrib.socks import SOCKSProxyManager - - pool_args["proxy_url"] = configuration.proxy - pool_args["headers"] = configuration.proxy_headers - self.pool_manager = SOCKSProxyManager(**pool_args) - else: - pool_args["proxy_url"] = configuration.proxy - pool_args["proxy_headers"] = configuration.proxy_headers - self.pool_manager = urllib3.ProxyManager(**pool_args) - else: - self.pool_manager = urllib3.PoolManager(**pool_args) - - def request(self, method, url, headers=None, body=None, post_params=None, _request_timeout=None): - """Perform requests. - - :param method: http request method - :param url: http request url - :param headers: http request headers - :param body: request json body, for `application/json` - :param post_params: request post parameters, - `application/x-www-form-urlencoded` - and `multipart/form-data` - :param _request_timeout: timeout setting for this request. If one - number provided, it will be total request - timeout. It can also be a pair (tuple) of - (connection, read) timeouts. 
- """ - method = method.upper() - assert method in ["GET", "HEAD", "DELETE", "POST", "PUT", "PATCH", "OPTIONS"] - - if post_params and body: - raise ApiValueError("body parameter cannot be used with post_params parameter.") - - post_params = post_params or {} - headers = headers or {} - - timeout = None - if _request_timeout: - if isinstance(_request_timeout, (int, float)): - timeout = urllib3.Timeout(total=_request_timeout) - elif isinstance(_request_timeout, tuple) and len(_request_timeout) == 2: - timeout = urllib3.Timeout(connect=_request_timeout[0], read=_request_timeout[1]) - - try: - # For `POST`, `PUT`, `PATCH`, `OPTIONS`, `DELETE` - if method in ["POST", "PUT", "PATCH", "OPTIONS", "DELETE"]: - - # no content type provided or payload is json - content_type = headers.get("Content-Type") - if not content_type or re.search("json", content_type, re.IGNORECASE): - request_body = None - if body is not None: - request_body = json.dumps(body) - r = self.pool_manager.request( - method, url, body=request_body, timeout=timeout, headers=headers, preload_content=False - ) - elif content_type == "application/x-www-form-urlencoded": - r = self.pool_manager.request( - method, - url, - fields=post_params, - encode_multipart=False, - timeout=timeout, - headers=headers, - preload_content=False, - ) - elif content_type == "multipart/form-data": - # must del headers['Content-Type'], or the correct - # Content-Type which generated by urllib3 will be - # overwritten. - del headers["Content-Type"] - # Ensures that dict objects are serialized - post_params = [(a, json.dumps(b)) if isinstance(b, dict) else (a, b) for a, b in post_params] - r = self.pool_manager.request( - method, - url, - fields=post_params, - encode_multipart=True, - timeout=timeout, - headers=headers, - preload_content=False, - ) - # Pass a `string` parameter directly in the body to support - # other content types than JSON when `body` argument is - # provided in serialized form. 
- elif isinstance(body, str) or isinstance(body, bytes): - r = self.pool_manager.request( - method, url, body=body, timeout=timeout, headers=headers, preload_content=False - ) - elif headers["Content-Type"].startswith("text/") and isinstance(body, bool): - request_body = "true" if body else "false" - r = self.pool_manager.request( - method, url, body=request_body, preload_content=False, timeout=timeout, headers=headers - ) - else: - # Cannot generate the request from given parameters - msg = """Cannot prepare a request message for provided - arguments. Please check that your arguments match - declared content type.""" - raise ApiException(status=0, reason=msg) - # For `GET`, `HEAD` - else: - r = self.pool_manager.request( - method, url, fields={}, timeout=timeout, headers=headers, preload_content=False - ) - except urllib3.exceptions.SSLError as e: - msg = "\n".join([type(e).__name__, str(e)]) - raise ApiException(status=0, reason=msg) - - return RESTResponse(r) diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/__init__.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_content_type.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_content_type.py deleted file mode 100644 index 5a78d9b..0000000 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_content_type.py +++ /dev/null @@ -1,35 +0,0 @@ -# coding: utf-8 - -""" -extractor-api-lib - -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) - -Do not edit the class manually. 
-""" # noqa: E501 - - -import unittest - -from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ContentType - - -class TestContentType(unittest.TestCase): - """ContentType unit test stubs""" - - def setUp(self): - pass - - def tearDown(self): - pass - - def testContentType(self): - """Test ContentType""" - # inst = ContentType() - - -if __name__ == "__main__": - unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_request.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_request.py deleted file mode 100644 index 2f8f1bd..0000000 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_request.py +++ /dev/null @@ -1,58 +0,0 @@ -# coding: utf-8 - -""" -extractor-api-lib - -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) - -Do not edit the class manually. 
-""" # noqa: E501 - - -import unittest - -from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ExtractionRequest - - -class TestExtractionRequest(unittest.TestCase): - """ExtractionRequest unit test stubs""" - - def setUp(self): - pass - - def tearDown(self): - pass - - def make_instance(self, include_optional) -> ExtractionRequest: - """Test ExtractionRequest - include_optional is a boolean, when False only required - params are included, when True both required and - optional params are included""" - # uncomment below to create an instance of `ExtractionRequest` - """ - model = ExtractionRequest() - if include_optional: - return ExtractionRequest( - file = bytes(b'blah'), - type = '', - kwargs = [ - {"value":"value","key":"key"} - ] - ) - else: - return ExtractionRequest( - type = '', - ) - """ - - def testExtractionRequest(self): - """Test ExtractionRequest""" - # inst_req_only = self.make_instance(include_optional=False) - # inst_req_and_optional = self.make_instance(include_optional=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extractor_api.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extractor_api.py deleted file mode 100644 index f39a507..0000000 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extractor_api.py +++ /dev/null @@ -1,35 +0,0 @@ -# coding: utf-8 - -""" -extractor-api-lib - -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) - -Do not edit the class manually. 
-""" # noqa: E501 - - -import unittest - -from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi - - -class TestExtractorApi(unittest.TestCase): - """ExtractorApi unit test stubs""" - - def setUp(self) -> None: - self.api = ExtractorApi() - - def tearDown(self) -> None: - pass - - def test_extract_from_file_post(self) -> None: - """Test case for extract_from_file_post""" - pass - - -if __name__ == "__main__": - unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_information_piece.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_information_piece.py deleted file mode 100644 index 479c858..0000000 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_information_piece.py +++ /dev/null @@ -1,62 +0,0 @@ -# coding: utf-8 - -""" -extractor-api-lib - -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) - -Do not edit the class manually. 
-""" # noqa: E501 - - -import unittest - -from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import InformationPiece - - -class TestInformationPiece(unittest.TestCase): - """InformationPiece unit test stubs""" - - def setUp(self): - pass - - def tearDown(self): - pass - - def make_instance(self, include_optional) -> InformationPiece: - """Test InformationPiece - include_optional is a boolean, when False only required - params are included, when True both required and - optional params are included""" - # uncomment below to create an instance of `InformationPiece` - """ - model = InformationPiece() - if include_optional: - return InformationPiece( - metadata = [ - {"value":"value","key":"key"} - ], - page_content = '', - type = 'IMAGE' - ) - else: - return InformationPiece( - metadata = [ - {"value":"value","key":"key"} - ], - page_content = '', - type = 'IMAGE', - ) - """ - - def testInformationPiece(self): - """Test InformationPiece""" - # inst_req_only = self.make_instance(include_optional=False) - # inst_req_and_optional = self.make_instance(include_optional=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_key_value_pair.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_key_value_pair.py deleted file mode 100644 index 0ddc864..0000000 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_key_value_pair.py +++ /dev/null @@ -1,54 +0,0 @@ -# coding: utf-8 - -""" -extractor-api-lib - -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) - -Do not edit the class manually. 
-""" # noqa: E501 - - -import unittest - -from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair - - -class TestKeyValuePair(unittest.TestCase): - """KeyValuePair unit test stubs""" - - def setUp(self): - pass - - def tearDown(self): - pass - - def make_instance(self, include_optional) -> KeyValuePair: - """Test KeyValuePair - include_optional is a boolean, when False only required - params are included, when True both required and - optional params are included""" - # uncomment below to create an instance of `KeyValuePair` - """ - model = KeyValuePair() - if include_optional: - return KeyValuePair( - key = None, - value = None - ) - else: - return KeyValuePair( - ) - """ - - def testKeyValuePair(self): - """Test KeyValuePair""" - # inst_req_only = self.make_instance(include_optional=False) - # inst_req_and_optional = self.make_instance(include_optional=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/file_services/__init__.py b/admin-api-lib/src/admin_api_lib/file_services/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/admin-api-lib/src/admin_api_lib/impl/admin_api.py b/admin-api-lib/src/admin_api_lib/impl/admin_api.py index 25745c5..dd39f3c 100644 --- a/admin-api-lib/src/admin_api_lib/impl/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/impl/admin_api.py @@ -1,9 +1,7 @@ """Module containing the implementation of the Admin API.""" -from dataclasses import Field import logging from typing import List, Optional -from typing_extensions import Annotated from pydantic import Field, StrictBytes, StrictStr from admin_api_lib.api_endpoints.source_uploader import SourceUploader @@ -12,12 +10,11 @@ from dependency_injector.wiring import Provide, inject from fastapi import Depends, Request, Response, UploadFile -from admin_api_lib.api_endpoints.confluence_loader import ConfluenceLoader + from admin_api_lib.api_endpoints.document_deleter import 
DocumentDeleter from admin_api_lib.api_endpoints.document_reference_retriever import ( DocumentReferenceRetriever, ) -from admin_api_lib.api_endpoints.document_uploader import DocumentUploader from admin_api_lib.api_endpoints.documents_status_retriever import ( DocumentsStatusRetriever, ) diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_confluence_loader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_confluence_loader.py deleted file mode 100644 index 54fcfda..0000000 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_confluence_loader.py +++ /dev/null @@ -1,195 +0,0 @@ -"""Module for the DefaultConfluenceLoader class.""" - -import logging -from asyncio import run -from threading import Thread -import threading - -from fastapi import HTTPException, status -from langchain_core.documents import Document - -from admin_api_lib.api_endpoints.confluence_loader import ConfluenceLoader -from admin_api_lib.api_endpoints.document_deleter import DocumentDeleter -from admin_api_lib.chunker.chunker import Chunker -from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ( - ExtractorApi, -) -from admin_api_lib.impl.key_db.file_status_key_value_store import ( - FileStatusKeyValueStore, -) -from admin_api_lib.impl.mapper.confluence_settings_mapper import ( - ConfluenceSettingsMapper, -) -from admin_api_lib.impl.mapper.informationpiece2document import ( - InformationPiece2Document, -) -from admin_api_lib.impl.settings.confluence_settings import ConfluenceSettings -from admin_api_lib.information_enhancer.information_enhancer import InformationEnhancer -from admin_api_lib.models.status import Status -from admin_api_lib.rag_backend_client.openapi_client.api.rag_api import RagApi -from admin_api_lib.utils.utils import sanitize_document_name - -logger = logging.getLogger(__name__) - - -class DefaultConfluenceLoader(ConfluenceLoader): - """ - DefaultConfluenceLoader is responsible for loading content from 
Confluence asynchronously. - - Attributes - ---------- - CONFLUENCE_SPACE : str - The Confluence space key. - """ - - CONFLUENCE_SPACE = "confluence_space" - - def __init__( - self, - extractor_api: ExtractorApi, - settings: ConfluenceSettings, - information_mapper: InformationPiece2Document, - rag_api: RagApi, - key_value_store: FileStatusKeyValueStore, - information_enhancer: InformationEnhancer, - chunker: Chunker, - document_deleter: DocumentDeleter, - settings_mapper: ConfluenceSettingsMapper, - ): - """ - Initialize the DefaultConfluenceLoader with the provided dependencies. - - Parameters - ---------- - extractor_api : ExtractorApi - The API for extracting information. - settings : ConfluenceSettings - The settings for Confluence. - information_mapper : InformationPiece2Document - The mapper for information pieces to langchain documents. - rag_api : RagApi - The API client for interacting with the RAG backend system. - key_value_store : FileStatusKeyValueStore - The key-value store to store file names and the corresponding file statuses. - information_enhancer : InformationEnhancer - The enhancer for information pieces. - chunker : Chunker - The chunker for breaking down documents into chunks. - document_deleter : DocumentDeleter - The deleter for documents from S3 Storage and Vector Database. - settings_mapper : ConfluenceSettingsMapper - The mapper to map the Confluence settings to confluence parameters. - """ - self._extractor_api = extractor_api - self._rag_api = rag_api - self._settings = settings - self._key_value_store = key_value_store - self._information_mapper = information_mapper - self._information_enhancer = information_enhancer - self._chunker = chunker - self._document_deleter = document_deleter - self._settings_mapper = settings_mapper - self._background_thread = None - self._document_key = None - - async def aload_from_confluence(self) -> None: - """ - Asynchronously loads content from Confluence using the configured settings. 
- - Raises - ------ - HTTPException - If the Confluence loader is not configured or if a load is already in progress. - """ - for index in range(len(self._settings.url)): - if not ( - self._settings.url[index].strip() - and self._settings.space_key[index].strip() - and self._settings.token[index].strip() - ): - raise HTTPException( - status.HTTP_501_NOT_IMPLEMENTED, - "The confluence loader is not configured! Required fields are missing.", - ) - - if self._background_thread is not None and self._background_thread.is_alive(): - raise HTTPException( - status.HTTP_423_LOCKED, "Confluence loader is locked... Please wait for the current load to finish." - ) - self._background_thread = Thread(target=lambda: run(self._aload_from_confluence())) - self._background_thread.start() - - async def _aload_from_confluence(self) -> None: - async def process_confluence(index): - logger.info("Loading from Confluence %s", self._settings.url[index]) - self._sanitize_document_name(index=index) - - params = self._settings_mapper.map_settings_to_params(self._settings, index) - try: - self._key_value_store.upsert(self._settings.document_name[index], Status.PROCESSING) - information_pieces = self._extractor_api.extract_from_confluence_post(params) - documents = [ - self._information_mapper.extractor_information_piece2document(x) for x in information_pieces - ] - documents = await self._aenhance_langchain_documents(documents) - chunked_documents = self._chunker.chunk(documents) - rag_information_pieces = [ - self._information_mapper.document2rag_information_piece(doc) for doc in chunked_documents - ] - except Exception as e: - self._key_value_store.upsert(self._settings.document_name[index], Status.ERROR) - - logger.error("Error while loading from Confluence: %s", str(e)) - raise HTTPException( - status.HTTP_500_INTERNAL_SERVER_ERROR, f"Error loading from Confluence: {str(e)}" - ) from e - - await self._delete_previous_information_pieces(index=index) - 
self._key_value_store.upsert(self._settings.document_name[index], Status.UPLOADING) - self._upload_information_pieces(rag_information_pieces, index=index) - - threads = [] - for idx in range(len(self._settings.url)): - t = threading.Thread(target=lambda idx=idx: run(process_confluence(idx))) - threads.append(t) - t.start() - for t in threads: - t.join() - - async def _aenhance_langchain_documents(self, documents: list[Document]): - try: - return await self._information_enhancer.ainvoke(documents) - except Exception as e: - logger.error("Exception occured while enhancing confluence langchain document %s" % e) - raise e - - async def _delete_previous_information_pieces(self, index=0): - try: - await self._document_deleter.adelete_document(self._settings.document_name[index]) - except HTTPException as e: - logger.error( - ( - "Error while trying to delete documents with id: %s before uploading %s." - "NOTE: Still continuing with upload." - ), - self._settings.document_name[index], - e, - ) - - def _upload_information_pieces(self, rag_api_documents, index=0): - try: - self._rag_api.upload_information_piece(rag_api_documents) - self._key_value_store.upsert(self._settings.document_name[index], Status.READY) - logger.info("Confluence loaded successfully") - except Exception as e: - self._key_value_store.upsert(self._settings.document_name[index], Status.ERROR) - logger.error("Error while uploading Confluence to the database: %s", str(e)) - raise HTTPException(500, f"Error loading from Confluence: {str(e)}") from e - - def _sanitize_document_name(self, index) -> None: - document_name = ( - self._settings.document_name[index] if self._settings.document_name[index] else self._settings.url[index] - ) - document_name = document_name.replace("http://", "").replace("https://", "") - - self._settings.document_name[index] = sanitize_document_name(document_name) diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_document_uploader.py 
b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_document_uploader.py deleted file mode 100644 index 549be19..0000000 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_document_uploader.py +++ /dev/null @@ -1,192 +0,0 @@ -"""Module for the DefaultDocumentUploader class.""" - -import logging -import tempfile -import traceback -import urllib -from asyncio import run -from pathlib import Path -from threading import Thread - -from fastapi import HTTPException, Request, UploadFile, status - -from admin_api_lib.api_endpoints.document_deleter import DocumentDeleter -from admin_api_lib.api_endpoints.document_uploader import DocumentUploader -from admin_api_lib.chunker.chunker import Chunker -from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ( - ExtractorApi, -) -from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ( - ExtractionRequest, -) -from admin_api_lib.file_services.file_service import FileService -from admin_api_lib.impl.key_db.file_status_key_value_store import ( - FileStatusKeyValueStore, -) -from admin_api_lib.impl.mapper.informationpiece2document import ( - InformationPiece2Document, -) -from admin_api_lib.information_enhancer.information_enhancer import InformationEnhancer -from admin_api_lib.models.status import Status -from admin_api_lib.rag_backend_client.openapi_client.api.rag_api import RagApi -from admin_api_lib.utils.utils import sanitize_document_name - -logger = logging.getLogger(__name__) - - -class DefaultDocumentUploader(DocumentUploader): - """DefaultDocumentUploader is responsible for handling the upload, processing, and storage of documents.""" - - def __init__( - self, - document_extractor: ExtractorApi, - file_service: FileService, - rag_api: RagApi, - information_enhancer: InformationEnhancer, - information_mapper: InformationPiece2Document, - chunker: Chunker, - key_value_store: FileStatusKeyValueStore, - document_deleter: DocumentDeleter, - ): - """ 
- Initialize the DefaultDocumentUploader. - - Parameters - ---------- - document_extractor : ExtractorApi - The API for extracting documents. - file_service : FileService - The service for handling file operations on the S3 storage - rag_api : RagApi - The API for RAG backend. - information_enhancer : InformationEnhancer - The service for enhancing information. - information_mapper : InformationPiece2Document - The mapper for converting information pieces to langchain documents. - chunker : Chunker - The service for chunking documents into chunks. - key_value_store : FileStatusKeyValueStore - The key-value store for storing filename and the corresponding status. - document_deleter : DocumentDeleter - The service for deleting documents. - """ - self._document_extractor = document_extractor - self._file_service = file_service - self._rag_api = rag_api - self._information_enhancer = information_enhancer - self._information_mapper = information_mapper - self._chunker = chunker - self._key_value_store = key_value_store - self._document_deleter = document_deleter - self._background_threads = [] - - async def aupload_documents_post( - self, - body: UploadFile, - request: Request, - ) -> None: - """ - Handle the uploading of documents via a POST request. - - This asynchronous method reads the content of the uploaded file and starts a background - thread to save the document in S3 storage and the vector database. It updates the status - of the document in the key-value store and handles any exceptions that may occur during - the process. - - Parameters - ---------- - body : UploadFile - The uploaded file. - request : Request - The request object. - - Raises - ------ - HTTPException - If there is a ValueError, raises a 400 Bad Request error. - HTTPException - If there is any other exception, raises a 500 Internal Server Error. 
- """ - self._background_threads = [t for t in self._background_threads if t.is_alive()] - content = await body.read() - body.filename = sanitize_document_name(body.filename) - try: - self._key_value_store.upsert(body.filename, Status.UPLOADING) - thread = Thread(target=lambda: run(self._asave_new_document(content, body.filename, request))) - thread.start() - self._background_threads.append(thread) - except ValueError as e: - self._key_value_store.upsert(body.filename, Status.ERROR) - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(e)) - except Exception as e: - self._key_value_store.upsert(body.filename, Status.ERROR) - raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e)) - - async def _asave_new_document( - self, - file_content: bytes, - filename: str, - request: Request, - ): - try: - await self._document_deleter.adelete_document(filename) - except HTTPException as e: - logger.error( - "Error while trying to delete file %s before uploading %s. 
Still continuing with upload.", filename, e - ) - self._key_value_store.upsert(filename, Status.ERROR) - - try: - with tempfile.TemporaryDirectory() as temp_dir: - temp_file_path = Path(temp_dir) / filename - with open(temp_file_path, "wb") as temp_file: - logger.debug("Temporary file created at %s.", temp_file_path) - temp_file.write(file_content) - logger.debug("Temp file created and content written.") - - await self._aparse_document(Path(temp_file_path), request) - except Exception as e: - logger.error("Error during document parsing: %s %s", e, traceback.format_exc()) - self._key_value_store.upsert(filename, Status.ERROR) - - async def _aparse_document( - self, - s3_file_path: Path, - request: Request, - ): - logger.debug("START parsing of the document %s", s3_file_path) - filename = s3_file_path.name - - self._file_service.upload_file(s3_file_path, filename) - self._key_value_store.upsert(filename, Status.PROCESSING) - - information_pieces = self._document_extractor.extract_from_file_post(ExtractionRequest(path_on_s3=filename)) - if not information_pieces: - self._key_value_store.upsert(filename, Status.ERROR) - logger.error("No information pieces found in the document: %s", filename) - raise HTTPException(status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, detail="No information pieces found") - documents = [self._information_mapper.extractor_information_piece2document(x) for x in information_pieces] - host_base_url = str(request.base_url) - document_url = f"{host_base_url.rstrip('/')}/document_reference/{urllib.parse.quote_plus(filename)}" - - chunked_documents = self._chunker.chunk(documents) - - for idx, chunk in enumerate(chunked_documents): - if chunk.metadata["id"] in chunk.metadata["related"]: - chunk.metadata["related"].remove(chunk.metadata["id"]) - chunk.metadata.update( - { - "chunk": idx, - "chunk_length": len(chunk.page_content), - "document_url": document_url, - } - ) - - enhanced_documents = await 
self._information_enhancer.ainvoke(chunked_documents) - rag_information_pieces = [ - self._information_mapper.document2rag_information_piece(doc) for doc in enhanced_documents - ] - - self._rag_api.upload_information_piece(rag_information_pieces) - self._key_value_store.upsert(filename, Status.READY) - logger.info("File uploaded successfully: %s", filename) diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py index 1b2f31c..d520293 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py @@ -1,10 +1,13 @@ from http.client import HTTPException import logging -from typing import Optional +import os +from pathlib import Path +from typing import Optional, Tuple, Union from threading import Thread import urllib +import tempfile -from pydantic import StrictStr +from pydantic import StrictBytes, StrictStr from fastapi import UploadFile, status from langchain_core.documents import Document from asyncio import run @@ -16,7 +19,7 @@ from admin_api_lib.api_endpoints.source_uploader import SourceUploader from admin_api_lib.chunker.chunker import Chunker from admin_api_lib.models.status import Status -from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi +from admin_api_lib.extractor_api_client.extractor_api_client import ExtractorApiClient from admin_api_lib.impl.key_db.file_status_key_value_store import FileStatusKeyValueStore from admin_api_lib.information_enhancer.information_enhancer import InformationEnhancer from admin_api_lib.utils.utils import sanitize_document_name @@ -28,7 +31,7 @@ class DefaultSourceUploader(SourceUploader): def __init__( self, - extractor_api: ExtractorApi, + extractor_api: ExtractorApiClient, key_value_store: FileStatusKeyValueStore, information_enhancer: InformationEnhancer, 
chunker: Chunker, @@ -60,8 +63,14 @@ async def upload_source( self._key_value_store.upsert( source_name, Status.PROCESSING ) # TODO: change to pipeline with timeout to error status + filename = None + if file: + content = await file.read() + filename = Path("/tmp/" + file.filename) + with open(filename, "wb") as tmpfile: + tmpfile.write(content) thread = Thread( - target=lambda: run(self._handle_source_upload(source_name, base_url, type, name, file, kwargs)) + target=lambda: run(self._handle_source_upload(source_name, base_url, type, name, filename, kwargs)) ) thread.start() self._background_threads.append(thread) @@ -79,11 +88,15 @@ async def _handle_source_upload( base_url: str, type: StrictStr, name: StrictStr, - file: Optional[UploadFile], + file, #: Optional[Union[StrictBytes, StrictStr, Tuple[StrictStr, StrictBytes]]], kwargs: Optional[list[KeyValuePair]], ): try: - information_pieces = self._extractor_api.extract(type, name, file, kwargs) + if file: + information_pieces = self._extractor_api.extract(type, source_name, str(file), kwargs) + else: + information_pieces = self._extractor_api.extract(type, source_name, None, kwargs) + if not information_pieces: self._key_value_store.upsert(source_name, Status.ERROR) logger.error("No information pieces found in the document: %s", source_name) @@ -98,10 +111,16 @@ async def _handle_source_upload( ] # Replace old document - await self._document_deleter.adelete_document(source_name) + try: + await self._document_deleter.adelete_document(source_name) + except Exception as e: + # deletion is allowed to fail + pass self._rag_api.upload_information_piece(rag_information_pieces) self._key_value_store.upsert(source_name, Status.READY) logger.info("File uploaded successfully: %s", source_name) + if file: + os.remove(file) except Exception as e: self._key_value_store.upsert(source_name, Status.ERROR) logger.error("Error while uploading %s = %s", source_name, str(e)) diff --git 
a/admin-api-lib/src/admin_api_lib/impl/mapper/confluence_settings_mapper.py b/admin-api-lib/src/admin_api_lib/impl/mapper/confluence_settings_mapper.py deleted file mode 100644 index 552535f..0000000 --- a/admin-api-lib/src/admin_api_lib/impl/mapper/confluence_settings_mapper.py +++ /dev/null @@ -1,36 +0,0 @@ -"""Module for the ConfluenceSettingsMapper class.""" - -from admin_api_lib.extractor_api_client.openapi_client.models.confluence_parameters import ( - ConfluenceParameters, -) -from admin_api_lib.impl.settings.confluence_settings import ConfluenceSettings - - -class ConfluenceSettingsMapper: - """Mapper class for converting ConfluenceSettings to ConfluenceParameters.""" - - @staticmethod - def map_settings_to_params(settings: ConfluenceSettings, index) -> ConfluenceParameters: - """ - Map ConfluenceSettings to ConfluenceParameters. - - Parameters - ---------- - settings : ConfluenceSettings - The settings object containing Confluence configuration. - - Returns - ------- - ConfluenceParameters - The parameters object for API consumption. 
- """ - return ConfluenceParameters( - url=settings.url[index], - token=settings.token[index], - space_key=settings.space_key[index], - include_attachments=settings.include_attachments[index], - keep_markdown_format=settings.keep_markdown_format[index], - keep_newlines=settings.keep_newlines[index], - document_name=settings.document_name[index], - confluence_kwargs=[{"key": "verify_ssl", "value": settings.verify_ssl[index]}], - ) diff --git a/admin-api-lib/src/admin_api_lib/impl/mapper/informationpiece2document.py b/admin-api-lib/src/admin_api_lib/impl/mapper/informationpiece2document.py index a3a40ce..6f0ac2f 100644 --- a/admin-api-lib/src/admin_api_lib/impl/mapper/informationpiece2document.py +++ b/admin-api-lib/src/admin_api_lib/impl/mapper/informationpiece2document.py @@ -4,10 +4,10 @@ from langchain_core.documents import Document as LangchainDocument -from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ( +from admin_api_lib.extractor_api_client.models.content_type import ( ContentType as ExtractorInformaType, ) -from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import ( +from admin_api_lib.extractor_api_client.models.information_piece import ( InformationPiece as ExtractorInformationPiece, ) from admin_api_lib.rag_backend_client.openapi_client.models.information_piece import ( diff --git a/admin-api-lib/src/admin_api_lib/impl/settings/confluence_settings.py b/admin-api-lib/src/admin_api_lib/impl/settings/confluence_settings.py deleted file mode 100644 index acf77fc..0000000 --- a/admin-api-lib/src/admin_api_lib/impl/settings/confluence_settings.py +++ /dev/null @@ -1,170 +0,0 @@ -"""Contains settings regarding the confluence.""" - -from typing import Optional -from admin_api_lib.impl.utils.comma_separated_bool_list import CommaSeparatedBoolList -from admin_api_lib.impl.utils.comma_separated_str_list import CommaSeparatedStrList -from pydantic import Field, model_validator -from pydantic_settings 
import BaseSettings -import logging - -logger = logging.getLogger(__name__) - - -class ConfluenceSettings(BaseSettings): - """ - Contains configuration settings for the Confluence integration. - - Parameters - ---------- - url : CommaSeparatedStrList, optional - List of Confluence URLs. - token : CommaSeparatedStrList, optional - List of authentication tokens. - space_key : CommaSeparatedStrList, optional - List of Confluence space keys. - document_name : CommaSeparatedStrList, optional - List of document names. - verify_ssl : CommaSeparatedBoolList, optional - List of booleans indicating whether SSL verification is enabled. - include_attachments : CommaSeparatedBoolList, optional - Indicates whether to include attachments in the integration. - keep_markdown_format : CommaSeparatedBoolList, optional - Determines if markdown formatting is maintained. - keep_newlines : CommaSeparatedBoolList, optional - Indicates whether newlines are preserved. - """ - - class Config: - """Config class for reading Fields from env.""" - - env_prefix = "CONFLUENCE_" - case_sensitive = False - - url: Optional[CommaSeparatedStrList] = Field(default_factory=CommaSeparatedStrList) - token: Optional[CommaSeparatedStrList] = Field(default_factory=CommaSeparatedStrList) - space_key: Optional[CommaSeparatedStrList] = Field(default_factory=CommaSeparatedStrList) - document_name: Optional[CommaSeparatedStrList] = Field(default_factory=CommaSeparatedStrList) - verify_ssl: Optional[CommaSeparatedBoolList] = Field(default_factory=CommaSeparatedBoolList) - include_attachments: Optional[CommaSeparatedBoolList] = Field(default_factory=CommaSeparatedBoolList) - keep_markdown_format: Optional[CommaSeparatedBoolList] = Field(default_factory=CommaSeparatedBoolList) - keep_newlines: Optional[CommaSeparatedBoolList] = Field(default_factory=CommaSeparatedBoolList) - - @model_validator(mode="after") - def check_lists_length_consistency(cls, values): - """ - Validate that all list-valued settings have the same 
length. - - If not, the list is adjusted accordingly. - - Parameters - ---------- - values : dict - Dictionary of configuration settings. - - Returns - ------- - dict - The validated values dictionary with consistent list lengths. - - Raises - ------ - ValueError - If any non-optional list has a different length compared to others. - """ - # Define the keys to check - keys = [ - "url", - "token", - "space_key", - "document_name", - "verify_ssl", - "include_attachments", - "keep_markdown_format", - "keep_newlines", - ] - - lengths = {} - for key in keys: - value = getattr(values, key, None) - if value is not None: - lengths[key] = len(value) - # If there is more than one list with values, ensure they have the same length - optional_keys = ["document_name", "verify_ssl", "include_attachments", "keep_markdown_format", "keep_newlines"] - if lengths: - # Use the first encountered length as reference - ref_length = next(iter(lengths.values())) - for key, length in lengths.items(): - if length != ref_length and key not in optional_keys: - raise ValueError( - f"Confluence Settings length mismatch: Expected all lists to have {ref_length} elements, " - f"but '{key}' has {length} elements. 
{lengths}" - ) - - urls = getattr(values, "url", None) - if urls and len(urls) > 0: - n = len(urls) - try: - document_name = getattr(values, "document_name", None) - if not document_name or len(document_name) == 0: - values.document_name = CommaSeparatedStrList([""] * n) - elif len(document_name) != n: - raise ValueError("document_name list length mismatch") - except ValueError as e: - logger.error(f"Error setting document_name: {e}") - logger.warning("Setting document_name to default values") - document_name = getattr(values, "document_name", []) - values.document_name = CommaSeparatedStrList(document_name + [""] * (n - len(document_name))) - - try: - verify_ssl = getattr(values, "verify_ssl", None) - if not verify_ssl or len(verify_ssl) == 0: - values.verify_ssl = CommaSeparatedBoolList([True] * n) - elif len(verify_ssl) != n: - raise ValueError("verify_ssl list length mismatch") - except ValueError as e: - logger.error(f"Error setting verify_ssl: {e}") - logger.warning("Setting verify_ssl to default values") - verify_ssl = getattr(values, "verify_ssl", []) - values.verify_ssl = CommaSeparatedBoolList(verify_ssl + [True] * (n - len(verify_ssl))) - - try: - include_attachments = getattr(values, "include_attachments", None) - if not include_attachments or len(include_attachments) == 0: - values.include_attachments = CommaSeparatedBoolList([False] * n) - elif len(include_attachments) != n: - raise ValueError("include_attachments list length mismatch") - except ValueError as e: - logger.error(f"Error setting include_attachments: {e}") - logger.warning("Setting include_attachments to default values") - include_attachments = getattr(values, "include_attachments", []) - values.include_attachments = CommaSeparatedBoolList( - include_attachments + [False] * (n - len(include_attachments)) - ) - - try: - keep_markdown_format = getattr(values, "keep_markdown_format", None) - if not keep_markdown_format or len(keep_markdown_format) == 0: - values.keep_markdown_format = 
CommaSeparatedBoolList([True] * n) - elif len(keep_markdown_format) != n: - raise ValueError("keep_markdown_format list length mismatch") - except ValueError as e: - logger.error(f"Error setting keep_markdown_format: {e}") - logger.warning("Setting keep_markdown_format to default values") - keep_markdown_format = getattr(values, "keep_markdown_format", []) - values.keep_markdown_format = CommaSeparatedBoolList( - keep_markdown_format + [True] * (n - len(keep_markdown_format)) - ) - - try: - keep_newlines = getattr(values, "keep_newlines", None) - if not keep_newlines or len(keep_newlines) == 0: - values.keep_newlines = CommaSeparatedBoolList([True] * n) - elif len(keep_newlines) != n: - raise ValueError("keep_newlines list length mismatch") - except ValueError as e: - logger.error(f"Error setting keep_newlines: {e}") - logger.warning("Setting keep_newlines to default values") - keep_newlines = getattr(values, "keep_newlines", []) - values.keep_newlines = CommaSeparatedBoolList(keep_newlines + [True] * (n - len(keep_newlines))) - - return values diff --git a/extractor-api-lib/openapi.yaml b/extractor-api-lib/openapi.yaml index 262f11b..81ca3e2 100644 --- a/extractor-api-lib/openapi.yaml +++ b/extractor-api-lib/openapi.yaml @@ -86,8 +86,7 @@ components: properties: file: description: "" - format: binary - type: string + type: file type: description: "" type: string diff --git a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py index 0cbdc2b..38c9a1d 100644 --- a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py +++ b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py @@ -51,7 +51,7 @@ async def extract( type: Annotated[str, Form()], name: Annotated[str, Form()], file: Optional[UploadFile] = None, - kwargs: Optional[Annotated[List[KeyValuePair], Form()]]=None, + kwargs: Optional[Annotated[List[KeyValuePair], Form()]] = None, ) -> List[InformationPiece]: if not 
BaseExtractorApi.subclasses: raise HTTPException(status_code=500, detail="Not implemented") diff --git a/extractor-api-lib/src/extractor_api_lib/extractors/information_file_extractor.py b/extractor-api-lib/src/extractor_api_lib/extractors/information_file_extractor.py index 8b54f1c..553d79a 100644 --- a/extractor-api-lib/src/extractor_api_lib/extractors/information_file_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/extractors/information_file_extractor.py @@ -35,7 +35,7 @@ def compatible_file_types(self) -> list[FileType]: """ @abstractmethod - async def aextract_content(self, file_path: Path) -> list[InternalInformationPiece]: + async def aextract_content(self, file_path: Path, name: str) -> list[InternalInformationPiece]: """ Extract content from given file. @@ -43,7 +43,9 @@ async def aextract_content(self, file_path: Path) -> list[InternalInformationPie ---------- file_path : Path Path to the file the information should be extracted from. - + name : str + Name of the document. 
+ Returns ------- list[InformationPiece] diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/ms_docs_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/ms_docs_extractor.py index cb04681..c67425d 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/ms_docs_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/ms_docs_extractor.py @@ -12,7 +12,6 @@ from unstructured.partition.pptx import partition_pptx - from extractor_api_lib.file_services.file_service import FileService from extractor_api_lib.models.information_piece import InformationPiece from extractor_api_lib.extractors.information_file_extractor import InformationFileExtractor @@ -54,7 +53,7 @@ def compatible_file_types(self) -> list[FileType]: """ return [FileType.DOCX, FileType.PPTX] - async def aextract_content(self, file_path: Path) -> list[InternalInformationPiece]: + async def aextract_content(self, file_path: Path, name: str) -> list[InternalInformationPiece]: """ Extract content from a given file based on its extension. @@ -62,7 +61,8 @@ async def aextract_content(self, file_path: Path) -> list[InternalInformationPie ---------- file_path : Path The path to the file from which content is to be extracted. - + name : str + Name of the document. Returns ------- list[InformationPiece] diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/pdf_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/pdf_extractor.py index 01eb6bf..8d5bd35 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/pdf_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/pdf_extractor.py @@ -37,7 +37,7 @@ class PDFExtractor(InformationFileExtractor): Attributes ---------- TITLE_PATTERN : re.Pattern - Regular expression pattern to identify titles in the text. 
+ Regular expression pattern to identify titles in the text. TITLE_PATTERN_MULTILINE : re.Pattern Regular expression pattern to identify titles in the text with multiline support. """ @@ -104,13 +104,15 @@ def _create_information_piece( page_content=content, ) - async def aextract_content(self, file_path: Path) -> list[InternalInformationPiece]: + async def aextract_content(self, file_path: Path, name: str) -> list[InternalInformationPiece]: """Extract content from given file. Parameters ---------- file_path : Path Path to the file the information should be extracted from. + name : str + Name of the document. Returns ------- @@ -136,7 +138,7 @@ async def aextract_content(self, file_path: Path) -> list[InternalInformationPie page=page, temp_dir=temp_dir, title=current_title, - document_name=file_path.name, + document_name=name, ) pdf_elements += new_pdf_elements diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/xml_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/xml_extractor.py index 2a9d21c..e7523b6 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/xml_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/xml_extractor.py @@ -45,7 +45,7 @@ def compatible_file_types(self) -> list[FileType]: """ return [FileType.XML] - async def aextract_content(self, file_path: Path) -> list[InternalInformationPiece]: + async def aextract_content(self, file_path: Path, name: str) -> list[InternalInformationPiece]: """ Extract content from an XML file and processes the elements. @@ -53,6 +53,8 @@ async def aextract_content(self, file_path: Path) -> list[InternalInformationPie ---------- file_path : Path The path to the XML file to be processed. + name : str + Name of the document. 
Returns ------- diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractors/general_file_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/general_file_extractor.py index dfb7031..04abb2c 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractors/general_file_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractors/general_file_extractor.py @@ -40,7 +40,7 @@ def __init__(self, file_service: FileService, available_extractors: list[Informa available_extractors : list of InformationExtractor A list of available information extractors to be used by the GeneralExtractor. """ - self._file_service=file_service + self._file_service = file_service self._available_extractors = available_extractors @property @@ -84,7 +84,7 @@ async def aextract_content( ] if not correct_extractors: raise ValueError(f"No extractor found for file-ending {file_type}") - return await correct_extractors[-1].aextract_content(temp_file_path) + return await correct_extractors[-1].aextract_content(temp_file_path, name) except Exception as e: logger.error("Error during document parsing: %s %s", e, traceback.format_exc()) raise e diff --git a/rag-core-api/src/rag_core_api/apis/rag_api.py b/rag-core-api/src/rag_core_api/apis/rag_api.py index 64597dd..fb432c6 100644 --- a/rag-core-api/src/rag_core_api/apis/rag_api.py +++ b/rag-core-api/src/rag_core_api/apis/rag_api.py @@ -3,6 +3,10 @@ # coding: utf-8 # flake8: noqa: D105 +from asyncio import FIRST_COMPLETED, CancelledError, create_task, wait +from contextlib import suppress +import logging +from time import sleep from typing import Dict, List # noqa: F401 import importlib import pkgutil @@ -32,6 +36,7 @@ from rag_core_api.models.extra_models import TokenModel # noqa: F401 from pydantic import Field, StrictStr from typing import Any, List +import logging from typing_extensions import Annotated from rag_core_api.models.chat_request import ChatRequest from rag_core_api.models.chat_response import 
ChatResponse From 2e591c3900c457497d270f01a3add5120e0e1536 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Thu, 15 May 2025 13:16:20 +0200 Subject: [PATCH 06/43] wip --- .../src/admin_api_lib/apis/admin_api.py | 55 +++++-- .../src/admin_api_lib/apis/admin_api_base.py | 18 +-- .../api_endpoints/default_source_uploader.py | 2 +- .../admin_api_lib/models/document_status.py | 6 +- .../models/http_validation_error.py | 101 ++++++++++++ .../admin_api_lib/models/key_value_pair.py | 24 +-- .../src/admin_api_lib/models/status.py | 4 +- .../src/admin_api_lib/models/upload_source.py | 8 +- .../admin_api_lib/models/validation_error.py | 105 ++++++++++++ .../models/validation_error_loc_inner.py | 153 ++++++++++++++++++ 10 files changed, 429 insertions(+), 47 deletions(-) create mode 100644 admin-api-lib/src/admin_api_lib/models/http_validation_error.py create mode 100644 admin-api-lib/src/admin_api_lib/models/validation_error.py create mode 100644 admin-api-lib/src/admin_api_lib/models/validation_error_loc_inner.py diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api.py b/admin-api-lib/src/admin_api_lib/apis/admin_api.py index 9d32286..15f8438 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api.py @@ -1,6 +1,6 @@ # coding: utf-8 -from typing import Dict, List, Annotated # noqa: F401 +from typing import Dict, List # noqa: F401 import importlib import pkgutil @@ -26,9 +26,10 @@ from admin_api_lib.models.extra_models import TokenModel # noqa: F401 from pydantic import Field, StrictBytes, StrictStr -from typing import Any, List, Optional, Tuple, Union +from typing import Any, List, Tuple, Union from typing_extensions import Annotated from admin_api_lib.models.document_status import DocumentStatus +from admin_api_lib.models.http_validation_error import HTTPValidationError from admin_api_lib.models.key_value_pair import KeyValuePair @@ -44,12 +45,14 @@ responses={ 200: {"description": "Deleted"}, 500: 
{"description": "Internal server error"}, + 422: {"model": HTTPValidationError, "description": "Validation Error"}, }, tags=["admin"], + summary="Delete Document", response_model_by_alias=True, ) async def delete_document( - identification: str = Path(..., description=""), + identification: StrictStr = Path(..., description=""), ) -> None: """ Asynchronously deletes a document based on the provided identification. @@ -75,12 +78,16 @@ async def delete_document( 400: {"model": str, "description": "Bad request"}, 404: {"model": str, "description": "Document not found."}, 500: {"model": str, "description": "Internal server error"}, + 422: {"model": HTTPValidationError, "description": "Validation Error"}, }, tags=["admin"], + summary="Document Reference Id Get", response_model_by_alias=True, ) -async def document_reference_id_get( - identification: str = Path(..., description="Identifier of the pdf document."), +async def document_reference( + identification: Annotated[StrictStr, Field(description="Identifier of the document.")] = Path( + ..., description="Identifier of the document." + ), ) -> Response: """ Asynchronously retrieve a document reference by its identification. 
@@ -97,7 +104,7 @@ async def document_reference_id_get( """ if not BaseAdminApi.subclasses: raise HTTPException(status_code=500, detail="Not implemented") - return await BaseAdminApi.subclasses[0]().document_reference_id_get(identification) + return await BaseAdminApi.subclasses[0]().document_reference(identification) @router.get( @@ -107,6 +114,7 @@ async def document_reference_id_get( 500: {"description": "Internal server error"}, }, tags=["admin"], + summary="Get All Documents Status", response_model_by_alias=True, ) async def get_all_documents_status() -> List[DocumentStatus]: @@ -123,25 +131,48 @@ async def get_all_documents_status() -> List[DocumentStatus]: return await BaseAdminApi.subclasses[0]().get_all_documents_status() +@router.post( + "/upload_file", + responses={ + 200: {"model": object, "description": "ok"}, + 400: {"description": "Bad request"}, + 422: {"description": "Unprocessable Content"}, + 500: {"description": "Internal server error"}, + }, + tags=["admin"], + summary="Upload File", + response_model_by_alias=True, +) +async def upload_file( + file: UploadFile, + request: Request, +) -> object: + """Uploads user selected sources.""" + if not BaseAdminApi.subclasses: + raise HTTPException(status_code=500, detail="Not implemented") + return await BaseAdminApi.subclasses[0]().upload_file(file) + + @router.post( "/upload_source", responses={ 200: {"description": "ok"}, 400: {"description": "Bad request"}, - 422: {"description": "If no text has been extracted from the file."}, + 422: {"description": "Unprocessable Content"}, 500: {"description": "Internal server error"}, }, tags=["admin"], + summary="Upload Source", response_model_by_alias=True, ) async def upload_source( + request: Request, - type: Annotated[str, Form()], - name: Annotated[str, Form()], - file: Optional[UploadFile] = None, - kwargs: Optional[Annotated[List[KeyValuePair], Form()]] = None, + type: StrictStr = Query(None, description="", alias="type"), + name: StrictStr = Query(None, 
description="", alias="name"), + key_value_pair: List[KeyValuePair] = Body(None, description=""), ) -> None: """Uploads user selected sources.""" if not BaseAdminApi.subclasses: raise HTTPException(status_code=500, detail="Not implemented") - return await BaseAdminApi.subclasses[0]().upload_source(type, name, file, kwargs, request) + return await BaseAdminApi.subclasses[0]().upload_source(type, name, key_value_pair) diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py index 8aebb8b..8835113 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py @@ -3,11 +3,12 @@ from typing import ClassVar, Dict, List, Tuple # noqa: F401 from pydantic import Field, StrictBytes, StrictStr -from typing import Any, List, Optional, Tuple, Union +from typing import Any, List, Tuple, Union from typing_extensions import Annotated -from fastapi import Request, Response, UploadFile from admin_api_lib.models.document_status import DocumentStatus +from admin_api_lib.models.http_validation_error import HTTPValidationError from admin_api_lib.models.key_value_pair import KeyValuePair +from fastapi import Request, Response, UploadFile class BaseAdminApi: @@ -20,7 +21,7 @@ def __init_subclass__(cls, **kwargs): async def delete_document( self, identification: StrictStr, - ) -> None: + ) -> None: """ Asynchronously deletes a document based on the provided identification. @@ -34,9 +35,9 @@ async def delete_document( None """ - async def document_reference_id_get( + async def document_reference( self, - identification: Annotated[StrictStr, Field(description="Identifier of the pdf document.")], + identification: Annotated[StrictStr, Field(description="Identifier of the document.")], ) -> Response: """ Asynchronously retrieve a document reference by its identification. 
@@ -68,9 +69,8 @@ async def upload_source( self, type: StrictStr, name: StrictStr, - file: Optional[UploadFile], - kwargs: Optional[List[KeyValuePair]], + key_value_pair: List[KeyValuePair], request: Request, ) -> None: - """Uploads user selected sources.""" - ... + """Uploads user selected source.""" + diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py index d520293..196d8a7 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py @@ -92,7 +92,7 @@ async def _handle_source_upload( kwargs: Optional[list[KeyValuePair]], ): try: - if file: + if file: information_pieces = self._extractor_api.extract(type, source_name, str(file), kwargs) else: information_pieces = self._extractor_api.extract(type, source_name, None, kwargs) diff --git a/admin-api-lib/src/admin_api_lib/models/document_status.py b/admin-api-lib/src/admin_api_lib/models/document_status.py index fedce07..ff2f94a 100644 --- a/admin-api-lib/src/admin_api_lib/models/document_status.py +++ b/admin-api-lib/src/admin_api_lib/models/document_status.py @@ -3,7 +3,7 @@ """ admin-api-lib -The API is used for the communication between the admin frontend and the admin backend in the rag project. +The API is used for the communication between the admin frontend and the admin backend in the rag project. 
The version of the OpenAPI document: 1.0.0 Generated by OpenAPI Generator (https://openapi-generator.tech) @@ -29,7 +29,9 @@ class DocumentStatus(BaseModel): - """ """ # noqa: E501 + """ + DocumentStatus + """ # noqa: E501 name: StrictStr status: Status diff --git a/admin-api-lib/src/admin_api_lib/models/http_validation_error.py b/admin-api-lib/src/admin_api_lib/models/http_validation_error.py new file mode 100644 index 0000000..40f6013 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/models/http_validation_error.py @@ -0,0 +1,101 @@ +# coding: utf-8 + +""" +admin-api-lib + +The API is used for the communication between the admin frontend and the admin backend in the rag project. + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. +""" # noqa: E501 + + +from __future__ import annotations +import pprint +import re # noqa: F401 +import json + + +from pydantic import BaseModel, ConfigDict +from typing import Any, ClassVar, Dict, List, Optional +from admin_api_lib.models.validation_error import ValidationError + +try: + from typing import Self +except ImportError: + from typing_extensions import Self + + +class HTTPValidationError(BaseModel): + """ + HTTPValidationError + """ # noqa: E501 + + detail: Optional[List[ValidationError]] = None + __properties: ClassVar[List[str]] = ["detail"] + + model_config = { + "populate_by_name": True, + "validate_assignment": True, + "protected_namespaces": (), + } + + def to_str(self) -> str: + """Returns the string representation of the model using alias""" + return pprint.pformat(self.model_dump(by_alias=True)) + + def to_json(self) -> str: + """Returns the JSON representation of the model using alias""" + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) + + @classmethod + def from_json(cls, json_str: str) -> Self: + """Create an instance of HTTPValidationError from 
a JSON string""" + return cls.from_dict(json.loads(json_str)) + + def to_dict(self) -> Dict[str, Any]: + """Return the dictionary representation of the model using alias. + + This has the following differences from calling pydantic's + `self.model_dump(by_alias=True)`: + + * `None` is only added to the output dict for nullable fields that + were set at model initialization. Other fields with value `None` + are ignored. + """ + _dict = self.model_dump( + by_alias=True, + exclude={}, + exclude_none=True, + ) + # override the default output from pydantic by calling `to_dict()` of each item in detail (list) + _items = [] + if self.detail: + for _item in self.detail: + if _item: + _items.append(_item.to_dict()) + _dict["detail"] = _items + return _dict + + @classmethod + def from_dict(cls, obj: Dict) -> Self: + """Create an instance of HTTPValidationError from a dict""" + if obj is None: + return None + + if not isinstance(obj, dict): + return cls.model_validate(obj) + + _obj = cls.model_validate( + { + "detail": ( + [ValidationError.from_dict(_item) for _item in obj.get("detail")] + if obj.get("detail") is not None + else None + ) + } + ) + return _obj diff --git a/admin-api-lib/src/admin_api_lib/models/key_value_pair.py b/admin-api-lib/src/admin_api_lib/models/key_value_pair.py index 2d2fe5e..82c0c37 100644 --- a/admin-api-lib/src/admin_api_lib/models/key_value_pair.py +++ b/admin-api-lib/src/admin_api_lib/models/key_value_pair.py @@ -3,7 +3,7 @@ """ admin-api-lib -The API is used for the communication between the admin frontend and the admin backend in the rag project. +The API is used for the communication between the admin frontend and the admin backend in the rag project. 
The version of the OpenAPI document: 1.0.0 Generated by OpenAPI Generator (https://openapi-generator.tech) @@ -18,8 +18,8 @@ import json -from pydantic import BaseModel, ConfigDict -from typing import Any, ClassVar, Dict, List, Optional +from pydantic import BaseModel, ConfigDict, StrictStr +from typing import Any, ClassVar, Dict, List try: from typing import Self @@ -28,10 +28,12 @@ class KeyValuePair(BaseModel): - """ """ # noqa: E501 + """ + KeyValuePair + """ # noqa: E501 - key: Optional[Any] = None - value: Optional[Any] = None + key: StrictStr + value: StrictStr __properties: ClassVar[List[str]] = ["key", "value"] model_config = { @@ -69,16 +71,6 @@ def to_dict(self) -> Dict[str, Any]: exclude={}, exclude_none=True, ) - # set to None if key (nullable) is None - # and model_fields_set contains the field - if self.key is None and "key" in self.model_fields_set: - _dict["key"] = None - - # set to None if value (nullable) is None - # and model_fields_set contains the field - if self.value is None and "value" in self.model_fields_set: - _dict["value"] = None - return _dict @classmethod diff --git a/admin-api-lib/src/admin_api_lib/models/status.py b/admin-api-lib/src/admin_api_lib/models/status.py index 5c7836f..e4ac64b 100644 --- a/admin-api-lib/src/admin_api_lib/models/status.py +++ b/admin-api-lib/src/admin_api_lib/models/status.py @@ -3,7 +3,7 @@ """ admin-api-lib -The API is used for the communication between the admin frontend and the admin backend in the rag project. +The API is used for the communication between the admin frontend and the admin backend in the rag project. 
The version of the OpenAPI document: 1.0.0 Generated by OpenAPI Generator (https://openapi-generator.tech) @@ -26,8 +26,6 @@ class Status(str, Enum): - """ """ - """ allowed enum values """ diff --git a/admin-api-lib/src/admin_api_lib/models/upload_source.py b/admin-api-lib/src/admin_api_lib/models/upload_source.py index e90690f..1d86e38 100644 --- a/admin-api-lib/src/admin_api_lib/models/upload_source.py +++ b/admin-api-lib/src/admin_api_lib/models/upload_source.py @@ -17,7 +17,7 @@ import re # noqa: F401 import json - +from fastapi import UploadFile from pydantic import BaseModel, ConfigDict, StrictBytes, StrictStr from typing import Any, ClassVar, Dict, List, Optional, Tuple, Union from admin_api_lib.models.key_value_pair import KeyValuePair @@ -31,10 +31,10 @@ class UploadSource(BaseModel): """ """ # noqa: E501 - file: Optional[Union[StrictBytes, StrictStr, Tuple[StrictStr, StrictBytes]]] = None type: StrictStr + name: StrictStr kwargs: Optional[List[KeyValuePair]] = None - __properties: ClassVar[List[str]] = ["file", "type", "kwargs"] + __properties: ClassVar[List[str]] = ["type", "name", "kwargs"] model_config = { "populate_by_name": True, @@ -91,7 +91,7 @@ def from_dict(cls, obj: Dict) -> Self: _obj = cls.model_validate( { - "file": obj.get("file"), + "name": obj.get("name"), "type": obj.get("type"), "kwargs": ( [KeyValuePair.from_dict(_item) for _item in obj.get("kwargs")] diff --git a/admin-api-lib/src/admin_api_lib/models/validation_error.py b/admin-api-lib/src/admin_api_lib/models/validation_error.py new file mode 100644 index 0000000..f922b21 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/models/validation_error.py @@ -0,0 +1,105 @@ +# coding: utf-8 + +""" +admin-api-lib + +The API is used for the communication between the admin frontend and the admin backend in the rag project. + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. 
+""" # noqa: E501 + + +from __future__ import annotations +import pprint +import re # noqa: F401 +import json + + +from pydantic import BaseModel, ConfigDict, StrictStr +from typing import Any, ClassVar, Dict, List +from admin_api_lib.models.validation_error_loc_inner import ValidationErrorLocInner + +try: + from typing import Self +except ImportError: + from typing_extensions import Self + + +class ValidationError(BaseModel): + """ + ValidationError + """ # noqa: E501 + + loc: List[ValidationErrorLocInner] + msg: StrictStr + type: StrictStr + __properties: ClassVar[List[str]] = ["loc", "msg", "type"] + + model_config = { + "populate_by_name": True, + "validate_assignment": True, + "protected_namespaces": (), + } + + def to_str(self) -> str: + """Returns the string representation of the model using alias""" + return pprint.pformat(self.model_dump(by_alias=True)) + + def to_json(self) -> str: + """Returns the JSON representation of the model using alias""" + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) + + @classmethod + def from_json(cls, json_str: str) -> Self: + """Create an instance of ValidationError from a JSON string""" + return cls.from_dict(json.loads(json_str)) + + def to_dict(self) -> Dict[str, Any]: + """Return the dictionary representation of the model using alias. + + This has the following differences from calling pydantic's + `self.model_dump(by_alias=True)`: + + * `None` is only added to the output dict for nullable fields that + were set at model initialization. Other fields with value `None` + are ignored. 
+ """ + _dict = self.model_dump( + by_alias=True, + exclude={}, + exclude_none=True, + ) + # override the default output from pydantic by calling `to_dict()` of each item in loc (list) + _items = [] + if self.loc: + for _item in self.loc: + if _item: + _items.append(_item.to_dict()) + _dict["loc"] = _items + return _dict + + @classmethod + def from_dict(cls, obj: Dict) -> Self: + """Create an instance of ValidationError from a dict""" + if obj is None: + return None + + if not isinstance(obj, dict): + return cls.model_validate(obj) + + _obj = cls.model_validate( + { + "loc": ( + [ValidationErrorLocInner.from_dict(_item) for _item in obj.get("loc")] + if obj.get("loc") is not None + else None + ), + "msg": obj.get("msg"), + "type": obj.get("type"), + } + ) + return _obj diff --git a/admin-api-lib/src/admin_api_lib/models/validation_error_loc_inner.py b/admin-api-lib/src/admin_api_lib/models/validation_error_loc_inner.py new file mode 100644 index 0000000..8cd53fe --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/models/validation_error_loc_inner.py @@ -0,0 +1,153 @@ +# coding: utf-8 + +""" +admin-api-lib + +The API is used for the communication between the admin frontend and the admin backend in the rag project. + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. 
+""" # noqa: E501 + + +from __future__ import annotations +from inspect import getfullargspec +import json +import pprint +import re # noqa: F401 + + +from pydantic import BaseModel, ConfigDict, Field, StrictInt, StrictStr, ValidationError, field_validator +from typing import Optional +from typing import Union, Any, List, TYPE_CHECKING, Optional, Dict +from typing_extensions import Literal +from pydantic import StrictStr, Field + +try: + from typing import Self +except ImportError: + from typing_extensions import Self + +VALIDATIONERRORLOCINNER_ANY_OF_SCHEMAS = ["int", "str"] + + +class ValidationErrorLocInner(BaseModel): + """ + ValidationErrorLocInner + """ + + # data type: str + anyof_schema_1_validator: Optional[StrictStr] = None + # data type: int + anyof_schema_2_validator: Optional[StrictInt] = None + if TYPE_CHECKING: + actual_instance: Optional[Union[int, str]] = None + else: + actual_instance: Any = None + any_of_schemas: List[str] = Literal[VALIDATIONERRORLOCINNER_ANY_OF_SCHEMAS] + + model_config = { + "validate_assignment": True, + "protected_namespaces": (), + } + + def __init__(self, *args, **kwargs) -> None: + if args: + if len(args) > 1: + raise ValueError("If a position argument is used, only 1 is allowed to set `actual_instance`") + if kwargs: + raise ValueError("If a position argument is used, keyword arguments cannot be used.") + super().__init__(actual_instance=args[0]) + else: + super().__init__(**kwargs) + + @field_validator("actual_instance") + def actual_instance_must_validate_anyof(cls, v): + instance = ValidationErrorLocInner.model_construct() + error_messages = [] + # validate data type: str + try: + instance.anyof_schema_1_validator = v + return v + except (ValidationError, ValueError) as e: + error_messages.append(str(e)) + # validate data type: int + try: + instance.anyof_schema_2_validator = v + return v + except (ValidationError, ValueError) as e: + error_messages.append(str(e)) + if error_messages: + # no match + raise ValueError( 
+ "No match found when setting the actual_instance in ValidationErrorLocInner with anyOf schemas: int, str. Details: " + + ", ".join(error_messages) + ) + else: + return v + + @classmethod + def from_dict(cls, obj: dict) -> Self: + return cls.from_json(json.dumps(obj)) + + @classmethod + def from_json(cls, json_str: str) -> Self: + """Returns the object represented by the json string""" + instance = cls.model_construct() + error_messages = [] + # deserialize data into str + try: + # validation + instance.anyof_schema_1_validator = json.loads(json_str) + # assign value to actual_instance + instance.actual_instance = instance.anyof_schema_1_validator + return instance + except (ValidationError, ValueError) as e: + error_messages.append(str(e)) + # deserialize data into int + try: + # validation + instance.anyof_schema_2_validator = json.loads(json_str) + # assign value to actual_instance + instance.actual_instance = instance.anyof_schema_2_validator + return instance + except (ValidationError, ValueError) as e: + error_messages.append(str(e)) + + if error_messages: + # no match + raise ValueError( + "No match found when deserializing the JSON string into ValidationErrorLocInner with anyOf schemas: int, str. 
Details: " + + ", ".join(error_messages) + ) + else: + return instance + + def to_json(self) -> str: + """Returns the JSON representation of the actual instance""" + if self.actual_instance is None: + return "null" + + to_json = getattr(self.actual_instance, "to_json", None) + if callable(to_json): + return self.actual_instance.to_json() + else: + return json.dumps(self.actual_instance) + + def to_dict(self) -> Dict: + """Returns the dict representation of the actual instance""" + if self.actual_instance is None: + return "null" + + to_json = getattr(self.actual_instance, "to_json", None) + if callable(to_json): + return self.actual_instance.to_dict() + else: + # primitive type + return self.actual_instance + + def to_str(self) -> str: + """Returns the string representation of the actual instance""" + return pprint.pformat(self.model_dump()) From cf8b892d4010d075d818e5ead78854585dc06085 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Thu, 15 May 2025 13:22:53 +0200 Subject: [PATCH 07/43] wip --- admin-api-lib/openapi.yaml | 597 ++++++++++++++++++++++++++----------- 1 file changed, 428 insertions(+), 169 deletions(-) diff --git a/admin-api-lib/openapi.yaml b/admin-api-lib/openapi.yaml index efbb2f6..0c1d883 100644 --- a/admin-api-lib/openapi.yaml +++ b/admin-api-lib/openapi.yaml @@ -1,169 +1,428 @@ -openapi: 3.0.2 -info: - description: The API is used for the communication between the admin frontend and - the admin backend in the rag project. - title: admin-api-lib - version: 1.0.0 -servers: -- url: / -paths: - /document_reference/{identification}: - get: - operationId: document_reference_id_get - parameters: - - description: Identifier of the pdf document. - explode: false - in: path - name: identification - required: true - schema: - type: string - style: simple - responses: - "200": - content: - application/pdf: - schema: - format: binary - type: string - description: Returns the pdf in binary form. 
- "400": - content: - application/json: - schema: - type: string - description: Bad request - "404": - content: - application/json: - schema: - type: string - description: Document not found. - "500": - content: - application/json: - schema: - type: string - description: Internal server error - tags: - - admin - /delete_document/{identification}: - delete: - operationId: delete_document - parameters: - - explode: false - in: path - name: identification - required: true - schema: - type: string - style: simple - responses: - "200": - description: Deleted - "500": - description: Internal server error - tags: - - admin - /all_documents_status: - get: - operationId: get_all_documents_status - responses: - "200": - content: - application/json: - schema: - items: - $ref: '#/components/schemas/document_status' - type: array - description: List of document links - "500": - description: Internal server error - tags: - - admin - /upload_source: - post: - description: Uploads user selected sources. - operationId: upload_source - requestBody: - content: - multipart/form-data: - schema: - $ref: '#/components/schemas/upload_source' - description: The source to upload. - required: true - responses: - "200": - description: ok - "400": - description: Bad request - "422": - description: If no text has been extracted from the file. 
- "500": - description: Internal server error - tags: - - admin -components: - schemas: - status: - description: "" - enum: - - UPLOADING - - PROCESSING - - READY - - ERROR - title: status - type: string - document_status: - description: "" - example: - name: name - status: UPLOADING - properties: - name: - description: "" - title: name - type: string - status: - $ref: '#/components/schemas/status' - required: - - name - - status - title: document_status - type: object - upload_source: - description: "" - properties: - file: - description: "" - format: binary - type: string - type: - description: "" - type: string - kwargs: - description: "" - items: - $ref: '#/components/schemas/key_value_pair' - type: array - name: - description: "" - type: string - required: - - name - - type - type: object - key_value_pair: - description: "" - example: - value: value - key: key - properties: - key: - description: "" - title: Key - value: - description: "" - title: Value - title: MetaInformationPiece - type: object +{ + "openapi": "3.1.0", + "info": { + "title": "admin-api-lib", + "description": "The API is used for the communication between the admin frontend and the admin backend in the rag project.", + "version": "1.0.0" + }, + "servers": [ + { + "url": "/api" + } + ], + "paths": { + "/delete_document/{identification}": { + "delete": { + "tags": [ + "admin" + ], + "summary": "Delete Document", + "description": "Asynchronously deletes a document based on the provided identification.\n\nParameters\n----------\nidentification : str\n The unique identifier of the document to be deleted.\n\nReturns\n-------\nNone", + "operationId": "delete_document", + "parameters": [ + { + "name": "identification", + "in": "path", + "required": true, + "schema": { + "type": "string", + "description": "", + "title": "Identification" + } + } + ], + "responses": { + "200": { + "description": "Deleted", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": 
"Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + }, + "500": { + "description": "Internal server error" + } + } + } + }, + "/document_reference/{identification}": { + "get": { + "tags": [ + "admin" + ], + "summary": "Document Reference Id Get", + "description": "Asynchronously retrieve a document reference by its identification.\n\nParameters\n----------\nidentification : str\n The unique identifier for the document reference.\n\nReturns\n-------\nResponse\n The response object containing the document reference details.", + "operationId": "document_reference", + "parameters": [ + { + "name": "identification", + "in": "path", + "required": true, + "schema": { + "type": "string", + "description": "Identifier of the document.", + "title": "Identification" + }, + "description": "Identifier of the document." + } + ], + "responses": { + "200": { + "description": "Returns the pdf in binary form.", + "content": { + "application/json": { + "schema": { + "type": "string", + "format": "binary", + "title": "Response 200 Document Reference Document Reference Identification Get" + } + } + } + }, + "400": { + "description": "Bad request", + "content": { + "application/json": { + "schema": { + "type": "string", + "title": "Response 400 Document Reference Document Reference Identification Get" + } + } + } + }, + "404": { + "description": "Document not found.", + "content": { + "application/json": { + "schema": { + "type": "string", + "title": "Response 404 Document Reference Document Reference Identification Get" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + }, + "500": { + "description": "Internal server error", + "content": { + "application/json": { + "schema": { + "type": "string", + "title": "Response 500 Document Reference Document Reference 
Identification Get" + } + } + } + } + } + } + }, + "/all_documents_status": { + "get": { + "tags": [ + "admin" + ], + "summary": "Get All Documents Status", + "description": "Asynchronously retrieves the status of all documents.\n\nReturns\n-------\nlist[DocumentStatus]\n A list containing the status of all documents.", + "operationId": "get_all_documents_status", + "responses": { + "200": { + "description": "List of document links", + "content": { + "application/json": { + "schema": { + "items": { + "$ref": "#/components/schemas/DocumentStatus" + }, + "type": "array", + "title": "Response 200 Get All Documents Status All Documents Status Get" + } + } + } + }, + "500": { + "description": "Internal server error" + } + } + } + }, + "/upload_file": { + "post": { + "tags": [ + "admin" + ], + "summary": "Upload File", + "description": "Uploads user selected sources.", + "operationId": "upload_file", + "requestBody": { + "content": { + "multipart/form-data": { + "schema": { + "$ref": "#/components/schemas/Body_upload_file_upload_file_post" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "ok", + "content": { + "application/json": { + "schema": { + "title": "Response 200 Upload File Upload File Post" + } + } + } + }, + "400": { + "description": "Bad request" + }, + "422": { + "description": "Unprocessable Content" + }, + "500": { + "description": "Internal server error" + } + } + } + }, + "/upload_source": { + "post": { + "tags": [ + "admin" + ], + "summary": "Upload Source", + "description": "Uploads user selected sources.", + "operationId": "upload_source", + "parameters": [ + { + "name": "type", + "in": "query", + "required": false, + "schema": { + "type": "string", + "description": "", + "title": "Type" + } + }, + { + "name": "name", + "in": "query", + "required": false, + "schema": { + "type": "string", + "description": "", + "title": "Name" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "type": 
"array", + "items": { + "$ref": "#/components/schemas/KeyValuePair" + }, + "description": "", + "title": "Key Value Pair" + } + } + } + }, + "responses": { + "200": { + "description": "ok", + "content": { + "application/json": { + "schema": {} + } + } + }, + "400": { + "description": "Bad request" + }, + "422": { + "description": "Unprocessable Content" + }, + "500": { + "description": "Internal server error" + } + } + } + } + }, + "components": { + "schemas": { + "Body_upload_file_upload_file_post": { + "properties": { + "file": { + "type": "string", + "format": "binary", + "title": "File" + } + }, + "type": "object", + "required": [ + "file" + ], + "title": "Body_upload_file_upload_file_post" + }, + "DocumentStatus": { + "properties": { + "name": { + "type": "string", + "title": "Name" + }, + "status": { + "$ref": "#/components/schemas/Status" + } + }, + "type": "object", + "required": [ + "name", + "status" + ], + "title": "DocumentStatus", + "description": "DocumentStatus" + }, + "HTTPValidationError": { + "properties": { + "detail": { + "anyOf": [ + { + "items": { + "$ref": "#/components/schemas/ValidationError" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Detail" + } + }, + "type": "object", + "title": "HTTPValidationError", + "description": "HTTPValidationError" + }, + "KeyValuePair": { + "properties": { + "key": { + "type": "string", + "title": "Key" + }, + "value": { + "type": "string", + "title": "Value" + } + }, + "type": "object", + "required": [ + "key", + "value" + ], + "title": "KeyValuePair", + "description": "KeyValuePair" + }, + "Status": { + "type": "string", + "enum": [ + "UPLOADING", + "PROCESSING", + "READY", + "ERROR" + ], + "title": "Status", + "description": "allowed enum values" + }, + "ValidationError": { + "properties": { + "loc": { + "items": { + "$ref": "#/components/schemas/ValidationErrorLocInner" + }, + "type": "array", + "title": "Loc" + }, + "msg": { + "type": "string", + "title": "Msg" + }, + "type": { + 
"type": "string", + "title": "Type" + } + }, + "type": "object", + "required": [ + "loc", + "msg", + "type" + ], + "title": "ValidationError", + "description": "ValidationError" + }, + "ValidationErrorLocInner": { + "properties": { + "anyof_schema_1_validator": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Anyof Schema 1 Validator" + }, + "anyof_schema_2_validator": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Anyof Schema 2 Validator" + }, + "actual_instance": { + "title": "Actual Instance" + }, + "any_of_schemas": { + "items": { + "type": "string" + }, + "type": "array", + "title": "Any Of Schemas" + } + }, + "type": "object", + "title": "ValidationErrorLocInner", + "description": "ValidationErrorLocInner" + } + } + } +} \ No newline at end of file From 8ee912cf0d9c4d3851e5e0bec478259460167954 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Thu, 15 May 2025 13:26:23 +0200 Subject: [PATCH 08/43] wip --- admin-api-lib/openapi.yaml | 792 ++++++++---------- .../src/admin_api_lib/apis/admin_api_base.py | 7 + .../models/http_validation_error.py | 5 + .../src/admin_api_lib/models/status.py | 4 + .../src/admin_api_lib/models/upload_source.py | 103 --- .../models/validation_error_loc_inner.py | 182 ++-- 6 files changed, 452 insertions(+), 641 deletions(-) delete mode 100644 admin-api-lib/src/admin_api_lib/models/upload_source.py diff --git a/admin-api-lib/openapi.yaml b/admin-api-lib/openapi.yaml index 0c1d883..86d433a 100644 --- a/admin-api-lib/openapi.yaml +++ b/admin-api-lib/openapi.yaml @@ -1,428 +1,364 @@ -{ - "openapi": "3.1.0", - "info": { - "title": "admin-api-lib", - "description": "The API is used for the communication between the admin frontend and the admin backend in the rag project.", - "version": "1.0.0" - }, - "servers": [ - { - "url": "/api" - } - ], - "paths": { - "/delete_document/{identification}": { - "delete": { - "tags": [ - "admin" - ], - "summary": "Delete Document", 
- "description": "Asynchronously deletes a document based on the provided identification.\n\nParameters\n----------\nidentification : str\n The unique identifier of the document to be deleted.\n\nReturns\n-------\nNone", - "operationId": "delete_document", - "parameters": [ - { - "name": "identification", - "in": "path", - "required": true, - "schema": { - "type": "string", - "description": "", - "title": "Identification" - } - } - ], - "responses": { - "200": { - "description": "Deleted", - "content": { - "application/json": { - "schema": {} - } - } - }, - "422": { - "description": "Validation Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/HTTPValidationError" - } - } - } - }, - "500": { - "description": "Internal server error" - } - } - } - }, - "/document_reference/{identification}": { - "get": { - "tags": [ - "admin" - ], - "summary": "Document Reference Id Get", - "description": "Asynchronously retrieve a document reference by its identification.\n\nParameters\n----------\nidentification : str\n The unique identifier for the document reference.\n\nReturns\n-------\nResponse\n The response object containing the document reference details.", - "operationId": "document_reference", - "parameters": [ - { - "name": "identification", - "in": "path", - "required": true, - "schema": { - "type": "string", - "description": "Identifier of the document.", - "title": "Identification" - }, - "description": "Identifier of the document." 
- } - ], - "responses": { - "200": { - "description": "Returns the pdf in binary form.", - "content": { - "application/json": { - "schema": { - "type": "string", - "format": "binary", - "title": "Response 200 Document Reference Document Reference Identification Get" - } - } - } - }, - "400": { - "description": "Bad request", - "content": { - "application/json": { - "schema": { - "type": "string", - "title": "Response 400 Document Reference Document Reference Identification Get" - } - } - } - }, - "404": { - "description": "Document not found.", - "content": { - "application/json": { - "schema": { - "type": "string", - "title": "Response 404 Document Reference Document Reference Identification Get" - } - } - } - }, - "422": { - "description": "Validation Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/HTTPValidationError" - } - } - } - }, - "500": { - "description": "Internal server error", - "content": { - "application/json": { - "schema": { - "type": "string", - "title": "Response 500 Document Reference Document Reference Identification Get" - } - } - } - } - } - } - }, - "/all_documents_status": { - "get": { - "tags": [ - "admin" - ], - "summary": "Get All Documents Status", - "description": "Asynchronously retrieves the status of all documents.\n\nReturns\n-------\nlist[DocumentStatus]\n A list containing the status of all documents.", - "operationId": "get_all_documents_status", - "responses": { - "200": { - "description": "List of document links", - "content": { - "application/json": { - "schema": { - "items": { - "$ref": "#/components/schemas/DocumentStatus" - }, - "type": "array", - "title": "Response 200 Get All Documents Status All Documents Status Get" - } - } - } - }, - "500": { - "description": "Internal server error" - } - } - } - }, - "/upload_file": { - "post": { - "tags": [ - "admin" - ], - "summary": "Upload File", - "description": "Uploads user selected sources.", - "operationId": "upload_file", - 
"requestBody": { - "content": { - "multipart/form-data": { - "schema": { - "$ref": "#/components/schemas/Body_upload_file_upload_file_post" - } - } - }, - "required": true - }, - "responses": { - "200": { - "description": "ok", - "content": { - "application/json": { - "schema": { - "title": "Response 200 Upload File Upload File Post" - } - } - } - }, - "400": { - "description": "Bad request" - }, - "422": { - "description": "Unprocessable Content" - }, - "500": { - "description": "Internal server error" - } - } - } - }, - "/upload_source": { - "post": { - "tags": [ - "admin" - ], - "summary": "Upload Source", - "description": "Uploads user selected sources.", - "operationId": "upload_source", - "parameters": [ - { - "name": "type", - "in": "query", - "required": false, - "schema": { - "type": "string", - "description": "", - "title": "Type" - } - }, - { - "name": "name", - "in": "query", - "required": false, - "schema": { - "type": "string", - "description": "", - "title": "Name" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "type": "array", - "items": { - "$ref": "#/components/schemas/KeyValuePair" - }, - "description": "", - "title": "Key Value Pair" - } - } - } - }, - "responses": { - "200": { - "description": "ok", - "content": { - "application/json": { - "schema": {} - } - } - }, - "400": { - "description": "Bad request" - }, - "422": { - "description": "Unprocessable Content" - }, - "500": { - "description": "Internal server error" - } - } - } - } - }, - "components": { - "schemas": { - "Body_upload_file_upload_file_post": { - "properties": { - "file": { - "type": "string", - "format": "binary", - "title": "File" - } - }, - "type": "object", - "required": [ - "file" - ], - "title": "Body_upload_file_upload_file_post" - }, - "DocumentStatus": { - "properties": { - "name": { - "type": "string", - "title": "Name" - }, - "status": { - "$ref": "#/components/schemas/Status" - } - }, - "type": "object", - "required": [ - 
"name", - "status" - ], - "title": "DocumentStatus", - "description": "DocumentStatus" - }, - "HTTPValidationError": { - "properties": { - "detail": { - "anyOf": [ - { - "items": { - "$ref": "#/components/schemas/ValidationError" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "title": "Detail" - } - }, - "type": "object", - "title": "HTTPValidationError", - "description": "HTTPValidationError" - }, - "KeyValuePair": { - "properties": { - "key": { - "type": "string", - "title": "Key" - }, - "value": { - "type": "string", - "title": "Value" - } - }, - "type": "object", - "required": [ - "key", - "value" - ], - "title": "KeyValuePair", - "description": "KeyValuePair" - }, - "Status": { - "type": "string", - "enum": [ - "UPLOADING", - "PROCESSING", - "READY", - "ERROR" - ], - "title": "Status", - "description": "allowed enum values" - }, - "ValidationError": { - "properties": { - "loc": { - "items": { - "$ref": "#/components/schemas/ValidationErrorLocInner" - }, - "type": "array", - "title": "Loc" - }, - "msg": { - "type": "string", - "title": "Msg" - }, - "type": { - "type": "string", - "title": "Type" - } - }, - "type": "object", - "required": [ - "loc", - "msg", - "type" - ], - "title": "ValidationError", - "description": "ValidationError" - }, - "ValidationErrorLocInner": { - "properties": { - "anyof_schema_1_validator": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Anyof Schema 1 Validator" - }, - "anyof_schema_2_validator": { - "anyOf": [ - { - "type": "integer" - }, - { - "type": "null" - } - ], - "title": "Anyof Schema 2 Validator" - }, - "actual_instance": { - "title": "Actual Instance" - }, - "any_of_schemas": { - "items": { - "type": "string" - }, - "type": "array", - "title": "Any Of Schemas" - } - }, - "type": "object", - "title": "ValidationErrorLocInner", - "description": "ValidationErrorLocInner" - } - } - } -} \ No newline at end of file +openapi: 3.1.0 +info: + description: The API is used for the 
communication between the admin frontend + and the admin backend in the rag project. + title: admin-api-lib + version: 1.0.0 +servers: +- url: /api +paths: + /delete_document/{identification}: + delete: + description: |- + Asynchronously deletes a document based on the provided identification. + + Parameters + ---------- + identification : str + The unique identifier of the document to be deleted. + + Returns + ------- + None + operationId: delete_document + parameters: + - explode: false + in: path + name: identification + required: true + schema: + description: "" + title: Identification + type: string + style: simple + responses: + "200": + content: + application/json: + schema: {} + description: Deleted + "422": + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + description: Validation Error + "500": + description: Internal server error + summary: Delete Document + tags: + - admin + /document_reference/{identification}: + get: + description: |- + Asynchronously retrieve a document reference by its identification. + + Parameters + ---------- + identification : str + The unique identifier for the document reference. + + Returns + ------- + Response + The response object containing the document reference details. + operationId: document_reference + parameters: + - description: Identifier of the document. + explode: false + in: path + name: identification + required: true + schema: + description: Identifier of the document. + title: Identification + type: string + style: simple + responses: + "200": + content: + application/json: + schema: + format: binary + title: Response 200 Document Reference Document Reference Identification Get + type: string + description: Returns the pdf in binary form. 
+ "400": + content: + application/json: + schema: + title: Response 400 Document Reference Document Reference Identification Get + type: string + description: Bad request + "404": + content: + application/json: + schema: + title: Response 404 Document Reference Document Reference Identification Get + type: string + description: Document not found. + "422": + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + description: Validation Error + "500": + content: + application/json: + schema: + title: Response 500 Document Reference Document Reference Identification Get + type: string + description: Internal server error + summary: Document Reference Id Get + tags: + - admin + /all_documents_status: + get: + description: |- + Asynchronously retrieves the status of all documents. + + Returns + ------- + list[DocumentStatus] + A list containing the status of all documents. + operationId: get_all_documents_status + responses: + "200": + content: + application/json: + schema: + items: + $ref: '#/components/schemas/DocumentStatus' + type: array + description: List of document links + "500": + description: Internal server error + summary: Get All Documents Status + tags: + - admin + /upload_file: + post: + description: Uploads user selected sources. + operationId: upload_file + requestBody: + content: + multipart/form-data: + schema: + $ref: '#/components/schemas/Body_upload_file_upload_file_post' + required: true + responses: + "200": + content: + application/json: + schema: {} + description: ok + "400": + description: Bad request + "422": + description: Unprocessable Content + "500": + description: Internal server error + summary: Upload File + tags: + - admin + /upload_source: + post: + description: Uploads user selected sources. 
+ operationId: upload_source + parameters: + - explode: true + in: query + name: type + required: false + schema: + description: "" + title: Type + type: string + style: form + - explode: true + in: query + name: name + required: false + schema: + description: "" + title: Name + type: string + style: form + requestBody: + content: + application/json: + schema: + description: "" + items: + $ref: '#/components/schemas/KeyValuePair' + type: array + responses: + "200": + content: + application/json: + schema: {} + description: ok + "400": + description: Bad request + "422": + description: Unprocessable Content + "500": + description: Internal server error + summary: Upload Source + tags: + - admin +components: + schemas: + Body_upload_file_upload_file_post: + properties: + file: + format: binary + title: File + type: string + required: + - file + title: Body_upload_file_upload_file_post + DocumentStatus: + description: DocumentStatus + example: + name: name + status: UPLOADING + properties: + name: + title: Name + type: string + status: + $ref: '#/components/schemas/Status' + required: + - name + - status + title: DocumentStatus + HTTPValidationError: + description: HTTPValidationError + example: + detail: + - msg: msg + loc: + - anyof_schema_1_validator: anyof_schema_1_validator + actual_instance: "" + any_of_schemas: + - any_of_schemas + - any_of_schemas + anyof_schema_2_validator: 0 + - anyof_schema_1_validator: anyof_schema_1_validator + actual_instance: "" + any_of_schemas: + - any_of_schemas + - any_of_schemas + anyof_schema_2_validator: 0 + type: type + - msg: msg + loc: + - anyof_schema_1_validator: anyof_schema_1_validator + actual_instance: "" + any_of_schemas: + - any_of_schemas + - any_of_schemas + anyof_schema_2_validator: 0 + - anyof_schema_1_validator: anyof_schema_1_validator + actual_instance: "" + any_of_schemas: + - any_of_schemas + - any_of_schemas + anyof_schema_2_validator: 0 + type: type + properties: + detail: + items: + $ref: 
'#/components/schemas/ValidationError' + nullable: true + title: detail + type: array + title: HTTPValidationError + KeyValuePair: + description: KeyValuePair + example: + value: value + key: key + properties: + key: + title: Key + type: string + value: + title: Value + type: string + required: + - key + - value + title: KeyValuePair + Status: + description: allowed enum values + enum: + - UPLOADING + - PROCESSING + - READY + - ERROR + title: Status + type: string + ValidationError: + description: ValidationError + example: + msg: msg + loc: + - anyof_schema_1_validator: anyof_schema_1_validator + actual_instance: "" + any_of_schemas: + - any_of_schemas + - any_of_schemas + anyof_schema_2_validator: 0 + - anyof_schema_1_validator: anyof_schema_1_validator + actual_instance: "" + any_of_schemas: + - any_of_schemas + - any_of_schemas + anyof_schema_2_validator: 0 + type: type + properties: + loc: + items: + $ref: '#/components/schemas/ValidationErrorLocInner' + title: loc + type: array + msg: + title: Msg + type: string + type: + title: Type + type: string + required: + - loc + - msg + - type + title: ValidationError + ValidationErrorLocInner: + description: ValidationErrorLocInner + example: + anyof_schema_1_validator: anyof_schema_1_validator + actual_instance: "" + any_of_schemas: + - any_of_schemas + - any_of_schemas + anyof_schema_2_validator: 0 + properties: + anyof_schema_1_validator: + nullable: true + title: anyof_schema_1_validator + type: string + anyof_schema_2_validator: + nullable: true + title: anyof_schema_2_validator + type: integer + actual_instance: + title: actual_instance + any_of_schemas: + items: + type: string + title: any_of_schemas + type: array + title: ValidationErrorLocInner diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py index 8835113..df6b1a3 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py +++ 
b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py @@ -74,3 +74,10 @@ async def upload_source( ) -> None: """Uploads user selected source.""" + async def upload_file( + self, + file: UploadFile, + request: Request, + ) -> object: + """Uploads user selected file.""" + ... diff --git a/admin-api-lib/src/admin_api_lib/models/http_validation_error.py b/admin-api-lib/src/admin_api_lib/models/http_validation_error.py index 40f6013..7e288e1 100644 --- a/admin-api-lib/src/admin_api_lib/models/http_validation_error.py +++ b/admin-api-lib/src/admin_api_lib/models/http_validation_error.py @@ -78,6 +78,11 @@ def to_dict(self) -> Dict[str, Any]: if _item: _items.append(_item.to_dict()) _dict["detail"] = _items + # set to None if detail (nullable) is None + # and model_fields_set contains the field + if self.detail is None and "detail" in self.model_fields_set: + _dict["detail"] = None + return _dict @classmethod diff --git a/admin-api-lib/src/admin_api_lib/models/status.py b/admin-api-lib/src/admin_api_lib/models/status.py index e4ac64b..0ab750b 100644 --- a/admin-api-lib/src/admin_api_lib/models/status.py +++ b/admin-api-lib/src/admin_api_lib/models/status.py @@ -26,6 +26,10 @@ class Status(str, Enum): + """ + allowed enum values + """ + """ allowed enum values """ diff --git a/admin-api-lib/src/admin_api_lib/models/upload_source.py b/admin-api-lib/src/admin_api_lib/models/upload_source.py deleted file mode 100644 index 1d86e38..0000000 --- a/admin-api-lib/src/admin_api_lib/models/upload_source.py +++ /dev/null @@ -1,103 +0,0 @@ -# coding: utf-8 - -""" -admin-api-lib - -The API is used for the communication between the admin frontend and the admin backend in the rag project. - -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) - -Do not edit the class manually. 
-""" # noqa: E501 - - -from __future__ import annotations -import pprint -import re # noqa: F401 -import json - -from fastapi import UploadFile -from pydantic import BaseModel, ConfigDict, StrictBytes, StrictStr -from typing import Any, ClassVar, Dict, List, Optional, Tuple, Union -from admin_api_lib.models.key_value_pair import KeyValuePair - -try: - from typing import Self -except ImportError: - from typing_extensions import Self - - -class UploadSource(BaseModel): - """ """ # noqa: E501 - - type: StrictStr - name: StrictStr - kwargs: Optional[List[KeyValuePair]] = None - __properties: ClassVar[List[str]] = ["type", "name", "kwargs"] - - model_config = { - "populate_by_name": True, - "validate_assignment": True, - "protected_namespaces": (), - } - - def to_str(self) -> str: - """Returns the string representation of the model using alias""" - return pprint.pformat(self.model_dump(by_alias=True)) - - def to_json(self) -> str: - """Returns the JSON representation of the model using alias""" - # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead - return json.dumps(self.to_dict()) - - @classmethod - def from_json(cls, json_str: str) -> Self: - """Create an instance of UploadSource from a JSON string""" - return cls.from_dict(json.loads(json_str)) - - def to_dict(self) -> Dict[str, Any]: - """Return the dictionary representation of the model using alias. - - This has the following differences from calling pydantic's - `self.model_dump(by_alias=True)`: - - * `None` is only added to the output dict for nullable fields that - were set at model initialization. Other fields with value `None` - are ignored. 
- """ - _dict = self.model_dump( - by_alias=True, - exclude={}, - exclude_none=True, - ) - # override the default output from pydantic by calling `to_dict()` of each item in kwargs (list) - _items = [] - if self.kwargs: - for _item in self.kwargs: - if _item: - _items.append(_item.to_dict()) - _dict["kwargs"] = _items - return _dict - - @classmethod - def from_dict(cls, obj: Dict) -> Self: - """Create an instance of UploadSource from a dict""" - if obj is None: - return None - - if not isinstance(obj, dict): - return cls.model_validate(obj) - - _obj = cls.model_validate( - { - "name": obj.get("name"), - "type": obj.get("type"), - "kwargs": ( - [KeyValuePair.from_dict(_item) for _item in obj.get("kwargs")] - if obj.get("kwargs") is not None - else None - ), - } - ) - return _obj diff --git a/admin-api-lib/src/admin_api_lib/models/validation_error_loc_inner.py b/admin-api-lib/src/admin_api_lib/models/validation_error_loc_inner.py index 8cd53fe..0100c88 100644 --- a/admin-api-lib/src/admin_api_lib/models/validation_error_loc_inner.py +++ b/admin-api-lib/src/admin_api_lib/models/validation_error_loc_inner.py @@ -13,141 +13,103 @@ from __future__ import annotations -from inspect import getfullargspec -import json import pprint import re # noqa: F401 +import json -from pydantic import BaseModel, ConfigDict, Field, StrictInt, StrictStr, ValidationError, field_validator -from typing import Optional -from typing import Union, Any, List, TYPE_CHECKING, Optional, Dict -from typing_extensions import Literal -from pydantic import StrictStr, Field +from pydantic import BaseModel, ConfigDict, StrictInt, StrictStr +from typing import Any, ClassVar, Dict, List, Optional try: from typing import Self except ImportError: from typing_extensions import Self -VALIDATIONERRORLOCINNER_ANY_OF_SCHEMAS = ["int", "str"] - class ValidationErrorLocInner(BaseModel): """ ValidationErrorLocInner - """ + """ # noqa: E501 - # data type: str anyof_schema_1_validator: Optional[StrictStr] = None - # 
data type: int anyof_schema_2_validator: Optional[StrictInt] = None - if TYPE_CHECKING: - actual_instance: Optional[Union[int, str]] = None - else: - actual_instance: Any = None - any_of_schemas: List[str] = Literal[VALIDATIONERRORLOCINNER_ANY_OF_SCHEMAS] + actual_instance: Optional[Any] = None + any_of_schemas: Optional[List[StrictStr]] = None + __properties: ClassVar[List[str]] = [ + "anyof_schema_1_validator", + "anyof_schema_2_validator", + "actual_instance", + "any_of_schemas", + ] model_config = { + "populate_by_name": True, "validate_assignment": True, "protected_namespaces": (), } - def __init__(self, *args, **kwargs) -> None: - if args: - if len(args) > 1: - raise ValueError("If a position argument is used, only 1 is allowed to set `actual_instance`") - if kwargs: - raise ValueError("If a position argument is used, keyword arguments cannot be used.") - super().__init__(actual_instance=args[0]) - else: - super().__init__(**kwargs) - - @field_validator("actual_instance") - def actual_instance_must_validate_anyof(cls, v): - instance = ValidationErrorLocInner.model_construct() - error_messages = [] - # validate data type: str - try: - instance.anyof_schema_1_validator = v - return v - except (ValidationError, ValueError) as e: - error_messages.append(str(e)) - # validate data type: int - try: - instance.anyof_schema_2_validator = v - return v - except (ValidationError, ValueError) as e: - error_messages.append(str(e)) - if error_messages: - # no match - raise ValueError( - "No match found when setting the actual_instance in ValidationErrorLocInner with anyOf schemas: int, str. 
Details: " - + ", ".join(error_messages) - ) - else: - return v + def to_str(self) -> str: + """Returns the string representation of the model using alias""" + return pprint.pformat(self.model_dump(by_alias=True)) - @classmethod - def from_dict(cls, obj: dict) -> Self: - return cls.from_json(json.dumps(obj)) + def to_json(self) -> str: + """Returns the JSON representation of the model using alias""" + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) @classmethod def from_json(cls, json_str: str) -> Self: - """Returns the object represented by the json string""" - instance = cls.model_construct() - error_messages = [] - # deserialize data into str - try: - # validation - instance.anyof_schema_1_validator = json.loads(json_str) - # assign value to actual_instance - instance.actual_instance = instance.anyof_schema_1_validator - return instance - except (ValidationError, ValueError) as e: - error_messages.append(str(e)) - # deserialize data into int - try: - # validation - instance.anyof_schema_2_validator = json.loads(json_str) - # assign value to actual_instance - instance.actual_instance = instance.anyof_schema_2_validator - return instance - except (ValidationError, ValueError) as e: - error_messages.append(str(e)) - - if error_messages: - # no match - raise ValueError( - "No match found when deserializing the JSON string into ValidationErrorLocInner with anyOf schemas: int, str. 
Details: " - + ", ".join(error_messages) - ) - else: - return instance - - def to_json(self) -> str: - """Returns the JSON representation of the actual instance""" - if self.actual_instance is None: - return "null" - - to_json = getattr(self.actual_instance, "to_json", None) - if callable(to_json): - return self.actual_instance.to_json() - else: - return json.dumps(self.actual_instance) - - def to_dict(self) -> Dict: - """Returns the dict representation of the actual instance""" - if self.actual_instance is None: - return "null" - - to_json = getattr(self.actual_instance, "to_json", None) - if callable(to_json): - return self.actual_instance.to_dict() - else: - # primitive type - return self.actual_instance + """Create an instance of ValidationErrorLocInner from a JSON string""" + return cls.from_dict(json.loads(json_str)) + + def to_dict(self) -> Dict[str, Any]: + """Return the dictionary representation of the model using alias. + + This has the following differences from calling pydantic's + `self.model_dump(by_alias=True)`: + + * `None` is only added to the output dict for nullable fields that + were set at model initialization. Other fields with value `None` + are ignored. 
+ """ + _dict = self.model_dump( + by_alias=True, + exclude={}, + exclude_none=True, + ) + # set to None if anyof_schema_1_validator (nullable) is None + # and model_fields_set contains the field + if self.anyof_schema_1_validator is None and "anyof_schema_1_validator" in self.model_fields_set: + _dict["anyof_schema_1_validator"] = None + + # set to None if anyof_schema_2_validator (nullable) is None + # and model_fields_set contains the field + if self.anyof_schema_2_validator is None and "anyof_schema_2_validator" in self.model_fields_set: + _dict["anyof_schema_2_validator"] = None + + # set to None if actual_instance (nullable) is None + # and model_fields_set contains the field + if self.actual_instance is None and "actual_instance" in self.model_fields_set: + _dict["actual_instance"] = None + + return _dict - def to_str(self) -> str: - """Returns the string representation of the actual instance""" - return pprint.pformat(self.model_dump()) + @classmethod + def from_dict(cls, obj: Dict) -> Self: + """Create an instance of ValidationErrorLocInner from a dict""" + if obj is None: + return None + + if not isinstance(obj, dict): + return cls.model_validate(obj) + + _obj = cls.model_validate( + { + "anyof_schema_1_validator": obj.get("anyof_schema_1_validator"), + "anyof_schema_2_validator": obj.get("anyof_schema_2_validator"), + "actual_instance": obj.get("actual_instance"), + "any_of_schemas": obj.get("any_of_schemas"), + } + ) + return _obj From 40625379663c82e977f7739dbab7658eec002001 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Thu, 15 May 2025 13:43:10 +0200 Subject: [PATCH 09/43] wip --- .../api_endpoints/file_uploader.py | 14 ++ .../api_endpoints/source_uploader.py | 3 +- .../src/admin_api_lib/apis/admin_api.py | 9 +- .../src/admin_api_lib/apis/admin_api_base.py | 2 +- .../src/admin_api_lib/impl/admin_api.py | 16 +- .../api_endpoints/default_file_uploader.py | 152 ++++++++++++++++++ .../api_endpoints/default_source_uploader.py | 27 +--- 7 files changed, 
191 insertions(+), 32 deletions(-) create mode 100644 admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py create mode 100644 admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py b/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py new file mode 100644 index 0000000..aaeea8f --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py @@ -0,0 +1,14 @@ +from abc import ABC, abstractmethod + +from fastapi import UploadFile + + + +class FileUploader(ABC): + + @abstractmethod + async def upload_file( + self, + base_url: str, + file: UploadFile, + ) -> None: ... diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py b/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py index 0c9b73e..9cdd59e 100644 --- a/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py @@ -15,6 +15,5 @@ async def upload_source( base_url: str, type: StrictStr, name: StrictStr, - file: Optional[UploadFile], - kwargs: Optional[list[KeyValuePair]], + kwargs: list[KeyValuePair], ) -> None: ... 
diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api.py b/admin-api-lib/src/admin_api_lib/apis/admin_api.py index 15f8438..4fe1e15 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api.py @@ -134,7 +134,7 @@ async def get_all_documents_status() -> List[DocumentStatus]: @router.post( "/upload_file", responses={ - 200: {"model": object, "description": "ok"}, + 200: {"description": "ok"}, 400: {"description": "Bad request"}, 422: {"description": "Unprocessable Content"}, 500: {"description": "Internal server error"}, @@ -146,11 +146,11 @@ async def get_all_documents_status() -> List[DocumentStatus]: async def upload_file( file: UploadFile, request: Request, -) -> object: +) -> None: """Uploads user selected sources.""" if not BaseAdminApi.subclasses: raise HTTPException(status_code=500, detail="Not implemented") - return await BaseAdminApi.subclasses[0]().upload_file(file) + return await BaseAdminApi.subclasses[0]().upload_file(file, request) @router.post( @@ -166,7 +166,6 @@ async def upload_file( response_model_by_alias=True, ) async def upload_source( - request: Request, type: StrictStr = Query(None, description="", alias="type"), name: StrictStr = Query(None, description="", alias="name"), @@ -175,4 +174,4 @@ async def upload_source( """Uploads user selected sources.""" if not BaseAdminApi.subclasses: raise HTTPException(status_code=500, detail="Not implemented") - return await BaseAdminApi.subclasses[0]().upload_source(type, name, key_value_pair) + return await BaseAdminApi.subclasses[0]().upload_source(type, name, key_value_pair, request) diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py index df6b1a3..09d4d6d 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py @@ -78,6 +78,6 @@ async def upload_file( self, file: UploadFile, request: Request, - 
) -> object: + ) -> None: """Uploads user selected file.""" ... diff --git a/admin-api-lib/src/admin_api_lib/impl/admin_api.py b/admin-api-lib/src/admin_api_lib/impl/admin_api.py index dd39f3c..3adbae1 100644 --- a/admin-api-lib/src/admin_api_lib/impl/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/impl/admin_api.py @@ -93,12 +93,22 @@ async def upload_source( self, type: StrictStr, name: StrictStr, - file: Optional[UploadFile], - kwargs: Optional[List[KeyValuePair]], + kwargs: List[KeyValuePair], request: Request, source_uploader: SourceUploader = Depends(Provide[DependencyContainer.source_uploader]), ) -> None: - await source_uploader.upload_source(str(request.base_url), type, name, file, kwargs) + await source_uploader.upload_source(str(request.base_url), type, name, kwargs) + + + @inject + async def upload_file( + self, + file: UploadFile, + request: Request, + file_uploader: FileUploader = Depends(Provide[DependencyContainer.file_uploader]), + ) -> None: + await file_uploader.upload_source(str(request.base_url), file) + @inject async def document_reference_id_get( diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py new file mode 100644 index 0000000..0dd5b4f --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py @@ -0,0 +1,152 @@ +from http.client import HTTPException +import logging +import os +from pathlib import Path +import traceback +from typing import Optional, Tuple, Union +from threading import Thread +import urllib +import tempfile +from urllib.request import Request + +from pydantic import StrictBytes, StrictStr +from fastapi import UploadFile, status +from langchain_core.documents import Document +from asyncio import run + +from admin_api_lib.models.key_value_pair import KeyValuePair +from admin_api_lib.rag_backend_client.openapi_client.api.rag_api import RagApi +from 
admin_api_lib.impl.mapper.informationpiece2document import InformationPiece2Document +from admin_api_lib.api_endpoints.document_deleter import DocumentDeleter +from admin_api_lib.api_endpoints.source_uploader import SourceUploader +from admin_api_lib.chunker.chunker import Chunker +from admin_api_lib.models.status import Status +from admin_api_lib.extractor_api_client.extractor_api_client import ExtractorApiClient +from admin_api_lib.impl.key_db.file_status_key_value_store import FileStatusKeyValueStore +from admin_api_lib.information_enhancer.information_enhancer import InformationEnhancer +from admin_api_lib.utils.utils import sanitize_document_name + +logger = logging.getLogger(__name__) + + +class DefaultFileUploader(FileUploader): + + def __init__( + self, + extractor_api: ExtractorApiClient, + key_value_store: FileStatusKeyValueStore, + information_enhancer: InformationEnhancer, + chunker: Chunker, + document_deleter: DocumentDeleter, + rag_api: RagApi, + information_mapper: InformationPiece2Document, + ): + self._extractor_api = extractor_api + self._rag_api = rag_api + self._key_value_store = key_value_store + self._information_mapper = information_mapper + self._information_enhancer = information_enhancer + self._chunker = chunker + self._document_deleter = document_deleter + self._background_threads = [] + + async def upload_source( + self, + base_url: str, + file: UploadFile, + ) -> None: + self._background_threads = [t for t in self._background_threads if t.is_alive()] + + + try: + content = await file.read() + file.filename = sanitize_document_name(file.filename) + source_name = f"file:{sanitize_document_name(file.filename)}" + # TODO: check if document already in processing state + self._key_value_store.upsert( + source_name, Status.PROCESSING + ) # TODO: change to pipeline with timeout to error status + s3_path = await self._asave_new_document(content, file.filename, source_name) + thread = Thread( + target=lambda: 
run(self._handle_source_upload(s3_path,source_name, file.filename, base_url))
+            )
+            thread.start()
+            self._background_threads.append(thread)
+        except ValueError as e:
+            self._key_value_store.upsert(source_name, Status.ERROR)
+            raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(e))
+        except Exception as e:
+            self._key_value_store.upsert(source_name, Status.ERROR)
+            logger.error("Error while uploading %s = %s", source_name, str(e))
+            raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))
+
+    async def _handle_source_upload(
+        self,
+        s3_path:Path,
+        source_name: str,
+        file_name:str,
+        base_url: str,
+    ):
+        try:
+            information_pieces = self._extractor_api.extract(s3_path, source_name)
+
+            if not information_pieces:
+                self._key_value_store.upsert(source_name, Status.ERROR)
+                logger.error("No information pieces found in the document: %s", source_name)  # FIXME: should return here — READY below overwrites ERROR
+            documents = [self._information_mapper.extractor_information_piece2document(x) for x in information_pieces]
+
+            chunked_documents = self._chunker.chunk(documents)
+
+            enhanced_documents = await self._information_enhancer.ainvoke(chunked_documents)
+            self._add_file_url(file_name,base_url,enhanced_documents)
+
+            rag_information_pieces = [
+                self._information_mapper.document2rag_information_piece(doc) for doc in enhanced_documents
+            ]
+            # Replace old document
+            try:
+                await self._document_deleter.adelete_document(source_name)
+            except Exception as e:
+                # deletion is allowed to fail
+                pass
+            self._rag_api.upload_information_piece(rag_information_pieces)
+            self._key_value_store.upsert(source_name, Status.READY)
+            logger.info("Source uploaded successfully: %s", source_name)
+        except Exception as e:
+            self._key_value_store.upsert(source_name, Status.ERROR)
+            logger.error("Error while uploading %s = %s", source_name, str(e))
+
+    def _add_file_url(
+        self, file_name: str, base_url: str, chunked_documents: list[Document]
+    ):
+        document_url = f"{base_url.rstrip('/')}/document_reference/{urllib.parse.quote_plus(file_name)}"
+        for idx, chunk in enumerate(chunked_documents):
+            if chunk.metadata["id"] in chunk.metadata["related"]:
+                chunk.metadata["related"].remove(chunk.metadata["id"])
+            chunk.metadata.update(
+                {
+                    "chunk": idx,
+                    "chunk_length": len(chunk.page_content),
+                    "document_url": document_url,
+                }
+            )
+
+    async def _asave_new_document(
+        self,
+        file_content: bytes,
+        filename: str,
+        source_name:str,
+    )->Path:
+        try:
+            with tempfile.TemporaryDirectory() as temp_dir:
+                temp_file_path = Path(temp_dir) / filename
+                with open(temp_file_path, "wb") as temp_file:
+                    logger.debug("Temporary file created at %s.", temp_file_path)
+                    temp_file.write(file_content)
+                    logger.debug("Temp file created and content written.")
+
+                self._file_service.upload_file(Path(temp_file_path), filename)  # FIXME: self._file_service is never set in __init__
+                return Path(temp_file_path)
+        except Exception as e:
+            logger.error("Error during document saving: %s %s", e, traceback.format_exc())
+            self._key_value_store.upsert(source_name, Status.ERROR)
diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py
index 196d8a7..5ef6a72 100644
--- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py
+++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py
@@ -53,8 +53,7 @@ async def upload_source(
         base_url: str,
         type: StrictStr,
         name: StrictStr,
-        file: Optional[UploadFile],
-        kwargs: Optional[list[KeyValuePair]],
+        kwargs: list[KeyValuePair],
     ) -> None:
         self._background_threads = [t for t in self._background_threads if t.is_alive()]
         source_name = f"{type}:{sanitize_document_name(name)}"
@@ -62,15 +61,9 @@ async def upload_source(
         # TODO: check if document already in processing state
         self._key_value_store.upsert(
             source_name, Status.PROCESSING
-        )  # TODO: change to pipeline with timeout to error status
-        filename = None
-        if
file: - content = await file.read() - filename = Path("/tmp/" + file.filename) - with open(filename, "wb") as tmpfile: - tmpfile.write(content) + ) # TODO: change to pipeline with timeout to error status thread = Thread( - target=lambda: run(self._handle_source_upload(source_name, base_url, type, name, filename, kwargs)) + target=lambda: run(self._handle_source_upload(source_name, base_url, type, name, kwargs)) ) thread.start() self._background_threads.append(thread) @@ -87,15 +80,10 @@ async def _handle_source_upload( source_name: str, base_url: str, type: StrictStr, - name: StrictStr, - file, #: Optional[Union[StrictBytes, StrictStr, Tuple[StrictStr, StrictBytes]]], - kwargs: Optional[list[KeyValuePair]], + kwargs: list[KeyValuePair], ): try: - if file: - information_pieces = self._extractor_api.extract(type, source_name, str(file), kwargs) - else: - information_pieces = self._extractor_api.extract(type, source_name, None, kwargs) + information_pieces = self._extractor_api.extract(type, source_name, kwargs) if not information_pieces: self._key_value_store.upsert(source_name, Status.ERROR) @@ -103,7 +91,6 @@ async def _handle_source_upload( documents = [self._information_mapper.extractor_information_piece2document(x) for x in information_pieces] chunked_documents = self._chunker.chunk(documents) - self._add_file_url(type, file, base_url, chunked_documents) enhanced_documents = await self._information_enhancer.ainvoke(chunked_documents) rag_information_pieces = [ @@ -118,9 +105,7 @@ async def _handle_source_upload( pass self._rag_api.upload_information_piece(rag_information_pieces) self._key_value_store.upsert(source_name, Status.READY) - logger.info("File uploaded successfully: %s", source_name) - if file: - os.remove(file) + logger.info("Source uploaded successfully: %s", source_name) except Exception as e: self._key_value_store.upsert(source_name, Status.ERROR) logger.error("Error while uploading %s = %s", source_name, str(e)) From 
96b6d101744ce360403b7d59d9ec691e8bd00159 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Thu, 15 May 2025 13:53:54 +0200 Subject: [PATCH 10/43] wip --- .../api_endpoints/file_uploader.py | 3 +- .../src/admin_api_lib/apis/admin_api_base.py | 4 +- .../extractor_api_client.py | 50 -- .../extractor_api_client/models/__init__.py | 19 - .../openapi_client/__init__.py | 38 + .../openapi_client/api/__init__.py | 4 + .../openapi_client/api/extractor_api.py | 516 +++++++++++++ .../openapi_client/api_client.py | 695 ++++++++++++++++++ .../openapi_client/api_response.py | 20 + .../openapi_client/configuration.py | 460 ++++++++++++ .../openapi_client/exceptions.py | 197 +++++ .../openapi_client/models/__init__.py | 21 + .../models/content_type.py | 0 .../models/extraction_parameters.py | 103 +++ .../models/extraction_request.py | 82 +++ .../models/information_piece.py | 4 +- .../models/key_value_pair.py | 0 .../openapi_client/rest.py | 209 ++++++ .../openapi_client/test/__init__.py | 0 .../openapi_client/test/test_content_type.py | 35 + .../test/test_extraction_parameters.py | 59 ++ .../test/test_extraction_request.py | 56 ++ .../openapi_client/test/test_extractor_api.py | 39 + .../test/test_information_piece.py | 62 ++ .../test/test_key_value_pair.py | 54 ++ .../src/admin_api_lib/impl/admin_api.py | 2 - .../api_endpoints/default_file_uploader.py | 25 +- .../api_endpoints/default_source_uploader.py | 6 +- extractor-api-lib/openapi.yaml | 75 +- .../extractor_api_lib/apis/extractor_api.py | 45 +- .../apis/extractor_api_base.py | 19 +- .../extractors/information_file_extractor.py | 2 +- .../models/extraction_parameters.py | 105 +++ .../models/extraction_request.py | 31 +- 34 files changed, 2880 insertions(+), 160 deletions(-) delete mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/extractor_api_client.py delete mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/models/__init__.py create mode 100644 
admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/__init__.py create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/__init__.py create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_client.py create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_response.py create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/configuration.py create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/exceptions.py create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/__init__.py rename admin-api-lib/src/admin_api_lib/extractor_api_client/{ => openapi_client}/models/content_type.py (100%) create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_parameters.py create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py rename admin-api-lib/src/admin_api_lib/extractor_api_client/{ => openapi_client}/models/information_piece.py (94%) rename admin-api-lib/src/admin_api_lib/extractor_api_client/{ => openapi_client}/models/key_value_pair.py (100%) create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/rest.py create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/__init__.py create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_content_type.py create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_parameters.py create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_request.py create mode 100644 
admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extractor_api.py create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_information_piece.py create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_key_value_pair.py create mode 100644 extractor-api-lib/src/extractor_api_lib/models/extraction_parameters.py diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py b/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py index aaeea8f..2a33545 100644 --- a/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py +++ b/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py @@ -3,12 +3,11 @@ from fastapi import UploadFile - class FileUploader(ABC): @abstractmethod async def upload_file( self, base_url: str, - file: UploadFile, + file: UploadFile, ) -> None: ... diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py index 09d4d6d..eb5ca84 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py @@ -21,7 +21,7 @@ def __init_subclass__(cls, **kwargs): async def delete_document( self, identification: StrictStr, - ) -> None: + ) -> None: """ Asynchronously deletes a document based on the provided identification. 
@@ -73,7 +73,7 @@ async def upload_source( request: Request, ) -> None: """Uploads user selected source.""" - + async def upload_file( self, file: UploadFile, diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/extractor_api_client.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/extractor_api_client.py deleted file mode 100644 index 78ccbf7..0000000 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/extractor_api_client.py +++ /dev/null @@ -1,50 +0,0 @@ -import requests -from admin_api_lib.extractor_api_client.models.information_piece import InformationPiece -from requests_toolbelt.multipart import MultipartEncoder - - -class ExtractorApiClient: - def __init__(self, base_url): - """ - Initialize the client with the base URL of the API. - - Args: - base_url (str): The base URL of the API. - """ - self.base_url = base_url - - def extract(self, type, name, file, kwargs=None): - """ - Send an extraction request to the API. - - Args: - file (str): The path to the file to extract from. - name (str): The name of the extraction request. - type (str): The type of extraction to perform. - kwargs (list): A list of key-value pairs to pass as additional arguments. - - Returns: - list: A list of extracted information pieces. 
- """ - with open(file, "rb") as openfile: - url = self.base_url + "/extract" - encoder = MultipartEncoder( - fields={ - "file": (file, openfile, "application/octet-stream"), - "name": name, - "type": type, - } - ) - if kwargs: - for pair in kwargs: - encoder.add_field(pair["key"], pair["value"]) - response = requests.post(url, headers={"Content-Type": encoder.content_type}, data=encoder) - if response.status_code == 200: - response_json = response.json() - return [InformationPiece.from_dict(x) for x in response_json] - elif response.status_code == 422: - raise ValueError("Invalid source") - elif response.status_code == 500: - raise Exception("Internal server error") - else: - raise Exception("Unknown error") diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/models/__init__.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/models/__init__.py deleted file mode 100644 index 53560b6..0000000 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/models/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# coding: utf-8 - -# flake8: noqa -""" -extractor-api-lib - -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) - -Do not edit the class manually. 
-""" # noqa: E501 - - -# import models into model package -from admin_api_lib.extractor_api_client.models.content_type import ContentType -from admin_api_lib.extractor_api_client.models.information_piece import InformationPiece -from admin_api_lib.extractor_api_client.models.key_value_pair import KeyValuePair diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/__init__.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/__init__.py new file mode 100644 index 0000000..edf9fd4 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/__init__.py @@ -0,0 +1,38 @@ +# coding: utf-8 + +# flake8: noqa + +""" +extractor-api-lib + +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. +""" # noqa: E501 + + +__version__ = "1.0.0" + +# import apis into sdk package +from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi + +# import ApiClient +from admin_api_lib.extractor_api_client.openapi_client.api_response import ApiResponse +from admin_api_lib.extractor_api_client.openapi_client.api_client import ApiClient +from admin_api_lib.extractor_api_client.openapi_client.configuration import Configuration +from admin_api_lib.extractor_api_client.openapi_client.exceptions import OpenApiException +from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiTypeError +from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiValueError +from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiKeyError +from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiAttributeError +from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiException + +# import models into sdk package +from 
admin_api_lib.extractor_api_client.openapi_client.models.content_type import ContentType +from admin_api_lib.extractor_api_client.openapi_client.models.extraction_parameters import ExtractionParameters +from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ExtractionRequest +from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import InformationPiece +from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/__init__.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/__init__.py new file mode 100644 index 0000000..c95ce65 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/__init__.py @@ -0,0 +1,4 @@ +# flake8: noqa + +# import apis into api package +from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py new file mode 100644 index 0000000..1aaddf7 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py @@ -0,0 +1,516 @@ +# coding: utf-8 + +""" +extractor-api-lib + +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. 
+""" # noqa: E501 + +import warnings +from pydantic import validate_call, Field, StrictFloat, StrictStr, StrictInt +from typing import Any, Dict, List, Optional, Tuple, Union +from typing_extensions import Annotated + +from typing import List +from admin_api_lib.extractor_api_client.openapi_client.models.extraction_parameters import ExtractionParameters +from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ExtractionRequest +from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import InformationPiece + +from admin_api_lib.extractor_api_client.openapi_client.api_client import ApiClient, RequestSerialized +from admin_api_lib.extractor_api_client.openapi_client.api_response import ApiResponse +from admin_api_lib.extractor_api_client.openapi_client.rest import RESTResponseType + + +class ExtractorApi: + """NOTE: This class is auto generated by OpenAPI Generator + Ref: https://openapi-generator.tech + + Do not edit the class manually. + """ + + def __init__(self, api_client=None) -> None: + if api_client is None: + api_client = ApiClient.get_default() + self.api_client = api_client + + @validate_call + def extract_from_file_post( + self, + extraction_request: ExtractionRequest, + _request_timeout: Union[ + None, + Annotated[StrictFloat, Field(gt=0)], + Tuple[Annotated[StrictFloat, Field(gt=0)], Annotated[StrictFloat, Field(gt=0)]], + ] = None, + _request_auth: Optional[Dict[StrictStr, Any]] = None, + _content_type: Optional[StrictStr] = None, + _headers: Optional[Dict[StrictStr, Any]] = None, + _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0, + ) -> List[InformationPiece]: + """extract_from_file_post + + + :param extraction_request: (required) + :type extraction_request: ExtractionRequest + :param _request_timeout: timeout setting for this request. If one + number provided, it will be total request + timeout. It can also be a pair (tuple) of + (connection, read) timeouts. 
+ :type _request_timeout: int, tuple(int, int), optional + :param _request_auth: set to override the auth_settings for an a single + request; this effectively ignores the + authentication in the spec for a single request. + :type _request_auth: dict, optional + :param _content_type: force content-type for the request. + :type _content_type: str, Optional + :param _headers: set to override the headers for a single + request; this effectively ignores the headers + in the spec for a single request. + :type _headers: dict, optional + :param _host_index: set to override the host_index for a single + request; this effectively ignores the host_index + in the spec for a single request. + :type _host_index: int, optional + :return: Returns the result object. + """ # noqa: E501 + + _param = self._extract_from_file_post_serialize( + extraction_request=extraction_request, + _request_auth=_request_auth, + _content_type=_content_type, + _headers=_headers, + _host_index=_host_index, + ) + + _response_types_map: Dict[str, Optional[str]] = { + "200": "List[InformationPiece]", + "422": None, + "500": None, + } + response_data = self.api_client.call_api(*_param, _request_timeout=_request_timeout) + response_data.read() + return self.api_client.response_deserialize( + response_data=response_data, + response_types_map=_response_types_map, + ).data + + @validate_call + def extract_from_file_post_with_http_info( + self, + extraction_request: ExtractionRequest, + _request_timeout: Union[ + None, + Annotated[StrictFloat, Field(gt=0)], + Tuple[Annotated[StrictFloat, Field(gt=0)], Annotated[StrictFloat, Field(gt=0)]], + ] = None, + _request_auth: Optional[Dict[StrictStr, Any]] = None, + _content_type: Optional[StrictStr] = None, + _headers: Optional[Dict[StrictStr, Any]] = None, + _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0, + ) -> ApiResponse[List[InformationPiece]]: + """extract_from_file_post + + + :param extraction_request: (required) + :type extraction_request: 
ExtractionRequest + :param _request_timeout: timeout setting for this request. If one + number provided, it will be total request + timeout. It can also be a pair (tuple) of + (connection, read) timeouts. + :type _request_timeout: int, tuple(int, int), optional + :param _request_auth: set to override the auth_settings for an a single + request; this effectively ignores the + authentication in the spec for a single request. + :type _request_auth: dict, optional + :param _content_type: force content-type for the request. + :type _content_type: str, Optional + :param _headers: set to override the headers for a single + request; this effectively ignores the headers + in the spec for a single request. + :type _headers: dict, optional + :param _host_index: set to override the host_index for a single + request; this effectively ignores the host_index + in the spec for a single request. + :type _host_index: int, optional + :return: Returns the result object. + """ # noqa: E501 + + _param = self._extract_from_file_post_serialize( + extraction_request=extraction_request, + _request_auth=_request_auth, + _content_type=_content_type, + _headers=_headers, + _host_index=_host_index, + ) + + _response_types_map: Dict[str, Optional[str]] = { + "200": "List[InformationPiece]", + "422": None, + "500": None, + } + response_data = self.api_client.call_api(*_param, _request_timeout=_request_timeout) + response_data.read() + return self.api_client.response_deserialize( + response_data=response_data, + response_types_map=_response_types_map, + ) + + @validate_call + def extract_from_file_post_without_preload_content( + self, + extraction_request: ExtractionRequest, + _request_timeout: Union[ + None, + Annotated[StrictFloat, Field(gt=0)], + Tuple[Annotated[StrictFloat, Field(gt=0)], Annotated[StrictFloat, Field(gt=0)]], + ] = None, + _request_auth: Optional[Dict[StrictStr, Any]] = None, + _content_type: Optional[StrictStr] = None, + _headers: Optional[Dict[StrictStr, Any]] = None, + 
_host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0, + ) -> RESTResponseType: + """extract_from_file_post + + + :param extraction_request: (required) + :type extraction_request: ExtractionRequest + :param _request_timeout: timeout setting for this request. If one + number provided, it will be total request + timeout. It can also be a pair (tuple) of + (connection, read) timeouts. + :type _request_timeout: int, tuple(int, int), optional + :param _request_auth: set to override the auth_settings for an a single + request; this effectively ignores the + authentication in the spec for a single request. + :type _request_auth: dict, optional + :param _content_type: force content-type for the request. + :type _content_type: str, Optional + :param _headers: set to override the headers for a single + request; this effectively ignores the headers + in the spec for a single request. + :type _headers: dict, optional + :param _host_index: set to override the host_index for a single + request; this effectively ignores the host_index + in the spec for a single request. + :type _host_index: int, optional + :return: Returns the result object. 
+ """ # noqa: E501 + + _param = self._extract_from_file_post_serialize( + extraction_request=extraction_request, + _request_auth=_request_auth, + _content_type=_content_type, + _headers=_headers, + _host_index=_host_index, + ) + + _response_types_map: Dict[str, Optional[str]] = { + "200": "List[InformationPiece]", + "422": None, + "500": None, + } + response_data = self.api_client.call_api(*_param, _request_timeout=_request_timeout) + return response_data.response + + def _extract_from_file_post_serialize( + self, + extraction_request, + _request_auth, + _content_type, + _headers, + _host_index, + ) -> RequestSerialized: + + _host = None + + _collection_formats: Dict[str, str] = {} + + _path_params: Dict[str, str] = {} + _query_params: List[Tuple[str, str]] = [] + _header_params: Dict[str, Optional[str]] = _headers or {} + _form_params: List[Tuple[str, str]] = [] + _files: Dict[str, Union[str, bytes, List[str], List[bytes], List[Tuple[str, bytes]]]] = {} + _body_params: Optional[bytes] = None + + # process the path parameters + # process the query parameters + # process the header parameters + # process the form parameters + # process the body parameter + if extraction_request is not None: + _body_params = extraction_request + + # set the HTTP header `Accept` + if "Accept" not in _header_params: + _header_params["Accept"] = self.api_client.select_header_accept(["application/json"]) + + # set the HTTP header `Content-Type` + if _content_type: + _header_params["Content-Type"] = _content_type + else: + _default_content_type = self.api_client.select_header_content_type(["application/json"]) + if _default_content_type is not None: + _header_params["Content-Type"] = _default_content_type + + # authentication setting + _auth_settings: List[str] = [] + + return self.api_client.param_serialize( + method="POST", + resource_path="/extract_from_file", + path_params=_path_params, + query_params=_query_params, + header_params=_header_params, + body=_body_params, + 
post_params=_form_params, + files=_files, + auth_settings=_auth_settings, + collection_formats=_collection_formats, + _host=_host, + _request_auth=_request_auth, + ) + + @validate_call + def extract_from_source( + self, + extraction_parameters: ExtractionParameters, + _request_timeout: Union[ + None, + Annotated[StrictFloat, Field(gt=0)], + Tuple[Annotated[StrictFloat, Field(gt=0)], Annotated[StrictFloat, Field(gt=0)]], + ] = None, + _request_auth: Optional[Dict[StrictStr, Any]] = None, + _content_type: Optional[StrictStr] = None, + _headers: Optional[Dict[StrictStr, Any]] = None, + _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0, + ) -> List[InformationPiece]: + """extract_from_source + + + :param extraction_parameters: (required) + :type extraction_parameters: ExtractionParameters + :param _request_timeout: timeout setting for this request. If one + number provided, it will be total request + timeout. It can also be a pair (tuple) of + (connection, read) timeouts. + :type _request_timeout: int, tuple(int, int), optional + :param _request_auth: set to override the auth_settings for an a single + request; this effectively ignores the + authentication in the spec for a single request. + :type _request_auth: dict, optional + :param _content_type: force content-type for the request. + :type _content_type: str, Optional + :param _headers: set to override the headers for a single + request; this effectively ignores the headers + in the spec for a single request. + :type _headers: dict, optional + :param _host_index: set to override the host_index for a single + request; this effectively ignores the host_index + in the spec for a single request. + :type _host_index: int, optional + :return: Returns the result object. 
+ """ # noqa: E501 + + _param = self._extract_from_source_serialize( + extraction_parameters=extraction_parameters, + _request_auth=_request_auth, + _content_type=_content_type, + _headers=_headers, + _host_index=_host_index, + ) + + _response_types_map: Dict[str, Optional[str]] = { + "200": "List[InformationPiece]", + "404": None, + "422": None, + "500": None, + } + response_data = self.api_client.call_api(*_param, _request_timeout=_request_timeout) + response_data.read() + return self.api_client.response_deserialize( + response_data=response_data, + response_types_map=_response_types_map, + ).data + + @validate_call + def extract_from_source_with_http_info( + self, + extraction_parameters: ExtractionParameters, + _request_timeout: Union[ + None, + Annotated[StrictFloat, Field(gt=0)], + Tuple[Annotated[StrictFloat, Field(gt=0)], Annotated[StrictFloat, Field(gt=0)]], + ] = None, + _request_auth: Optional[Dict[StrictStr, Any]] = None, + _content_type: Optional[StrictStr] = None, + _headers: Optional[Dict[StrictStr, Any]] = None, + _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0, + ) -> ApiResponse[List[InformationPiece]]: + """extract_from_source + + + :param extraction_parameters: (required) + :type extraction_parameters: ExtractionParameters + :param _request_timeout: timeout setting for this request. If one + number provided, it will be total request + timeout. It can also be a pair (tuple) of + (connection, read) timeouts. + :type _request_timeout: int, tuple(int, int), optional + :param _request_auth: set to override the auth_settings for an a single + request; this effectively ignores the + authentication in the spec for a single request. + :type _request_auth: dict, optional + :param _content_type: force content-type for the request. + :type _content_type: str, Optional + :param _headers: set to override the headers for a single + request; this effectively ignores the headers + in the spec for a single request. 
+ :type _headers: dict, optional + :param _host_index: set to override the host_index for a single + request; this effectively ignores the host_index + in the spec for a single request. + :type _host_index: int, optional + :return: Returns the result object. + """ # noqa: E501 + + _param = self._extract_from_source_serialize( + extraction_parameters=extraction_parameters, + _request_auth=_request_auth, + _content_type=_content_type, + _headers=_headers, + _host_index=_host_index, + ) + + _response_types_map: Dict[str, Optional[str]] = { + "200": "List[InformationPiece]", + "404": None, + "422": None, + "500": None, + } + response_data = self.api_client.call_api(*_param, _request_timeout=_request_timeout) + response_data.read() + return self.api_client.response_deserialize( + response_data=response_data, + response_types_map=_response_types_map, + ) + + @validate_call + def extract_from_source_without_preload_content( + self, + extraction_parameters: ExtractionParameters, + _request_timeout: Union[ + None, + Annotated[StrictFloat, Field(gt=0)], + Tuple[Annotated[StrictFloat, Field(gt=0)], Annotated[StrictFloat, Field(gt=0)]], + ] = None, + _request_auth: Optional[Dict[StrictStr, Any]] = None, + _content_type: Optional[StrictStr] = None, + _headers: Optional[Dict[StrictStr, Any]] = None, + _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0, + ) -> RESTResponseType: + """extract_from_source + + + :param extraction_parameters: (required) + :type extraction_parameters: ExtractionParameters + :param _request_timeout: timeout setting for this request. If one + number provided, it will be total request + timeout. It can also be a pair (tuple) of + (connection, read) timeouts. + :type _request_timeout: int, tuple(int, int), optional + :param _request_auth: set to override the auth_settings for an a single + request; this effectively ignores the + authentication in the spec for a single request. 
+ :type _request_auth: dict, optional + :param _content_type: force content-type for the request. + :type _content_type: str, Optional + :param _headers: set to override the headers for a single + request; this effectively ignores the headers + in the spec for a single request. + :type _headers: dict, optional + :param _host_index: set to override the host_index for a single + request; this effectively ignores the host_index + in the spec for a single request. + :type _host_index: int, optional + :return: Returns the result object. + """ # noqa: E501 + + _param = self._extract_from_source_serialize( + extraction_parameters=extraction_parameters, + _request_auth=_request_auth, + _content_type=_content_type, + _headers=_headers, + _host_index=_host_index, + ) + + _response_types_map: Dict[str, Optional[str]] = { + "200": "List[InformationPiece]", + "404": None, + "422": None, + "500": None, + } + response_data = self.api_client.call_api(*_param, _request_timeout=_request_timeout) + return response_data.response + + def _extract_from_source_serialize( + self, + extraction_parameters, + _request_auth, + _content_type, + _headers, + _host_index, + ) -> RequestSerialized: + + _host = None + + _collection_formats: Dict[str, str] = {} + + _path_params: Dict[str, str] = {} + _query_params: List[Tuple[str, str]] = [] + _header_params: Dict[str, Optional[str]] = _headers or {} + _form_params: List[Tuple[str, str]] = [] + _files: Dict[str, Union[str, bytes, List[str], List[bytes], List[Tuple[str, bytes]]]] = {} + _body_params: Optional[bytes] = None + + # process the path parameters + # process the query parameters + # process the header parameters + # process the form parameters + # process the body parameter + if extraction_parameters is not None: + _body_params = extraction_parameters + + # set the HTTP header `Accept` + if "Accept" not in _header_params: + _header_params["Accept"] = self.api_client.select_header_accept(["application/json"]) + + # set the HTTP header 
`Content-Type` + if _content_type: + _header_params["Content-Type"] = _content_type + else: + _default_content_type = self.api_client.select_header_content_type(["application/json"]) + if _default_content_type is not None: + _header_params["Content-Type"] = _default_content_type + + # authentication setting + _auth_settings: List[str] = [] + + return self.api_client.param_serialize( + method="POST", + resource_path="/extract_from_source", + path_params=_path_params, + query_params=_query_params, + header_params=_header_params, + body=_body_params, + post_params=_form_params, + files=_files, + auth_settings=_auth_settings, + collection_formats=_collection_formats, + _host=_host, + _request_auth=_request_auth, + ) diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_client.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_client.py new file mode 100644 index 0000000..ba8f5d2 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_client.py @@ -0,0 +1,695 @@ +# coding: utf-8 + +""" +extractor-api-lib + +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. 
+""" # noqa: E501 + + +import datetime +from dateutil.parser import parse +from enum import Enum +import decimal +import json +import mimetypes +import os +import re +import tempfile + +from urllib.parse import quote +from typing import Tuple, Optional, List, Dict, Union +from pydantic import SecretStr + +from admin_api_lib.extractor_api_client.openapi_client.configuration import Configuration +from admin_api_lib.extractor_api_client.openapi_client.api_response import ApiResponse, T as ApiResponseT +import admin_api_lib.extractor_api_client.openapi_client.models +from admin_api_lib.extractor_api_client.openapi_client import rest +from admin_api_lib.extractor_api_client.openapi_client.exceptions import ( + ApiValueError, + ApiException, + BadRequestException, + UnauthorizedException, + ForbiddenException, + NotFoundException, + ServiceException, +) + +RequestSerialized = Tuple[str, str, Dict[str, str], Optional[str], List[str]] + + +class ApiClient: + """Generic API client for OpenAPI client library builds. + + OpenAPI generic API client. This client handles the client- + server communication, and is invariant across implementations. Specifics of + the methods and models for each application are generated from the OpenAPI + templates. + + :param configuration: .Configuration object for this client + :param header_name: a header to pass when making calls to the API. + :param header_value: a header value to pass when making calls to + the API. + :param cookie: a cookie to include in the header when making calls + to the API + """ + + PRIMITIVE_TYPES = (float, bool, bytes, str, int) + NATIVE_TYPES_MAPPING = { + "int": int, + "long": int, # TODO remove as only py3 is supported? 
+ "float": float, + "str": str, + "bool": bool, + "date": datetime.date, + "datetime": datetime.datetime, + "decimal": decimal.Decimal, + "object": object, + } + _pool = None + + def __init__(self, configuration=None, header_name=None, header_value=None, cookie=None) -> None: + # use default configuration if none is provided + if configuration is None: + configuration = Configuration.get_default() + self.configuration = configuration + + self.rest_client = rest.RESTClientObject(configuration) + self.default_headers = {} + if header_name is not None: + self.default_headers[header_name] = header_value + self.cookie = cookie + # Set default User-Agent. + self.user_agent = "OpenAPI-Generator/1.0.0/python" + self.client_side_validation = configuration.client_side_validation + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + pass + + @property + def user_agent(self): + """User agent for this API client""" + return self.default_headers["User-Agent"] + + @user_agent.setter + def user_agent(self, value): + self.default_headers["User-Agent"] = value + + def set_default_header(self, header_name, header_value): + self.default_headers[header_name] = header_value + + _default = None + + @classmethod + def get_default(cls): + """Return new instance of ApiClient. + + This method returns newly created, based on default constructor, + object of ApiClient class or returns a copy of default + ApiClient. + + :return: The ApiClient object. + """ + if cls._default is None: + cls._default = ApiClient() + return cls._default + + @classmethod + def set_default(cls, default): + """Set default instance of ApiClient. + + It stores default ApiClient. + + :param default: object of ApiClient. 
+ """ + cls._default = default + + def param_serialize( + self, + method, + resource_path, + path_params=None, + query_params=None, + header_params=None, + body=None, + post_params=None, + files=None, + auth_settings=None, + collection_formats=None, + _host=None, + _request_auth=None, + ) -> RequestSerialized: + """Builds the HTTP request params needed by the request. + :param method: Method to call. + :param resource_path: Path to method endpoint. + :param path_params: Path parameters in the url. + :param query_params: Query parameters in the url. + :param header_params: Header parameters to be + placed in the request header. + :param body: Request body. + :param post_params dict: Request post form parameters, + for `application/x-www-form-urlencoded`, `multipart/form-data`. + :param auth_settings list: Auth Settings names for the request. + :param files dict: key -> filename, value -> filepath, + for `multipart/form-data`. + :param collection_formats: dict of collection formats for path, query, + header, and post parameters. + :param _request_auth: set to override the auth_settings for an a single + request; this effectively ignores the authentication + in the spec for a single request. 
+ :return: tuple of form (path, http_method, query_params, header_params, + body, post_params, files) + """ + + config = self.configuration + + # header parameters + header_params = header_params or {} + header_params.update(self.default_headers) + if self.cookie: + header_params["Cookie"] = self.cookie + if header_params: + header_params = self.sanitize_for_serialization(header_params) + header_params = dict(self.parameters_to_tuples(header_params, collection_formats)) + + # path parameters + if path_params: + path_params = self.sanitize_for_serialization(path_params) + path_params = self.parameters_to_tuples(path_params, collection_formats) + for k, v in path_params: + # specified safe chars, encode everything + resource_path = resource_path.replace("{%s}" % k, quote(str(v), safe=config.safe_chars_for_path_param)) + + # post parameters + if post_params or files: + post_params = post_params if post_params else [] + post_params = self.sanitize_for_serialization(post_params) + post_params = self.parameters_to_tuples(post_params, collection_formats) + if files: + post_params.extend(self.files_parameters(files)) + + # auth setting + self.update_params_for_auth( + header_params, query_params, auth_settings, resource_path, method, body, request_auth=_request_auth + ) + + # body + if body: + body = self.sanitize_for_serialization(body) + + # request url + if _host is None or self.configuration.ignore_operation_servers: + url = self.configuration.host + resource_path + else: + # use server/host defined in path or operation instead + url = _host + resource_path + + # query parameters + if query_params: + query_params = self.sanitize_for_serialization(query_params) + url_query = self.parameters_to_url_query(query_params, collection_formats) + url += "?" 
+ url_query + + return method, url, header_params, body, post_params + + def call_api( + self, method, url, header_params=None, body=None, post_params=None, _request_timeout=None + ) -> rest.RESTResponse: + """Makes the HTTP request (synchronous) + :param method: Method to call. + :param url: Path to method endpoint. + :param header_params: Header parameters to be + placed in the request header. + :param body: Request body. + :param post_params dict: Request post form parameters, + for `application/x-www-form-urlencoded`, `multipart/form-data`. + :param _request_timeout: timeout setting for this request. + :return: RESTResponse + """ + + try: + # perform request and return response + response_data = self.rest_client.request( + method, + url, + headers=header_params, + body=body, + post_params=post_params, + _request_timeout=_request_timeout, + ) + + except ApiException as e: + raise e + + return response_data + + def response_deserialize( + self, response_data: rest.RESTResponse, response_types_map: Optional[Dict[str, ApiResponseT]] = None + ) -> ApiResponse[ApiResponseT]: + """Deserializes response into an object. + :param response_data: RESTResponse object to be deserialized. + :param response_types_map: dict of response types. + :return: ApiResponse + """ + + msg = "RESTResponse.read() must be called before passing it to response_deserialize()" + assert response_data.data is not None, msg + + response_type = response_types_map.get(str(response_data.status), None) + if not response_type and isinstance(response_data.status, int) and 100 <= response_data.status <= 599: + # if not found, look for '1XX', '2XX', etc. 
+ response_type = response_types_map.get(str(response_data.status)[0] + "XX", None) + + # deserialize response data + response_text = None + return_data = None + try: + if response_type == "bytearray": + return_data = response_data.data + elif response_type == "file": + return_data = self.__deserialize_file(response_data) + elif response_type is not None: + match = None + content_type = response_data.getheader("content-type") + if content_type is not None: + match = re.search(r"charset=([a-zA-Z\-\d]+)[\s;]?", content_type) + encoding = match.group(1) if match else "utf-8" + response_text = response_data.data.decode(encoding) + return_data = self.deserialize(response_text, response_type, content_type) + finally: + if not 200 <= response_data.status <= 299: + raise ApiException.from_response( + http_resp=response_data, + body=response_text, + data=return_data, + ) + + return ApiResponse( + status_code=response_data.status, + data=return_data, + headers=response_data.getheaders(), + raw_data=response_data.data, + ) + + def sanitize_for_serialization(self, obj): + """Builds a JSON POST object. + + If obj is None, return None. + If obj is SecretStr, return obj.get_secret_value() + If obj is str, int, long, float, bool, return directly. + If obj is datetime.datetime, datetime.date + convert to string in iso8601 format. + If obj is decimal.Decimal return string representation. + If obj is list, sanitize each element in the list. + If obj is dict, return the dict. + If obj is OpenAPI model, return the properties dict. + + :param obj: The data to serialize. + :return: The serialized form of data. 
+ """ + if obj is None: + return None + elif isinstance(obj, Enum): + return obj.value + elif isinstance(obj, SecretStr): + return obj.get_secret_value() + elif isinstance(obj, self.PRIMITIVE_TYPES): + return obj + elif isinstance(obj, list): + return [self.sanitize_for_serialization(sub_obj) for sub_obj in obj] + elif isinstance(obj, tuple): + return tuple(self.sanitize_for_serialization(sub_obj) for sub_obj in obj) + elif isinstance(obj, (datetime.datetime, datetime.date)): + return obj.isoformat() + elif isinstance(obj, decimal.Decimal): + return str(obj) + + elif isinstance(obj, dict): + obj_dict = obj + else: + # Convert model obj to dict except + # attributes `openapi_types`, `attribute_map` + # and attributes which value is not None. + # Convert attribute name to json key in + # model definition for request. + if hasattr(obj, "to_dict") and callable(getattr(obj, "to_dict")): + obj_dict = obj.to_dict() + else: + obj_dict = obj.__dict__ + + return {key: self.sanitize_for_serialization(val) for key, val in obj_dict.items()} + + def deserialize(self, response_text: str, response_type: str, content_type: Optional[str]): + """Deserializes response into an object. + + :param response: RESTResponse object to be deserialized. + :param response_type: class literal for + deserialized object, or string of class name. + :param content_type: content type of response. + + :return: deserialized object. 
+ """ + + # fetch data from response object + if content_type is None: + try: + data = json.loads(response_text) + except ValueError: + data = response_text + elif re.match(r"^application/(json|[\w!#$&.+-^_]+\+json)\s*(;|$)", content_type, re.IGNORECASE): + if response_text == "": + data = "" + else: + data = json.loads(response_text) + elif re.match(r"^text\/[a-z.+-]+\s*(;|$)", content_type, re.IGNORECASE): + data = response_text + else: + raise ApiException(status=0, reason="Unsupported content type: {0}".format(content_type)) + + return self.__deserialize(data, response_type) + + def __deserialize(self, data, klass): + """Deserializes dict, list, str into an object. + + :param data: dict, list or str. + :param klass: class literal, or string of class name. + + :return: object. + """ + if data is None: + return None + + if isinstance(klass, str): + if klass.startswith("List["): + m = re.match(r"List\[(.*)]", klass) + assert m is not None, "Malformed List type definition" + sub_kls = m.group(1) + return [self.__deserialize(sub_data, sub_kls) for sub_data in data] + + if klass.startswith("Dict["): + m = re.match(r"Dict\[([^,]*), (.*)]", klass) + assert m is not None, "Malformed Dict type definition" + sub_kls = m.group(2) + return {k: self.__deserialize(v, sub_kls) for k, v in data.items()} + + # convert str to class + if klass in self.NATIVE_TYPES_MAPPING: + klass = self.NATIVE_TYPES_MAPPING[klass] + else: + klass = getattr(admin_api_lib.extractor_api_client.openapi_client.models, klass) + + if klass in self.PRIMITIVE_TYPES: + return self.__deserialize_primitive(data, klass) + elif klass == object: + return self.__deserialize_object(data) + elif klass == datetime.date: + return self.__deserialize_date(data) + elif klass == datetime.datetime: + return self.__deserialize_datetime(data) + elif klass == decimal.Decimal: + return decimal.Decimal(data) + elif issubclass(klass, Enum): + return self.__deserialize_enum(data, klass) + else: + return 
self.__deserialize_model(data, klass) + + def parameters_to_tuples(self, params, collection_formats): + """Get parameters as list of tuples, formatting collections. + + :param params: Parameters as dict or list of two-tuples + :param dict collection_formats: Parameter collection formats + :return: Parameters as list of tuples, collections formatted + """ + new_params: List[Tuple[str, str]] = [] + if collection_formats is None: + collection_formats = {} + for k, v in params.items() if isinstance(params, dict) else params: + if k in collection_formats: + collection_format = collection_formats[k] + if collection_format == "multi": + new_params.extend((k, value) for value in v) + else: + if collection_format == "ssv": + delimiter = " " + elif collection_format == "tsv": + delimiter = "\t" + elif collection_format == "pipes": + delimiter = "|" + else: # csv is the default + delimiter = "," + new_params.append((k, delimiter.join(str(value) for value in v))) + else: + new_params.append((k, v)) + return new_params + + def parameters_to_url_query(self, params, collection_formats): + """Get parameters as list of tuples, formatting collections. + + :param params: Parameters as dict or list of two-tuples + :param dict collection_formats: Parameter collection formats + :return: URL query string (e.g. 
a=Hello%20World&b=123) + """ + new_params: List[Tuple[str, str]] = [] + if collection_formats is None: + collection_formats = {} + for k, v in params.items() if isinstance(params, dict) else params: + if isinstance(v, bool): + v = str(v).lower() + if isinstance(v, (int, float)): + v = str(v) + if isinstance(v, dict): + v = json.dumps(v) + + if k in collection_formats: + collection_format = collection_formats[k] + if collection_format == "multi": + new_params.extend((k, str(value)) for value in v) + else: + if collection_format == "ssv": + delimiter = " " + elif collection_format == "tsv": + delimiter = "\t" + elif collection_format == "pipes": + delimiter = "|" + else: # csv is the default + delimiter = "," + new_params.append((k, delimiter.join(quote(str(value)) for value in v))) + else: + new_params.append((k, quote(str(v)))) + + return "&".join(["=".join(map(str, item)) for item in new_params]) + + def files_parameters( + self, + files: Dict[str, Union[str, bytes, List[str], List[bytes], Tuple[str, bytes]]], + ): + """Builds form parameters. + + :param files: File parameters. + :return: Form parameters with files. + """ + params = [] + for k, v in files.items(): + if isinstance(v, str): + with open(v, "rb") as f: + filename = os.path.basename(f.name) + filedata = f.read() + elif isinstance(v, bytes): + filename = k + filedata = v + elif isinstance(v, tuple): + filename, filedata = v + elif isinstance(v, list): + for file_param in v: + params.extend(self.files_parameters({k: file_param})) + continue + else: + raise ValueError("Unsupported file value") + mimetype = mimetypes.guess_type(filename)[0] or "application/octet-stream" + params.append(tuple([k, tuple([filename, filedata, mimetype])])) + return params + + def select_header_accept(self, accepts: List[str]) -> Optional[str]: + """Returns `Accept` based on an array of accepts provided. + + :param accepts: List of headers. + :return: Accept (e.g. application/json). 
+ """ + if not accepts: + return None + + for accept in accepts: + if re.search("json", accept, re.IGNORECASE): + return accept + + return accepts[0] + + def select_header_content_type(self, content_types): + """Returns `Content-Type` based on an array of content_types provided. + + :param content_types: List of content-types. + :return: Content-Type (e.g. application/json). + """ + if not content_types: + return None + + for content_type in content_types: + if re.search("json", content_type, re.IGNORECASE): + return content_type + + return content_types[0] + + def update_params_for_auth( + self, headers, queries, auth_settings, resource_path, method, body, request_auth=None + ) -> None: + """Updates header and query params based on authentication setting. + + :param headers: Header parameters dict to be updated. + :param queries: Query parameters tuple list to be updated. + :param auth_settings: Authentication setting identifiers list. + :resource_path: A string representation of the HTTP request resource path. + :method: A string representation of the HTTP request method. + :body: A object representing the body of the HTTP request. + The object type is the return value of sanitize_for_serialization(). + :param request_auth: if set, the provided settings will + override the token in the configuration. + """ + if not auth_settings: + return + + if request_auth: + self._apply_auth_params(headers, queries, resource_path, method, body, request_auth) + else: + for auth in auth_settings: + auth_setting = self.configuration.auth_settings().get(auth) + if auth_setting: + self._apply_auth_params(headers, queries, resource_path, method, body, auth_setting) + + def _apply_auth_params(self, headers, queries, resource_path, method, body, auth_setting) -> None: + """Updates the request parameters based on a single auth_setting + + :param headers: Header parameters dict to be updated. + :param queries: Query parameters tuple list to be updated. 
+ :resource_path: A string representation of the HTTP request resource path. + :method: A string representation of the HTTP request method. + :body: A object representing the body of the HTTP request. + The object type is the return value of sanitize_for_serialization(). + :param auth_setting: auth settings for the endpoint + """ + if auth_setting["in"] == "cookie": + headers["Cookie"] = auth_setting["value"] + elif auth_setting["in"] == "header": + if auth_setting["type"] != "http-signature": + headers[auth_setting["key"]] = auth_setting["value"] + elif auth_setting["in"] == "query": + queries.append((auth_setting["key"], auth_setting["value"])) + else: + raise ApiValueError("Authentication token must be in `query` or `header`") + + def __deserialize_file(self, response): + """Deserializes body to file + + Saves response body into a file in a temporary folder, + using the filename from the `Content-Disposition` header if provided. + + handle file downloading + save response body into a tmp file and return the instance + + :param response: RESTResponse. + :return: file path. + """ + fd, path = tempfile.mkstemp(dir=self.configuration.temp_folder_path) + os.close(fd) + os.remove(path) + + content_disposition = response.getheader("Content-Disposition") + if content_disposition: + m = re.search(r'filename=[\'"]?([^\'"\s]+)[\'"]?', content_disposition) + assert m is not None, "Unexpected 'content-disposition' header value" + filename = m.group(1) + path = os.path.join(os.path.dirname(path), filename) + + with open(path, "wb") as f: + f.write(response.data) + + return path + + def __deserialize_primitive(self, data, klass): + """Deserializes string to primitive type. + + :param data: str. + :param klass: class literal. + + :return: int, long, float, str, bool. + """ + try: + return klass(data) + except UnicodeEncodeError: + return str(data) + except TypeError: + return data + + def __deserialize_object(self, value): + """Return an original value. + + :return: object. 
+ """ + return value + + def __deserialize_date(self, string): + """Deserializes string to date. + + :param string: str. + :return: date. + """ + try: + return parse(string).date() + except ImportError: + return string + except ValueError: + raise rest.ApiException(status=0, reason="Failed to parse `{0}` as date object".format(string)) + + def __deserialize_datetime(self, string): + """Deserializes string to datetime. + + The string should be in iso8601 datetime format. + + :param string: str. + :return: datetime. + """ + try: + return parse(string) + except ImportError: + return string + except ValueError: + raise rest.ApiException(status=0, reason=("Failed to parse `{0}` as datetime object".format(string))) + + def __deserialize_enum(self, data, klass): + """Deserializes primitive type to enum. + + :param data: primitive type. + :param klass: class literal. + :return: enum value. + """ + try: + return klass(data) + except ValueError: + raise rest.ApiException(status=0, reason=("Failed to parse `{0}` as `{1}`".format(data, klass))) + + def __deserialize_model(self, data, klass): + """Deserializes list or dict to model. + + :param data: dict, list. + :param klass: class literal. + :return: model object. 
+ """ + + return klass.from_dict(data) diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_response.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_response.py new file mode 100644 index 0000000..1ce1372 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_response.py @@ -0,0 +1,20 @@ +"""API response object.""" + +from __future__ import annotations +from typing import Optional, Generic, Mapping, TypeVar +from pydantic import Field, StrictInt, StrictBytes, BaseModel + +T = TypeVar("T") + + +class ApiResponse(BaseModel, Generic[T]): + """ + API response object + """ + + status_code: StrictInt = Field(description="HTTP status code") + headers: Optional[Mapping[str, str]] = Field(None, description="HTTP headers") + data: T = Field(description="Deserialized data given the data type") + raw_data: StrictBytes = Field(description="Raw data (HTTP response body)") + + model_config = {"arbitrary_types_allowed": True} diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/configuration.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/configuration.py new file mode 100644 index 0000000..2e80369 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/configuration.py @@ -0,0 +1,460 @@ +# coding: utf-8 + +""" +extractor-api-lib + +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. 
+""" # noqa: E501 + + +import copy +import logging +from logging import FileHandler +import multiprocessing +import sys +from typing import Optional +import urllib3 + +import http.client as httplib + +JSON_SCHEMA_VALIDATION_KEYWORDS = { + "multipleOf", + "maximum", + "exclusiveMaximum", + "minimum", + "exclusiveMinimum", + "maxLength", + "minLength", + "pattern", + "maxItems", + "minItems", +} + + +class Configuration: + """This class contains various settings of the API client. + + :param host: Base url. + :param ignore_operation_servers + Boolean to ignore operation servers for the API client. + Config will use `host` as the base url regardless of the operation servers. + :param api_key: Dict to store API key(s). + Each entry in the dict specifies an API key. + The dict key is the name of the security scheme in the OAS specification. + The dict value is the API key secret. + :param api_key_prefix: Dict to store API prefix (e.g. Bearer). + The dict key is the name of the security scheme in the OAS specification. + The dict value is an API key prefix when generating the auth data. + :param username: Username for HTTP basic authentication. + :param password: Password for HTTP basic authentication. + :param access_token: Access token. + :param server_index: Index to servers configuration. + :param server_variables: Mapping with string values to replace variables in + templated server configuration. The validation of enums is performed for + variables with defined enum values before. + :param server_operation_index: Mapping from operation ID to an index to server + configuration. + :param server_operation_variables: Mapping from operation ID to a mapping with + string values to replace variables in templated server configuration. + The validation of enums is performed for variables with defined enum + values before. + :param ssl_ca_cert: str - the path to a file of concatenated CA certificates + in PEM format. + :param retries: Number of retries for API requests. 
+ + """ + + _default = None + + def __init__( + self, + host=None, + api_key=None, + api_key_prefix=None, + username=None, + password=None, + access_token=None, + server_index=None, + server_variables=None, + server_operation_index=None, + server_operation_variables=None, + ignore_operation_servers=False, + ssl_ca_cert=None, + retries=None, + *, + debug: Optional[bool] = None + ) -> None: + """Constructor""" + self._base_path = "http://localhost" if host is None else host + """Default Base url + """ + self.server_index = 0 if server_index is None and host is None else server_index + self.server_operation_index = server_operation_index or {} + """Default server index + """ + self.server_variables = server_variables or {} + self.server_operation_variables = server_operation_variables or {} + """Default server variables + """ + self.ignore_operation_servers = ignore_operation_servers + """Ignore operation servers + """ + self.temp_folder_path = None + """Temp file folder for downloading files + """ + # Authentication Settings + self.api_key = {} + if api_key: + self.api_key = api_key + """dict to store API key(s) + """ + self.api_key_prefix = {} + if api_key_prefix: + self.api_key_prefix = api_key_prefix + """dict to store API prefix (e.g. 
Bearer) + """ + self.refresh_api_key_hook = None + """function hook to refresh API key if expired + """ + self.username = username + """Username for HTTP basic authentication + """ + self.password = password + """Password for HTTP basic authentication + """ + self.access_token = access_token + """Access token + """ + self.logger = {} + """Logging Settings + """ + self.logger["package_logger"] = logging.getLogger("admin_api_lib.extractor_api_client.openapi_client") + self.logger["urllib3_logger"] = logging.getLogger("urllib3") + self.logger_format = "%(asctime)s %(levelname)s %(message)s" + """Log format + """ + self.logger_stream_handler = None + """Log stream handler + """ + self.logger_file_handler: Optional[FileHandler] = None + """Log file handler + """ + self.logger_file = None + """Debug file location + """ + if debug is not None: + self.debug = debug + else: + self.__debug = False + """Debug switch + """ + + self.verify_ssl = True + """SSL/TLS verification + Set this to false to skip verifying SSL certificate when calling API + from https server. + """ + self.ssl_ca_cert = ssl_ca_cert + """Set this to customize the certificate file to verify the peer. + """ + self.cert_file = None + """client certificate file + """ + self.key_file = None + """client key file + """ + self.assert_hostname = None + """Set this to True/False to enable/disable SSL hostname verification. + """ + self.tls_server_name = None + """SSL/TLS Server Name Indication (SNI) + Set this to the SNI value expected by the server. + """ + + self.connection_pool_maxsize = multiprocessing.cpu_count() * 5 + """urllib3 connection pool's maximum number of connections saved + per pool. urllib3 uses 1 connection as default value, but this is + not the best value when you are making a lot of possibly parallel + requests to the same host, which is often the case here. + cpu_count * 5 is used as default value to increase performance. 
+ """ + + self.proxy: Optional[str] = None + """Proxy URL + """ + self.proxy_headers = None + """Proxy headers + """ + self.safe_chars_for_path_param = "" + """Safe chars for path_param + """ + self.retries = retries + """Adding retries to override urllib3 default value 3 + """ + # Enable client side validation + self.client_side_validation = True + + self.socket_options = None + """Options to pass down to the underlying urllib3 socket + """ + + self.datetime_format = "%Y-%m-%dT%H:%M:%S.%f%z" + """datetime format + """ + + self.date_format = "%Y-%m-%d" + """date format + """ + + def __deepcopy__(self, memo): + cls = self.__class__ + result = cls.__new__(cls) + memo[id(self)] = result + for k, v in self.__dict__.items(): + if k not in ("logger", "logger_file_handler"): + setattr(result, k, copy.deepcopy(v, memo)) + # shallow copy of loggers + result.logger = copy.copy(self.logger) + # use setters to configure loggers + result.logger_file = self.logger_file + result.debug = self.debug + return result + + def __setattr__(self, name, value): + object.__setattr__(self, name, value) + + @classmethod + def set_default(cls, default): + """Set default instance of configuration. + + It stores default configuration, which can be + returned by get_default_copy method. + + :param default: object of Configuration + """ + cls._default = default + + @classmethod + def get_default_copy(cls): + """Deprecated. Please use `get_default` instead. + + Deprecated. Please use `get_default` instead. + + :return: The configuration object. + """ + return cls.get_default() + + @classmethod + def get_default(cls): + """Return the default configuration. + + This method returns newly created, based on default constructor, + object of Configuration class or returns a copy of default + configuration. + + :return: The configuration object. + """ + if cls._default is None: + cls._default = Configuration() + return cls._default + + @property + def logger_file(self): + """The logger file. 
+ + If the logger_file is None, then add stream handler and remove file + handler. Otherwise, add file handler and remove stream handler. + + :param value: The logger_file path. + :type: str + """ + return self.__logger_file + + @logger_file.setter + def logger_file(self, value): + """The logger file. + + If the logger_file is None, then add stream handler and remove file + handler. Otherwise, add file handler and remove stream handler. + + :param value: The logger_file path. + :type: str + """ + self.__logger_file = value + if self.__logger_file: + # If set logging file, + # then add file handler and remove stream handler. + self.logger_file_handler = logging.FileHandler(self.__logger_file) + self.logger_file_handler.setFormatter(self.logger_formatter) + for _, logger in self.logger.items(): + logger.addHandler(self.logger_file_handler) + + @property + def debug(self): + """Debug status + + :param value: The debug status, True or False. + :type: bool + """ + return self.__debug + + @debug.setter + def debug(self, value): + """Debug status + + :param value: The debug status, True or False. + :type: bool + """ + self.__debug = value + if self.__debug: + # if debug status is True, turn on debug logging + for _, logger in self.logger.items(): + logger.setLevel(logging.DEBUG) + # turn on httplib debug + httplib.HTTPConnection.debuglevel = 1 + else: + # if debug status is False, turn off debug logging, + # setting log level to default `logging.WARNING` + for _, logger in self.logger.items(): + logger.setLevel(logging.WARNING) + # turn off httplib debug + httplib.HTTPConnection.debuglevel = 0 + + @property + def logger_format(self): + """The logger format. + + The logger_formatter will be updated when sets logger_format. + + :param value: The format string. + :type: str + """ + return self.__logger_format + + @logger_format.setter + def logger_format(self, value): + """The logger format. + + The logger_formatter will be updated when sets logger_format. 
+ + :param value: The format string. + :type: str + """ + self.__logger_format = value + self.logger_formatter = logging.Formatter(self.__logger_format) + + def get_api_key_with_prefix(self, identifier, alias=None): + """Gets API key (with prefix if set). + + :param identifier: The identifier of apiKey. + :param alias: The alternative identifier of apiKey. + :return: The token for api key authentication. + """ + if self.refresh_api_key_hook is not None: + self.refresh_api_key_hook(self) + key = self.api_key.get(identifier, self.api_key.get(alias) if alias is not None else None) + if key: + prefix = self.api_key_prefix.get(identifier) + if prefix: + return "%s %s" % (prefix, key) + else: + return key + + def get_basic_auth_token(self): + """Gets HTTP basic authentication header (string). + + :return: The token for basic HTTP authentication. + """ + username = "" + if self.username is not None: + username = self.username + password = "" + if self.password is not None: + password = self.password + return urllib3.util.make_headers(basic_auth=username + ":" + password).get("authorization") + + def auth_settings(self): + """Gets Auth Settings dict for api client. + + :return: The Auth Settings information dict. + """ + auth = {} + return auth + + def to_debug_report(self): + """Gets the essential information for debugging. + + :return: The report for debugging. 
+ """ + return ( + "Python SDK Debug Report:\n" + "OS: {env}\n" + "Python Version: {pyversion}\n" + "Version of the API: 1.0.0\n" + "SDK Package Version: 1.0.0".format(env=sys.platform, pyversion=sys.version) + ) + + def get_host_settings(self): + """Gets an array of host settings + + :return: An array of host settings + """ + return [ + { + "url": "", + "description": "No description provided", + } + ] + + def get_host_from_settings(self, index, variables=None, servers=None): + """Gets host URL based on the index and variables + :param index: array index of the host settings + :param variables: hash of variable and the corresponding value + :param servers: an array of host settings or None + :return: URL based on host settings + """ + if index is None: + return self._base_path + + variables = {} if variables is None else variables + servers = self.get_host_settings() if servers is None else servers + + try: + server = servers[index] + except IndexError: + raise ValueError( + "Invalid index {0} when selecting the host settings. " + "Must be less than {1}".format(index, len(servers)) + ) + + url = server["url"] + + # go through variables and replace placeholders + for variable_name, variable in server.get("variables", {}).items(): + used_value = variables.get(variable_name, variable["default_value"]) + + if "enum_values" in variable and used_value not in variable["enum_values"]: + raise ValueError( + "The variable `{0}` in the host URL has invalid value " + "{1}. 
Must be {2}.".format(variable_name, variables[variable_name], variable["enum_values"]) + ) + + url = url.replace("{" + variable_name + "}", used_value) + + return url + + @property + def host(self): + """Return generated host.""" + return self.get_host_from_settings(self.server_index, variables=self.server_variables) + + @host.setter + def host(self, value): + """Fix base path.""" + self._base_path = value + self.server_index = None diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/exceptions.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/exceptions.py new file mode 100644 index 0000000..5dbd4b0 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/exceptions.py @@ -0,0 +1,197 @@ +# coding: utf-8 + +""" +extractor-api-lib + +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. 
+""" # noqa: E501 + +from typing import Any, Optional +from typing_extensions import Self + + +class OpenApiException(Exception): + """The base exception class for all OpenAPIExceptions""" + + +class ApiTypeError(OpenApiException, TypeError): + def __init__(self, msg, path_to_item=None, valid_classes=None, key_type=None) -> None: + """Raises an exception for TypeErrors + + Args: + msg (str): the exception message + + Keyword Args: + path_to_item (list): a list of keys an indices to get to the + current_item + None if unset + valid_classes (tuple): the primitive classes that current item + should be an instance of + None if unset + key_type (bool): False if our value is a value in a dict + True if it is a key in a dict + False if our item is an item in a list + None if unset + """ + self.path_to_item = path_to_item + self.valid_classes = valid_classes + self.key_type = key_type + full_msg = msg + if path_to_item: + full_msg = "{0} at {1}".format(msg, render_path(path_to_item)) + super(ApiTypeError, self).__init__(full_msg) + + +class ApiValueError(OpenApiException, ValueError): + def __init__(self, msg, path_to_item=None) -> None: + """ + Args: + msg (str): the exception message + + Keyword Args: + path_to_item (list) the path to the exception in the + received_data dict. None if unset + """ + + self.path_to_item = path_to_item + full_msg = msg + if path_to_item: + full_msg = "{0} at {1}".format(msg, render_path(path_to_item)) + super(ApiValueError, self).__init__(full_msg) + + +class ApiAttributeError(OpenApiException, AttributeError): + def __init__(self, msg, path_to_item=None) -> None: + """ + Raised when an attribute reference or assignment fails. 
+ + Args: + msg (str): the exception message + + Keyword Args: + path_to_item (None/list) the path to the exception in the + received_data dict + """ + self.path_to_item = path_to_item + full_msg = msg + if path_to_item: + full_msg = "{0} at {1}".format(msg, render_path(path_to_item)) + super(ApiAttributeError, self).__init__(full_msg) + + +class ApiKeyError(OpenApiException, KeyError): + def __init__(self, msg, path_to_item=None) -> None: + """ + Args: + msg (str): the exception message + + Keyword Args: + path_to_item (None/list) the path to the exception in the + received_data dict + """ + self.path_to_item = path_to_item + full_msg = msg + if path_to_item: + full_msg = "{0} at {1}".format(msg, render_path(path_to_item)) + super(ApiKeyError, self).__init__(full_msg) + + +class ApiException(OpenApiException): + + def __init__( + self, + status=None, + reason=None, + http_resp=None, + *, + body: Optional[str] = None, + data: Optional[Any] = None, + ) -> None: + self.status = status + self.reason = reason + self.body = body + self.data = data + self.headers = None + + if http_resp: + if self.status is None: + self.status = http_resp.status + if self.reason is None: + self.reason = http_resp.reason + if self.body is None: + try: + self.body = http_resp.data.decode("utf-8") + except Exception: + pass + self.headers = http_resp.getheaders() + + @classmethod + def from_response( + cls, + *, + http_resp, + body: Optional[str], + data: Optional[Any], + ) -> Self: + if http_resp.status == 400: + raise BadRequestException(http_resp=http_resp, body=body, data=data) + + if http_resp.status == 401: + raise UnauthorizedException(http_resp=http_resp, body=body, data=data) + + if http_resp.status == 403: + raise ForbiddenException(http_resp=http_resp, body=body, data=data) + + if http_resp.status == 404: + raise NotFoundException(http_resp=http_resp, body=body, data=data) + + if 500 <= http_resp.status <= 599: + raise ServiceException(http_resp=http_resp, body=body, data=data) + 
raise ApiException(http_resp=http_resp, body=body, data=data) + + def __str__(self): + """Custom error messages for exception""" + error_message = "({0})\n" "Reason: {1}\n".format(self.status, self.reason) + if self.headers: + error_message += "HTTP response headers: {0}\n".format(self.headers) + + if self.data or self.body: + error_message += "HTTP response body: {0}\n".format(self.data or self.body) + + return error_message + + +class BadRequestException(ApiException): + pass + + +class NotFoundException(ApiException): + pass + + +class UnauthorizedException(ApiException): + pass + + +class ForbiddenException(ApiException): + pass + + +class ServiceException(ApiException): + pass + + +def render_path(path_to_item): + """Returns a string representation of a path""" + result = "" + for pth in path_to_item: + if isinstance(pth, int): + result += "[{0}]".format(pth) + else: + result += "['{0}']".format(pth) + return result diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/__init__.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/__init__.py new file mode 100644 index 0000000..ad02f00 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/__init__.py @@ -0,0 +1,21 @@ +# coding: utf-8 + +# flake8: noqa +""" +extractor-api-lib + +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. 
+""" # noqa: E501 + + +# import models into model package +from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ContentType +from admin_api_lib.extractor_api_client.openapi_client.models.extraction_parameters import ExtractionParameters +from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ExtractionRequest +from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import InformationPiece +from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/models/content_type.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/content_type.py similarity index 100% rename from admin-api-lib/src/admin_api_lib/extractor_api_client/models/content_type.py rename to admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/content_type.py diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_parameters.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_parameters.py new file mode 100644 index 0000000..da4408d --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_parameters.py @@ -0,0 +1,103 @@ +# coding: utf-8 + +""" +extractor-api-lib + +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. 
+""" # noqa: E501 + + +from __future__ import annotations +import pprint +import re # noqa: F401 +import json + +from pydantic import BaseModel, ConfigDict, Field, StrictStr +from typing import Any, ClassVar, Dict, List, Optional +from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair +from typing import Optional, Set +from typing_extensions import Self + + +class ExtractionParameters(BaseModel): + """ """ # noqa: E501 + + document_name: StrictStr = Field( + description="The name that will be used to store the confluence db in the key value db and the vectordatabase (metadata.document)." + ) + confluence_kwargs: Optional[List[KeyValuePair]] = Field(default=None, description="Kwargs for the extractor") + type: StrictStr = Field(description="Extractortype") + __properties: ClassVar[List[str]] = ["document_name", "confluence_kwargs", "type"] + + model_config = ConfigDict( + populate_by_name=True, + validate_assignment=True, + protected_namespaces=(), + ) + + def to_str(self) -> str: + """Returns the string representation of the model using alias""" + return pprint.pformat(self.model_dump(by_alias=True)) + + def to_json(self) -> str: + """Returns the JSON representation of the model using alias""" + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) + + @classmethod + def from_json(cls, json_str: str) -> Optional[Self]: + """Create an instance of ExtractionParameters from a JSON string""" + return cls.from_dict(json.loads(json_str)) + + def to_dict(self) -> Dict[str, Any]: + """Return the dictionary representation of the model using alias. + + This has the following differences from calling pydantic's + `self.model_dump(by_alias=True)`: + + * `None` is only added to the output dict for nullable fields that + were set at model initialization. Other fields with value `None` + are ignored. 
+ """ + excluded_fields: Set[str] = set([]) + + _dict = self.model_dump( + by_alias=True, + exclude=excluded_fields, + exclude_none=True, + ) + # override the default output from pydantic by calling `to_dict()` of each item in confluence_kwargs (list) + _items = [] + if self.confluence_kwargs: + for _item_confluence_kwargs in self.confluence_kwargs: + if _item_confluence_kwargs: + _items.append(_item_confluence_kwargs.to_dict()) + _dict["confluence_kwargs"] = _items + return _dict + + @classmethod + def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]: + """Create an instance of ExtractionParameters from a dict""" + if obj is None: + return None + + if not isinstance(obj, dict): + return cls.model_validate(obj) + + _obj = cls.model_validate( + { + "document_name": obj.get("document_name"), + "confluence_kwargs": ( + [KeyValuePair.from_dict(_item) for _item in obj["confluence_kwargs"]] + if obj.get("confluence_kwargs") is not None + else None + ), + "type": obj.get("type"), + } + ) + return _obj diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py new file mode 100644 index 0000000..8bcfb3c --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py @@ -0,0 +1,82 @@ +# coding: utf-8 + +""" +extractor-api-lib + +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. 
+""" # noqa: E501 + + +from __future__ import annotations +import pprint +import re # noqa: F401 +import json + +from pydantic import BaseModel, ConfigDict, StrictStr +from typing import Any, ClassVar, Dict, List +from typing import Optional, Set +from typing_extensions import Self + + +class ExtractionRequest(BaseModel): + """ """ # noqa: E501 + + path_on_s3: StrictStr + document_name: StrictStr + __properties: ClassVar[List[str]] = ["path_on_s3", "document_name"] + + model_config = ConfigDict( + populate_by_name=True, + validate_assignment=True, + protected_namespaces=(), + ) + + def to_str(self) -> str: + """Returns the string representation of the model using alias""" + return pprint.pformat(self.model_dump(by_alias=True)) + + def to_json(self) -> str: + """Returns the JSON representation of the model using alias""" + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) + + @classmethod + def from_json(cls, json_str: str) -> Optional[Self]: + """Create an instance of ExtractionRequest from a JSON string""" + return cls.from_dict(json.loads(json_str)) + + def to_dict(self) -> Dict[str, Any]: + """Return the dictionary representation of the model using alias. + + This has the following differences from calling pydantic's + `self.model_dump(by_alias=True)`: + + * `None` is only added to the output dict for nullable fields that + were set at model initialization. Other fields with value `None` + are ignored. 
+ """ + excluded_fields: Set[str] = set([]) + + _dict = self.model_dump( + by_alias=True, + exclude=excluded_fields, + exclude_none=True, + ) + return _dict + + @classmethod + def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]: + """Create an instance of ExtractionRequest from a dict""" + if obj is None: + return None + + if not isinstance(obj, dict): + return cls.model_validate(obj) + + _obj = cls.model_validate({"path_on_s3": obj.get("path_on_s3"), "document_name": obj.get("document_name")}) + return _obj diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/models/information_piece.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/information_piece.py similarity index 94% rename from admin-api-lib/src/admin_api_lib/extractor_api_client/models/information_piece.py rename to admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/information_piece.py index 99c3ee2..a428183 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/models/information_piece.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/information_piece.py @@ -19,8 +19,8 @@ from pydantic import BaseModel, ConfigDict, StrictStr from typing import Any, ClassVar, Dict, List -from admin_api_lib.extractor_api_client.models.content_type import ContentType -from admin_api_lib.extractor_api_client.models.key_value_pair import KeyValuePair +from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ContentType +from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair from typing import Optional, Set from typing_extensions import Self diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/models/key_value_pair.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/key_value_pair.py similarity index 100% rename from admin-api-lib/src/admin_api_lib/extractor_api_client/models/key_value_pair.py 
rename to admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/key_value_pair.py diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/rest.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/rest.py new file mode 100644 index 0000000..60fc660 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/rest.py @@ -0,0 +1,209 @@ +# coding: utf-8 + +""" +extractor-api-lib + +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. +""" # noqa: E501 + + +import io +import json +import re +import ssl + +import urllib3 + +from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiException, ApiValueError + +SUPPORTED_SOCKS_PROXIES = {"socks5", "socks5h", "socks4", "socks4a"} +RESTResponseType = urllib3.HTTPResponse + + +def is_socks_proxy_url(url): + if url is None: + return False + split_section = url.split("://") + if len(split_section) < 2: + return False + else: + return split_section[0].lower() in SUPPORTED_SOCKS_PROXIES + + +class RESTResponse(io.IOBase): + + def __init__(self, resp) -> None: + self.response = resp + self.status = resp.status + self.reason = resp.reason + self.data = None + + def read(self): + if self.data is None: + self.data = self.response.data + return self.data + + def getheaders(self): + """Returns a dictionary of the response headers.""" + return self.response.headers + + def getheader(self, name, default=None): + """Returns a given response header.""" + return self.response.headers.get(name, default) + + +class RESTClientObject: + + def __init__(self, configuration) -> None: + # urllib3.PoolManager will pass all kw parameters to connectionpool + # 
https://github.com/shazow/urllib3/blob/f9409436f83aeb79fbaf090181cd81b784f1b8ce/urllib3/poolmanager.py#L75 # noqa: E501 + # https://github.com/shazow/urllib3/blob/f9409436f83aeb79fbaf090181cd81b784f1b8ce/urllib3/connectionpool.py#L680 # noqa: E501 + # Custom SSL certificates and client certificates: http://urllib3.readthedocs.io/en/latest/advanced-usage.html # noqa: E501 + + # cert_reqs + if configuration.verify_ssl: + cert_reqs = ssl.CERT_REQUIRED + else: + cert_reqs = ssl.CERT_NONE + + pool_args = { + "cert_reqs": cert_reqs, + "ca_certs": configuration.ssl_ca_cert, + "cert_file": configuration.cert_file, + "key_file": configuration.key_file, + } + if configuration.assert_hostname is not None: + pool_args["assert_hostname"] = configuration.assert_hostname + + if configuration.retries is not None: + pool_args["retries"] = configuration.retries + + if configuration.tls_server_name: + pool_args["server_hostname"] = configuration.tls_server_name + + if configuration.socket_options is not None: + pool_args["socket_options"] = configuration.socket_options + + if configuration.connection_pool_maxsize is not None: + pool_args["maxsize"] = configuration.connection_pool_maxsize + + # https pool manager + self.pool_manager: urllib3.PoolManager + + if configuration.proxy: + if is_socks_proxy_url(configuration.proxy): + from urllib3.contrib.socks import SOCKSProxyManager + + pool_args["proxy_url"] = configuration.proxy + pool_args["headers"] = configuration.proxy_headers + self.pool_manager = SOCKSProxyManager(**pool_args) + else: + pool_args["proxy_url"] = configuration.proxy + pool_args["proxy_headers"] = configuration.proxy_headers + self.pool_manager = urllib3.ProxyManager(**pool_args) + else: + self.pool_manager = urllib3.PoolManager(**pool_args) + + def request(self, method, url, headers=None, body=None, post_params=None, _request_timeout=None): + """Perform requests. 
+ + :param method: http request method + :param url: http request url + :param headers: http request headers + :param body: request json body, for `application/json` + :param post_params: request post parameters, + `application/x-www-form-urlencoded` + and `multipart/form-data` + :param _request_timeout: timeout setting for this request. If one + number provided, it will be total request + timeout. It can also be a pair (tuple) of + (connection, read) timeouts. + """ + method = method.upper() + assert method in ["GET", "HEAD", "DELETE", "POST", "PUT", "PATCH", "OPTIONS"] + + if post_params and body: + raise ApiValueError("body parameter cannot be used with post_params parameter.") + + post_params = post_params or {} + headers = headers or {} + + timeout = None + if _request_timeout: + if isinstance(_request_timeout, (int, float)): + timeout = urllib3.Timeout(total=_request_timeout) + elif isinstance(_request_timeout, tuple) and len(_request_timeout) == 2: + timeout = urllib3.Timeout(connect=_request_timeout[0], read=_request_timeout[1]) + + try: + # For `POST`, `PUT`, `PATCH`, `OPTIONS`, `DELETE` + if method in ["POST", "PUT", "PATCH", "OPTIONS", "DELETE"]: + + # no content type provided or payload is json + content_type = headers.get("Content-Type") + if not content_type or re.search("json", content_type, re.IGNORECASE): + request_body = None + if body is not None: + request_body = json.dumps(body) + r = self.pool_manager.request( + method, url, body=request_body, timeout=timeout, headers=headers, preload_content=False + ) + elif content_type == "application/x-www-form-urlencoded": + r = self.pool_manager.request( + method, + url, + fields=post_params, + encode_multipart=False, + timeout=timeout, + headers=headers, + preload_content=False, + ) + elif content_type == "multipart/form-data": + # must del headers['Content-Type'], or the correct + # Content-Type which generated by urllib3 will be + # overwritten. 
+ del headers["Content-Type"] + # Ensures that dict objects are serialized + post_params = [(a, json.dumps(b)) if isinstance(b, dict) else (a, b) for a, b in post_params] + r = self.pool_manager.request( + method, + url, + fields=post_params, + encode_multipart=True, + timeout=timeout, + headers=headers, + preload_content=False, + ) + # Pass a `string` parameter directly in the body to support + # other content types than JSON when `body` argument is + # provided in serialized form. + elif isinstance(body, str) or isinstance(body, bytes): + r = self.pool_manager.request( + method, url, body=body, timeout=timeout, headers=headers, preload_content=False + ) + elif headers["Content-Type"].startswith("text/") and isinstance(body, bool): + request_body = "true" if body else "false" + r = self.pool_manager.request( + method, url, body=request_body, preload_content=False, timeout=timeout, headers=headers + ) + else: + # Cannot generate the request from given parameters + msg = """Cannot prepare a request message for provided + arguments. 
Please check that your arguments match + declared content type.""" + raise ApiException(status=0, reason=msg) + # For `GET`, `HEAD` + else: + r = self.pool_manager.request( + method, url, fields={}, timeout=timeout, headers=headers, preload_content=False + ) + except urllib3.exceptions.SSLError as e: + msg = "\n".join([type(e).__name__, str(e)]) + raise ApiException(status=0, reason=msg) + + return RESTResponse(r) diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/__init__.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_content_type.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_content_type.py new file mode 100644 index 0000000..5a78d9b --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_content_type.py @@ -0,0 +1,35 @@ +# coding: utf-8 + +""" +extractor-api-lib + +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. 
+""" # noqa: E501 + + +import unittest + +from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ContentType + + +class TestContentType(unittest.TestCase): + """ContentType unit test stubs""" + + def setUp(self): + pass + + def tearDown(self): + pass + + def testContentType(self): + """Test ContentType""" + # inst = ContentType() + + +if __name__ == "__main__": + unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_parameters.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_parameters.py new file mode 100644 index 0000000..9504ab4 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_parameters.py @@ -0,0 +1,59 @@ +# coding: utf-8 + +""" +extractor-api-lib + +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. 
+""" # noqa: E501 + + +import unittest + +from admin_api_lib.extractor_api_client.openapi_client.models.extraction_parameters import ExtractionParameters + + +class TestExtractionParameters(unittest.TestCase): + """ExtractionParameters unit test stubs""" + + def setUp(self): + pass + + def tearDown(self): + pass + + def make_instance(self, include_optional) -> ExtractionParameters: + """Test ExtractionParameters + include_optional is a boolean, when False only required + params are included, when True both required and + optional params are included""" + # uncomment below to create an instance of `ExtractionParameters` + """ + model = ExtractionParameters() + if include_optional: + return ExtractionParameters( + document_name = '', + confluence_kwargs = [ + {"value":"value","key":"key"} + ], + type = '' + ) + else: + return ExtractionParameters( + document_name = '', + type = '', + ) + """ + + def testExtractionParameters(self): + """Test ExtractionParameters""" + # inst_req_only = self.make_instance(include_optional=False) + # inst_req_and_optional = self.make_instance(include_optional=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_request.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_request.py new file mode 100644 index 0000000..1401561 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_request.py @@ -0,0 +1,56 @@ +# coding: utf-8 + +""" +extractor-api-lib + +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. 
+""" # noqa: E501 + + +import unittest + +from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ExtractionRequest + + +class TestExtractionRequest(unittest.TestCase): + """ExtractionRequest unit test stubs""" + + def setUp(self): + pass + + def tearDown(self): + pass + + def make_instance(self, include_optional) -> ExtractionRequest: + """Test ExtractionRequest + include_optional is a boolean, when False only required + params are included, when True both required and + optional params are included""" + # uncomment below to create an instance of `ExtractionRequest` + """ + model = ExtractionRequest() + if include_optional: + return ExtractionRequest( + path_on_s3 = '', + document_name = '' + ) + else: + return ExtractionRequest( + path_on_s3 = '', + document_name = '', + ) + """ + + def testExtractionRequest(self): + """Test ExtractionRequest""" + # inst_req_only = self.make_instance(include_optional=False) + # inst_req_and_optional = self.make_instance(include_optional=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extractor_api.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extractor_api.py new file mode 100644 index 0000000..975a7bf --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extractor_api.py @@ -0,0 +1,39 @@ +# coding: utf-8 + +""" +extractor-api-lib + +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. 
+""" # noqa: E501 + + +import unittest + +from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi + + +class TestExtractorApi(unittest.TestCase): + """ExtractorApi unit test stubs""" + + def setUp(self) -> None: + self.api = ExtractorApi() + + def tearDown(self) -> None: + pass + + def test_extract_from_file_post(self) -> None: + """Test case for extract_from_file_post""" + pass + + def test_extract_from_source(self) -> None: + """Test case for extract_from_source""" + pass + + +if __name__ == "__main__": + unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_information_piece.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_information_piece.py new file mode 100644 index 0000000..479c858 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_information_piece.py @@ -0,0 +1,62 @@ +# coding: utf-8 + +""" +extractor-api-lib + +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. 
+""" # noqa: E501 + + +import unittest + +from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import InformationPiece + + +class TestInformationPiece(unittest.TestCase): + """InformationPiece unit test stubs""" + + def setUp(self): + pass + + def tearDown(self): + pass + + def make_instance(self, include_optional) -> InformationPiece: + """Test InformationPiece + include_optional is a boolean, when False only required + params are included, when True both required and + optional params are included""" + # uncomment below to create an instance of `InformationPiece` + """ + model = InformationPiece() + if include_optional: + return InformationPiece( + metadata = [ + {"value":"value","key":"key"} + ], + page_content = '', + type = 'IMAGE' + ) + else: + return InformationPiece( + metadata = [ + {"value":"value","key":"key"} + ], + page_content = '', + type = 'IMAGE', + ) + """ + + def testInformationPiece(self): + """Test InformationPiece""" + # inst_req_only = self.make_instance(include_optional=False) + # inst_req_and_optional = self.make_instance(include_optional=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_key_value_pair.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_key_value_pair.py new file mode 100644 index 0000000..0ddc864 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_key_value_pair.py @@ -0,0 +1,54 @@ +# coding: utf-8 + +""" +extractor-api-lib + +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. 
+""" # noqa: E501 + + +import unittest + +from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair + + +class TestKeyValuePair(unittest.TestCase): + """KeyValuePair unit test stubs""" + + def setUp(self): + pass + + def tearDown(self): + pass + + def make_instance(self, include_optional) -> KeyValuePair: + """Test KeyValuePair + include_optional is a boolean, when False only required + params are included, when True both required and + optional params are included""" + # uncomment below to create an instance of `KeyValuePair` + """ + model = KeyValuePair() + if include_optional: + return KeyValuePair( + key = None, + value = None + ) + else: + return KeyValuePair( + ) + """ + + def testKeyValuePair(self): + """Test KeyValuePair""" + # inst_req_only = self.make_instance(include_optional=False) + # inst_req_and_optional = self.make_instance(include_optional=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/impl/admin_api.py b/admin-api-lib/src/admin_api_lib/impl/admin_api.py index 3adbae1..2a0f678 100644 --- a/admin-api-lib/src/admin_api_lib/impl/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/impl/admin_api.py @@ -99,7 +99,6 @@ async def upload_source( ) -> None: await source_uploader.upload_source(str(request.base_url), type, name, kwargs) - @inject async def upload_file( self, @@ -109,7 +108,6 @@ async def upload_file( ) -> None: await file_uploader.upload_source(str(request.base_url), file) - @inject async def document_reference_id_get( self, diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py index 0dd5b4f..89d432c 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py @@ -56,8 +56,7 @@ async def upload_source( file: UploadFile, ) -> None: 
self._background_threads = [t for t in self._background_threads if t.is_alive()] - - + try: content = await file.read() file.filename = sanitize_document_name(file.filename) @@ -65,10 +64,10 @@ async def upload_source( # TODO: check if document already in processing state self._key_value_store.upsert( source_name, Status.PROCESSING - ) # TODO: change to pipeline with timeout to error status + ) # TODO: change to pipeline with timeout to error status s3_path = await self._asave_new_document(content, file.filename, source_name) thread = Thread( - target=lambda: run(self._handle_source_upload(s3_path,source_name, file.filename, base_url)) + target=lambda: run(self._handle_source_upload(s3_path, source_name, file.filename, base_url)) ) thread.start() self._background_threads.append(thread) @@ -82,10 +81,10 @@ async def upload_source( async def _handle_source_upload( self, - s3_path:Path, + s3_path: Path, source_name: str, - file_name:str, - base_url: str, + file_name: str, + base_url: str, ): try: information_pieces = self._extractor_api.extract(s3_path, source_name) @@ -98,11 +97,11 @@ async def _handle_source_upload( chunked_documents = self._chunker.chunk(documents) enhanced_documents = await self._information_enhancer.ainvoke(chunked_documents) - self._add_file_url(file_name,base_url,enhanced_documents) + self._add_file_url(file_name, base_url, enhanced_documents) rag_information_pieces = [ self._information_mapper.document2rag_information_piece(doc) for doc in enhanced_documents - ] + ] # Replace old document try: await self._document_deleter.adelete_document(source_name) @@ -116,9 +115,7 @@ async def _handle_source_upload( self._key_value_store.upsert(source_name, Status.ERROR) logger.error("Error while uploading %s = %s", source_name, str(e)) - def _add_file_url( - self, file: UploadFile, base_url: str, chunked_documents: list[Document] - ): + def _add_file_url(self, file: UploadFile, base_url: str, chunked_documents: list[Document]): document_url = 
f"{base_url.rstrip('/')}/document_reference/{urllib.parse.quote_plus(file.name)}" for idx, chunk in enumerate(chunked_documents): if chunk.metadata["id"] in chunk.metadata["related"]: @@ -135,8 +132,8 @@ async def _asave_new_document( self, file_content: bytes, filename: str, - source_name:str, - )->Path: + source_name: str, + ) -> Path: try: with tempfile.TemporaryDirectory() as temp_dir: temp_file_path = Path(temp_dir) / filename diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py index 5ef6a72..81df19c 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py @@ -61,10 +61,8 @@ async def upload_source( # TODO: check if document already in processing state self._key_value_store.upsert( source_name, Status.PROCESSING - ) # TODO: change to pipeline with timeout to error status - thread = Thread( - target=lambda: run(self._handle_source_upload(source_name, base_url, type, name, kwargs)) - ) + ) # TODO: change to pipeline with timeout to error status + thread = Thread(target=lambda: run(self._handle_source_upload(source_name, base_url, type, name, kwargs))) thread.start() self._background_threads.append(thread) except ValueError as e: diff --git a/extractor-api-lib/openapi.yaml b/extractor-api-lib/openapi.yaml index 81ca3e2..ebfad6c 100644 --- a/extractor-api-lib/openapi.yaml +++ b/extractor-api-lib/openapi.yaml @@ -5,12 +5,12 @@ info: servers: - url: / paths: - /extract: + /extract_from_file: post: - operationId: extract + operationId: extract_from_file_post requestBody: content: - multipart/form-data: + application/json: schema: $ref: '#/components/schemas/extraction_request' required: true @@ -24,13 +24,56 @@ paths: type: array description: List of extracted information. "422": - description: Body is not a valid source. 
+ description: Body is not a valid PDF. "500": description: Something somewhere went terribly wrong. tags: - extractor + /extract_from_source: + post: + operationId: extract_from_source + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/extraction_parameters' + required: true + responses: + "200": + content: + application/json: + schema: + items: + $ref: '#/components/schemas/information_piece' + type: array + description: ok + "404": + description: not found + "422": + description: unprocessable entity + "500": + description: internal server error + tags: + - extractor components: schemas: + extraction_request: + description: "" + example: + path_on_s3: path on s3 + properties: + path_on_s3: + description: "" + title: PathOnS3 + type: string + document_name: + description: "" + type: string + required: + - document_name + - path_on_s3 + title: ExtractionRequest + type: object key_value_pair: description: "" example: @@ -81,24 +124,26 @@ components: - type title: InformationPiece type: object - extraction_request: + extraction_parameters: description: "" properties: - file: - description: "" - type: file - type: - description: "" + document_name: + description: The name that will be used to store the confluence db in the + key value db and the vectordatabase (metadata.document). 
+ title: document_name type: string - kwargs: - description: "" + confluence_kwargs: + description: Kwargs for the extractor items: $ref: '#/components/schemas/key_value_pair' + title: confluence_kwargs type: array - name: - description: "" + type: + description: Extractortype + title: type type: string required: - - name + - document_name - type + title: confluence_parameters type: object diff --git a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py index 38c9a1d..47479f0 100644 --- a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py +++ b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py @@ -1,11 +1,11 @@ # coding: utf-8 -from typing import Annotated, Dict, List # noqa: F401 +from typing import Dict, List # noqa: F401 import importlib import pkgutil from extractor_api_lib.apis.extractor_api_base import BaseExtractorApi -import extractor_api_lib.impl +import openapi_server.impl from fastapi import ( # noqa: F401 APIRouter, @@ -23,36 +23,51 @@ ) from extractor_api_lib.models.extra_models import TokenModel # noqa: F401 -from pydantic import StrictBytes, StrictStr -from fastapi import Request, Response, UploadFile -from typing import Any, List, Optional, Tuple, Union +from typing import Any, List +from extractor_api_lib.models.extraction_parameters import ExtractionParameters +from extractor_api_lib.models.extraction_request import ExtractionRequest from extractor_api_lib.models.information_piece import InformationPiece -from extractor_api_lib.models.key_value_pair import KeyValuePair router = APIRouter() -ns_pkg = extractor_api_lib.impl +ns_pkg = openapi_server.impl for _, name, _ in pkgutil.iter_modules(ns_pkg.__path__, ns_pkg.__name__ + "."): importlib.import_module(name) @router.post( - "/extract", + "/extract_from_file", responses={ 200: {"model": List[InformationPiece], "description": "List of extracted information."}, - 422: {"description": "Body is not a valid 
source."}, + 422: {"description": "Body is not a valid PDF."}, 500: {"description": "Something somewhere went terribly wrong."}, }, tags=["extractor"], response_model_by_alias=True, ) -async def extract( - type: Annotated[str, Form()], - name: Annotated[str, Form()], - file: Optional[UploadFile] = None, - kwargs: Optional[Annotated[List[KeyValuePair], Form()]] = None, +async def extract_from_file_post( + extraction_request: ExtractionRequest = Body(None, description=""), ) -> List[InformationPiece]: if not BaseExtractorApi.subclasses: raise HTTPException(status_code=500, detail="Not implemented") - return await BaseExtractorApi.subclasses[0]().extract(type, name, file, kwargs) + return await BaseExtractorApi.subclasses[0]().extract_from_file_post(extraction_request) + + +@router.post( + "/extract_from_source", + responses={ + 200: {"model": List[InformationPiece], "description": "ok"}, + 404: {"description": "not found"}, + 422: {"description": "unprocessable entity"}, + 500: {"description": "internal server error"}, + }, + tags=["extractor"], + response_model_by_alias=True, +) +async def extract_from_source( + extraction_parameters: ExtractionParameters = Body(None, description=""), +) -> List[InformationPiece]: + if not BaseExtractorApi.subclasses: + raise HTTPException(status_code=500, detail="Not implemented") + return await BaseExtractorApi.subclasses[0]().extract_from_source(extraction_parameters) diff --git a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py index f7a7cf0..b1bac98 100644 --- a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py +++ b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py @@ -2,11 +2,10 @@ from typing import ClassVar, Dict, List, Tuple # noqa: F401 -from pydantic import StrictBytes, StrictStr -from typing import Any, List, Optional, Tuple, Union -from fastapi import Request, Response, UploadFile +from typing import 
Any, List +from extractor_api_lib.models.extraction_parameters import ExtractionParameters +from extractor_api_lib.models.extraction_request import ExtractionRequest from extractor_api_lib.models.information_piece import InformationPiece -from extractor_api_lib.models.key_value_pair import KeyValuePair class BaseExtractorApi: @@ -16,10 +15,12 @@ def __init_subclass__(cls, **kwargs): super().__init_subclass__(**kwargs) BaseExtractorApi.subclasses = BaseExtractorApi.subclasses + (cls,) - async def extract( + async def extract_from_file_post( self, - type: StrictStr, - name: StrictStr, - file: Optional[UploadFile], - kwargs: Optional[List[KeyValuePair]], + extraction_request: ExtractionRequest, + ) -> List[InformationPiece]: ... + + async def extract_from_source( + self, + extraction_parameters: ExtractionParameters, ) -> List[InformationPiece]: ... diff --git a/extractor-api-lib/src/extractor_api_lib/extractors/information_file_extractor.py b/extractor-api-lib/src/extractor_api_lib/extractors/information_file_extractor.py index 553d79a..e9602d4 100644 --- a/extractor-api-lib/src/extractor_api_lib/extractors/information_file_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/extractors/information_file_extractor.py @@ -45,7 +45,7 @@ async def aextract_content(self, file_path: Path, name: str) -> list[InternalInf Path to the file the information should be extracted from. name : str Name of the document. 
- + Returns ------- list[InformationPiece] diff --git a/extractor-api-lib/src/extractor_api_lib/models/extraction_parameters.py b/extractor-api-lib/src/extractor_api_lib/models/extraction_parameters.py new file mode 100644 index 0000000..3aed2ca --- /dev/null +++ b/extractor-api-lib/src/extractor_api_lib/models/extraction_parameters.py @@ -0,0 +1,105 @@ +# coding: utf-8 + +""" +extractor-api-lib + +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. +""" # noqa: E501 + + +from __future__ import annotations +import pprint +import re # noqa: F401 +import json + + +from pydantic import BaseModel, ConfigDict, Field, StrictStr +from typing import Any, ClassVar, Dict, List, Optional +from extractor_api_lib.models.key_value_pair import KeyValuePair + +try: + from typing import Self +except ImportError: + from typing_extensions import Self + + +class ExtractionParameters(BaseModel): + """ """ # noqa: E501 + + document_name: StrictStr = Field( + description="The name that will be used to store the confluence db in the key value db and the vectordatabase (metadata.document)." 
+ ) + confluence_kwargs: Optional[List[KeyValuePair]] = Field(default=None, description="Kwargs for the extractor") + type: StrictStr = Field(description="Extractortype") + __properties: ClassVar[List[str]] = ["document_name", "confluence_kwargs", "type"] + + model_config = { + "populate_by_name": True, + "validate_assignment": True, + "protected_namespaces": (), + } + + def to_str(self) -> str: + """Returns the string representation of the model using alias""" + return pprint.pformat(self.model_dump(by_alias=True)) + + def to_json(self) -> str: + """Returns the JSON representation of the model using alias""" + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) + + @classmethod + def from_json(cls, json_str: str) -> Self: + """Create an instance of ExtractionParameters from a JSON string""" + return cls.from_dict(json.loads(json_str)) + + def to_dict(self) -> Dict[str, Any]: + """Return the dictionary representation of the model using alias. + + This has the following differences from calling pydantic's + `self.model_dump(by_alias=True)`: + + * `None` is only added to the output dict for nullable fields that + were set at model initialization. Other fields with value `None` + are ignored. 
+ """ + _dict = self.model_dump( + by_alias=True, + exclude={}, + exclude_none=True, + ) + # override the default output from pydantic by calling `to_dict()` of each item in confluence_kwargs (list) + _items = [] + if self.confluence_kwargs: + for _item in self.confluence_kwargs: + if _item: + _items.append(_item.to_dict()) + _dict["confluence_kwargs"] = _items + return _dict + + @classmethod + def from_dict(cls, obj: Dict) -> Self: + """Create an instance of ExtractionParameters from a dict""" + if obj is None: + return None + + if not isinstance(obj, dict): + return cls.model_validate(obj) + + _obj = cls.model_validate( + { + "document_name": obj.get("document_name"), + "confluence_kwargs": ( + [KeyValuePair.from_dict(_item) for _item in obj.get("confluence_kwargs")] + if obj.get("confluence_kwargs") is not None + else None + ), + "type": obj.get("type"), + } + ) + return _obj diff --git a/extractor-api-lib/src/extractor_api_lib/models/extraction_request.py b/extractor-api-lib/src/extractor_api_lib/models/extraction_request.py index 8917378..769b658 100644 --- a/extractor-api-lib/src/extractor_api_lib/models/extraction_request.py +++ b/extractor-api-lib/src/extractor_api_lib/models/extraction_request.py @@ -18,9 +18,8 @@ import json -from pydantic import BaseModel, ConfigDict, StrictBytes, StrictStr -from typing import Any, ClassVar, Dict, List, Optional, Tuple, Union -from extractor_api_lib.models.key_value_pair import KeyValuePair +from pydantic import BaseModel, ConfigDict, StrictStr +from typing import Any, ClassVar, Dict, List try: from typing import Self @@ -31,10 +30,9 @@ class ExtractionRequest(BaseModel): """ """ # noqa: E501 - file: Optional[Union[StrictBytes, StrictStr, Tuple[StrictStr, StrictBytes]]] = None - type: StrictStr - kwargs: Optional[List[KeyValuePair]] = None - __properties: ClassVar[List[str]] = ["file", "type", "kwargs"] + path_on_s3: StrictStr + document_name: StrictStr + __properties: ClassVar[List[str]] = ["path_on_s3", 
"document_name"] model_config = { "populate_by_name": True, @@ -71,13 +69,6 @@ def to_dict(self) -> Dict[str, Any]: exclude={}, exclude_none=True, ) - # override the default output from pydantic by calling `to_dict()` of each item in kwargs (list) - _items = [] - if self.kwargs: - for _item in self.kwargs: - if _item: - _items.append(_item.to_dict()) - _dict["kwargs"] = _items return _dict @classmethod @@ -89,15 +80,5 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate( - { - "file": obj.get("file"), - "type": obj.get("type"), - "kwargs": ( - [KeyValuePair.from_dict(_item) for _item in obj.get("kwargs")] - if obj.get("kwargs") is not None - else None - ), - } - ) + _obj = cls.model_validate({"path_on_s3": obj.get("path_on_s3"), "document_name": obj.get("document_name")}) return _obj From f10aa41bbad95925adf37393f20fea0c8a63e959 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Thu, 15 May 2025 14:01:33 +0200 Subject: [PATCH 11/43] wip --- .../src/admin_api_lib/dependency_container.py | 24 +++++++++++++++++-- .../models/extraction_parameters.py | 24 +++++++++---------- .../api_endpoints/default_file_uploader.py | 9 ++++--- .../api_endpoints/default_source_uploader.py | 9 ++++--- extractor-api-lib/openapi.yaml | 10 ++++---- .../models/extraction_parameters.py | 20 ++++++++-------- 6 files changed, 61 insertions(+), 35 deletions(-) diff --git a/admin-api-lib/src/admin_api_lib/dependency_container.py b/admin-api-lib/src/admin_api_lib/dependency_container.py index 93b3ab2..2ae6a1d 100644 --- a/admin-api-lib/src/admin_api_lib/dependency_container.py +++ b/admin-api-lib/src/admin_api_lib/dependency_container.py @@ -1,5 +1,6 @@ """Module for the DependencyContainer class.""" +from admin_api_lib.impl.api_endpoints.default_file_uploader import DefaultFileUploader from dependency_injector.containers import DeclarativeContainer from dependency_injector.providers import ( # noqa: WOT001 
Configuration, @@ -11,7 +12,13 @@ from langchain_community.llms import Ollama, VLLMOpenAI from langfuse import Langfuse -from admin_api_lib.extractor_api_client.extractor_api_client import ExtractorApiClient +from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ( + ExtractorApi, +) +from admin_api_lib.extractor_api_client.openapi_client.api_client import ApiClient +from admin_api_lib.extractor_api_client.openapi_client.configuration import ( + Configuration as ExtractorConfiguration, +) from admin_api_lib.impl.api_endpoints.default_source_uploader import DefaultSourceUploader from admin_api_lib.impl.api_endpoints.default_document_deleter import ( DefaultDocumentDeleter, @@ -87,7 +94,9 @@ class DependencyContainer(DeclarativeContainer): ) chunker = Singleton(TextChunker, text_splitter) - document_extractor = Singleton(ExtractorApiClient, document_extractor_settings.host) + extractor_api_configuration = Singleton(ExtractorConfiguration, host=document_extractor_settings.host) + document_extractor_api_client = Singleton(ApiClient, extractor_api_configuration) + document_extractor = Singleton(ExtractorApi, document_extractor_api_client) rag_api_configuration = Singleton(RagConfiguration, host=rag_api_settings.host) rag_api_client = Singleton(RagApiClient, configuration=rag_api_configuration) @@ -159,3 +168,14 @@ class DependencyContainer(DeclarativeContainer): key_value_store=key_value_store, document_deleter=document_deleter, ) + + file_uploader = Singleton( + DefaultFileUploader, + extractor_api=document_extractor, + rag_api=rag_api, + information_enhancer=information_enhancer, + information_mapper=information_mapper, + chunker=chunker, + key_value_store=key_value_store, + document_deleter=document_deleter, + ) diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_parameters.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_parameters.py index 
da4408d..37db1e8 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_parameters.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_parameters.py @@ -30,9 +30,9 @@ class ExtractionParameters(BaseModel): document_name: StrictStr = Field( description="The name that will be used to store the confluence db in the key value db and the vectordatabase (metadata.document)." ) - confluence_kwargs: Optional[List[KeyValuePair]] = Field(default=None, description="Kwargs for the extractor") type: StrictStr = Field(description="Extractortype") - __properties: ClassVar[List[str]] = ["document_name", "confluence_kwargs", "type"] + kwargs: Optional[List[KeyValuePair]] = Field(default=None, description="Kwargs for the extractor") + __properties: ClassVar[List[str]] = ["document_name", "type", "kwargs"] model_config = ConfigDict( populate_by_name=True, @@ -71,13 +71,13 @@ def to_dict(self) -> Dict[str, Any]: exclude=excluded_fields, exclude_none=True, ) - # override the default output from pydantic by calling `to_dict()` of each item in confluence_kwargs (list) + # override the default output from pydantic by calling `to_dict()` of each item in kwargs (list) _items = [] - if self.confluence_kwargs: - for _item_confluence_kwargs in self.confluence_kwargs: - if _item_confluence_kwargs: - _items.append(_item_confluence_kwargs.to_dict()) - _dict["confluence_kwargs"] = _items + if self.kwargs: + for _item_kwargs in self.kwargs: + if _item_kwargs: + _items.append(_item_kwargs.to_dict()) + _dict["kwargs"] = _items return _dict @classmethod @@ -92,12 +92,12 @@ def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]: _obj = cls.model_validate( { "document_name": obj.get("document_name"), - "confluence_kwargs": ( - [KeyValuePair.from_dict(_item) for _item in obj["confluence_kwargs"]] - if obj.get("confluence_kwargs") is not None + "type": obj.get("type"), + "kwargs": ( + 
[KeyValuePair.from_dict(_item) for _item in obj["kwargs"]] + if obj.get("kwargs") is not None else None ), - "type": obj.get("type"), } ) return _obj diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py index 89d432c..124d895 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py @@ -9,6 +9,8 @@ import tempfile from urllib.request import Request +from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi +from extractor_api_lib.models.extraction_request import ExtractionRequest from pydantic import StrictBytes, StrictStr from fastapi import UploadFile, status from langchain_core.documents import Document @@ -21,7 +23,6 @@ from admin_api_lib.api_endpoints.source_uploader import SourceUploader from admin_api_lib.chunker.chunker import Chunker from admin_api_lib.models.status import Status -from admin_api_lib.extractor_api_client.extractor_api_client import ExtractorApiClient from admin_api_lib.impl.key_db.file_status_key_value_store import FileStatusKeyValueStore from admin_api_lib.information_enhancer.information_enhancer import InformationEnhancer from admin_api_lib.utils.utils import sanitize_document_name @@ -33,7 +34,7 @@ class DefaultFileUploader(FileUploader): def __init__( self, - extractor_api: ExtractorApiClient, + extractor_api: ExtractorApi, key_value_store: FileStatusKeyValueStore, information_enhancer: InformationEnhancer, chunker: Chunker, @@ -87,7 +88,9 @@ async def _handle_source_upload( base_url: str, ): try: - information_pieces = self._extractor_api.extract(s3_path, source_name) + information_pieces = self._extractor_api.extract_from_file_post( + ExtractionRequest(path_on_s3=s3_path, document_name=source_name) + ) if not information_pieces: self._key_value_store.upsert(source_name, 
Status.ERROR) diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py index 81df19c..1637595 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py @@ -7,6 +7,8 @@ import urllib import tempfile +from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi +from admin_api_lib.extractor_api_client.openapi_client.models.extraction_parameters import ExtractionParameters from pydantic import StrictBytes, StrictStr from fastapi import UploadFile, status from langchain_core.documents import Document @@ -19,7 +21,6 @@ from admin_api_lib.api_endpoints.source_uploader import SourceUploader from admin_api_lib.chunker.chunker import Chunker from admin_api_lib.models.status import Status -from admin_api_lib.extractor_api_client.extractor_api_client import ExtractorApiClient from admin_api_lib.impl.key_db.file_status_key_value_store import FileStatusKeyValueStore from admin_api_lib.information_enhancer.information_enhancer import InformationEnhancer from admin_api_lib.utils.utils import sanitize_document_name @@ -31,7 +32,7 @@ class DefaultSourceUploader(SourceUploader): def __init__( self, - extractor_api: ExtractorApiClient, + extractor_api: ExtractorApi, key_value_store: FileStatusKeyValueStore, information_enhancer: InformationEnhancer, chunker: Chunker, @@ -81,7 +82,9 @@ async def _handle_source_upload( kwargs: list[KeyValuePair], ): try: - information_pieces = self._extractor_api.extract(type, source_name, kwargs) + information_pieces = self._extractor_api.extract_from_source( + ExtractionParameters(type=type, document_name=source_name, kwargs=kwargs) + ) if not information_pieces: self._key_value_store.upsert(source_name, Status.ERROR) diff --git a/extractor-api-lib/openapi.yaml 
b/extractor-api-lib/openapi.yaml index ebfad6c..d178a86 100644 --- a/extractor-api-lib/openapi.yaml +++ b/extractor-api-lib/openapi.yaml @@ -132,16 +132,16 @@ components: key value db and the vectordatabase (metadata.document). title: document_name type: string - confluence_kwargs: + type: + description: Extractortype + title: type + type: string + kwargs: description: Kwargs for the extractor items: $ref: '#/components/schemas/key_value_pair' title: confluence_kwargs type: array - type: - description: Extractortype - title: type - type: string required: - document_name - type diff --git a/extractor-api-lib/src/extractor_api_lib/models/extraction_parameters.py b/extractor-api-lib/src/extractor_api_lib/models/extraction_parameters.py index 3aed2ca..d701978 100644 --- a/extractor-api-lib/src/extractor_api_lib/models/extraction_parameters.py +++ b/extractor-api-lib/src/extractor_api_lib/models/extraction_parameters.py @@ -34,9 +34,9 @@ class ExtractionParameters(BaseModel): document_name: StrictStr = Field( description="The name that will be used to store the confluence db in the key value db and the vectordatabase (metadata.document)." 
) - confluence_kwargs: Optional[List[KeyValuePair]] = Field(default=None, description="Kwargs for the extractor") type: StrictStr = Field(description="Extractortype") - __properties: ClassVar[List[str]] = ["document_name", "confluence_kwargs", "type"] + kwargs: Optional[List[KeyValuePair]] = Field(default=None, description="Kwargs for the extractor") + __properties: ClassVar[List[str]] = ["document_name", "type", "kwargs"] model_config = { "populate_by_name": True, @@ -73,13 +73,13 @@ def to_dict(self) -> Dict[str, Any]: exclude={}, exclude_none=True, ) - # override the default output from pydantic by calling `to_dict()` of each item in confluence_kwargs (list) + # override the default output from pydantic by calling `to_dict()` of each item in kwargs (list) _items = [] - if self.confluence_kwargs: - for _item in self.confluence_kwargs: + if self.kwargs: + for _item in self.kwargs: if _item: _items.append(_item.to_dict()) - _dict["confluence_kwargs"] = _items + _dict["kwargs"] = _items return _dict @classmethod @@ -94,12 +94,12 @@ def from_dict(cls, obj: Dict) -> Self: _obj = cls.model_validate( { "document_name": obj.get("document_name"), - "confluence_kwargs": ( - [KeyValuePair.from_dict(_item) for _item in obj.get("confluence_kwargs")] - if obj.get("confluence_kwargs") is not None + "type": obj.get("type"), + "kwargs": ( + [KeyValuePair.from_dict(_item) for _item in obj.get("kwargs")] + if obj.get("kwargs") is not None else None ), - "type": obj.get("type"), } ) return _obj From 96e53e7d5a2bab796d626956a0ecac38e5ab25e9 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Fri, 16 May 2025 15:14:41 +0200 Subject: [PATCH 12/43] fix --- .../src/admin_api_lib/dependency_container.py | 1 + .../src/admin_api_lib/impl/admin_api.py | 12 ++-- .../api_endpoints/default_file_uploader.py | 23 ++++--- .../api_endpoints/default_source_uploader.py | 8 +-- .../impl/mapper/informationpiece2document.py | 4 +- .../api_endpoints/file_extractor.py | 23 +++++++ .../{extractor.py => 
source_extractor.py} | 14 ++--- .../extractor_api_lib/apis/extractor_api.py | 4 +- .../extractor_api_lib/dependency_container.py | 12 ++-- .../extractors/information_extractor.py | 12 ++-- .../general_file_extractor.py | 38 ++++-------- ...tractor.py => general_source_extractor.py} | 14 ++--- .../impl/extractor_api_impl.py | 38 +++++------- .../impl/extractors/confluence_extractor.py | 10 ++- ...ce_langchain_document2information_piece.py | 62 ++++--------------- 15 files changed, 120 insertions(+), 155 deletions(-) create mode 100644 extractor-api-lib/src/extractor_api_lib/api_endpoints/file_extractor.py rename extractor-api-lib/src/extractor_api_lib/api_endpoints/{extractor.py => source_extractor.py} (60%) rename extractor-api-lib/src/extractor_api_lib/impl/{extractors => api_endpoints}/general_file_extractor.py (71%) rename extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/{default_extractor.py => general_source_extractor.py} (87%) diff --git a/admin-api-lib/src/admin_api_lib/dependency_container.py b/admin-api-lib/src/admin_api_lib/dependency_container.py index 2ae6a1d..640ea72 100644 --- a/admin-api-lib/src/admin_api_lib/dependency_container.py +++ b/admin-api-lib/src/admin_api_lib/dependency_container.py @@ -178,4 +178,5 @@ class DependencyContainer(DeclarativeContainer): chunker=chunker, key_value_store=key_value_store, document_deleter=document_deleter, + file_service=file_service, ) diff --git a/admin-api-lib/src/admin_api_lib/impl/admin_api.py b/admin-api-lib/src/admin_api_lib/impl/admin_api.py index 2a0f678..b05d7d7 100644 --- a/admin-api-lib/src/admin_api_lib/impl/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/impl/admin_api.py @@ -2,15 +2,15 @@ import logging from typing import List, Optional -from pydantic import Field, StrictBytes, StrictStr -from admin_api_lib.api_endpoints.source_uploader import SourceUploader -from admin_api_lib.models.key_value_pair import KeyValuePair -from admin_api_lib.models.upload_source import UploadSource + 
+from pydantic import Field, StrictBytes, StrictStr from dependency_injector.wiring import Provide, inject from fastapi import Depends, Request, Response, UploadFile - +from admin_api_lib.api_endpoints.file_uploader import FileUploader +from admin_api_lib.api_endpoints.source_uploader import SourceUploader +from admin_api_lib.models.key_value_pair import KeyValuePair from admin_api_lib.api_endpoints.document_deleter import DocumentDeleter from admin_api_lib.api_endpoints.document_reference_retriever import ( DocumentReferenceRetriever, @@ -106,7 +106,7 @@ async def upload_file( request: Request, file_uploader: FileUploader = Depends(Provide[DependencyContainer.file_uploader]), ) -> None: - await file_uploader.upload_source(str(request.base_url), file) + await file_uploader.upload_file(str(request.base_url), file) @inject async def document_reference_id_get( diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py index 124d895..703e3b8 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py @@ -9,18 +9,21 @@ import tempfile from urllib.request import Request -from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi -from extractor_api_lib.models.extraction_request import ExtractionRequest + + + +from admin_api_lib.file_services.file_service import FileService from pydantic import StrictBytes, StrictStr from fastapi import UploadFile, status from langchain_core.documents import Document from asyncio import run -from admin_api_lib.models.key_value_pair import KeyValuePair +from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ExtractionRequest +from admin_api_lib.api_endpoints.file_uploader import FileUploader +from 
admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi from admin_api_lib.rag_backend_client.openapi_client.api.rag_api import RagApi from admin_api_lib.impl.mapper.informationpiece2document import InformationPiece2Document from admin_api_lib.api_endpoints.document_deleter import DocumentDeleter -from admin_api_lib.api_endpoints.source_uploader import SourceUploader from admin_api_lib.chunker.chunker import Chunker from admin_api_lib.models.status import Status from admin_api_lib.impl.key_db.file_status_key_value_store import FileStatusKeyValueStore @@ -41,6 +44,7 @@ def __init__( document_deleter: DocumentDeleter, rag_api: RagApi, information_mapper: InformationPiece2Document, + file_service: FileService, ): self._extractor_api = extractor_api self._rag_api = rag_api @@ -50,8 +54,9 @@ def __init__( self._chunker = chunker self._document_deleter = document_deleter self._background_threads = [] + self._file_service = file_service - async def upload_source( + async def upload_file( self, base_url: str, file: UploadFile, @@ -89,7 +94,7 @@ async def _handle_source_upload( ): try: information_pieces = self._extractor_api.extract_from_file_post( - ExtractionRequest(path_on_s3=s3_path, document_name=source_name) + ExtractionRequest(path_on_s3=str(s3_path), document_name=source_name) ) if not information_pieces: @@ -118,8 +123,8 @@ async def _handle_source_upload( self._key_value_store.upsert(source_name, Status.ERROR) logger.error("Error while uploading %s = %s", source_name, str(e)) - def _add_file_url(self, file: UploadFile, base_url: str, chunked_documents: list[Document]): - document_url = f"{base_url.rstrip('/')}/document_reference/{urllib.parse.quote_plus(file.name)}" + def _add_file_url(self, file_name: str, base_url: str, chunked_documents: list[Document]): + document_url = f"{base_url.rstrip('/')}/document_reference/{urllib.parse.quote_plus(file_name)}" for idx, chunk in enumerate(chunked_documents): if chunk.metadata["id"] in 
chunk.metadata["related"]: chunk.metadata["related"].remove(chunk.metadata["id"]) @@ -146,7 +151,7 @@ async def _asave_new_document( logger.debug("Temp file created and content written.") self._file_service.upload_file(Path(temp_file_path), filename) - return Path(temp_file_path) + return filename except Exception as e: logger.error("Error during document saving: %s %s", e, traceback.format_exc()) self._key_value_store.upsert(source_name, Status.ERROR) diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py index 1637595..deb8cac 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py @@ -6,14 +6,13 @@ from threading import Thread import urllib import tempfile - -from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi -from admin_api_lib.extractor_api_client.openapi_client.models.extraction_parameters import ExtractionParameters from pydantic import StrictBytes, StrictStr from fastapi import UploadFile, status from langchain_core.documents import Document from asyncio import run +from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi +from admin_api_lib.extractor_api_client.openapi_client.models.extraction_parameters import ExtractionParameters from admin_api_lib.models.key_value_pair import KeyValuePair from admin_api_lib.rag_backend_client.openapi_client.api.rag_api import RagApi from admin_api_lib.impl.mapper.informationpiece2document import InformationPiece2Document @@ -79,11 +78,12 @@ async def _handle_source_upload( source_name: str, base_url: str, type: StrictStr, + name:str, kwargs: list[KeyValuePair], ): try: information_pieces = self._extractor_api.extract_from_source( - ExtractionParameters(type=type, document_name=source_name, kwargs=kwargs) + 
ExtractionParameters(type=type, document_name=source_name, kwargs=[x.to_dict() for x in kwargs]) ) if not information_pieces: diff --git a/admin-api-lib/src/admin_api_lib/impl/mapper/informationpiece2document.py b/admin-api-lib/src/admin_api_lib/impl/mapper/informationpiece2document.py index 6f0ac2f..a3a40ce 100644 --- a/admin-api-lib/src/admin_api_lib/impl/mapper/informationpiece2document.py +++ b/admin-api-lib/src/admin_api_lib/impl/mapper/informationpiece2document.py @@ -4,10 +4,10 @@ from langchain_core.documents import Document as LangchainDocument -from admin_api_lib.extractor_api_client.models.content_type import ( +from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ( ContentType as ExtractorInformaType, ) -from admin_api_lib.extractor_api_client.models.information_piece import ( +from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import ( InformationPiece as ExtractorInformationPiece, ) from admin_api_lib.rag_backend_client.openapi_client.models.information_piece import ( diff --git a/extractor-api-lib/src/extractor_api_lib/api_endpoints/file_extractor.py b/extractor-api-lib/src/extractor_api_lib/api_endpoints/file_extractor.py new file mode 100644 index 0000000..499a09d --- /dev/null +++ b/extractor-api-lib/src/extractor_api_lib/api_endpoints/file_extractor.py @@ -0,0 +1,23 @@ +from abc import ABC, abstractmethod +from extractor_api_lib.models.extraction_request import ExtractionRequest +from extractor_api_lib.models.information_piece import InformationPiece + + +class FileExtractor(ABC): + """Abstract base class for extract_information endpoint.""" + + @abstractmethod + async def aextract_information(self, extraction_request: ExtractionRequest) -> list[InformationPiece]: + """ + Extract information of a document, given by the extraction_request. 
+ + Parameters + ---------- + extraction_request : ExtractionRequest + The request containing the details of the document to be processed for information extraction. + + Returns + ------- + list[InformationPiece] + A list of extracted information pieces from the document. + """ \ No newline at end of file diff --git a/extractor-api-lib/src/extractor_api_lib/api_endpoints/extractor.py b/extractor-api-lib/src/extractor_api_lib/api_endpoints/source_extractor.py similarity index 60% rename from extractor-api-lib/src/extractor_api_lib/api_endpoints/extractor.py rename to extractor-api-lib/src/extractor_api_lib/api_endpoints/source_extractor.py index c3f254b..44b5c38 100644 --- a/extractor-api-lib/src/extractor_api_lib/api_endpoints/extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/api_endpoints/source_extractor.py @@ -1,6 +1,7 @@ from abc import ABC, abstractmethod from typing import Optional +from extractor_api_lib.models.extraction_parameters import ExtractionParameters from pydantic import StrictStr from fastapi import UploadFile @@ -8,23 +9,20 @@ from extractor_api_lib.models.key_value_pair import KeyValuePair -class Extractor(ABC): +class SourceExtractor(ABC): @abstractmethod async def aextract_information( self, - type: StrictStr, - name: StrictStr, - file: Optional[UploadFile], - kwargs: Optional[list[KeyValuePair]], + extraction_parameters: ExtractionParameters, ) -> list[InformationPiece]: """ - Extract information from confluence, using the given confluence parameters. + Extract information from source, using the given parameters. Parameters ---------- - confluence_parameters : ConfluenceParameters - The parameters used to extract information from Confluence. + extraction_parameters : ExtractionParameters + The parameters used to extract information from the source. 
Returns ------- diff --git a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py index 47479f0..fc3d0ee 100644 --- a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py +++ b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py @@ -5,7 +5,7 @@ import pkgutil from extractor_api_lib.apis.extractor_api_base import BaseExtractorApi -import openapi_server.impl +import extractor_api_lib.impl from fastapi import ( # noqa: F401 APIRouter, @@ -31,7 +31,7 @@ router = APIRouter() -ns_pkg = openapi_server.impl +ns_pkg = extractor_api_lib.impl for _, name, _ in pkgutil.iter_modules(ns_pkg.__path__, ns_pkg.__name__ + "."): importlib.import_module(name) diff --git a/extractor-api-lib/src/extractor_api_lib/dependency_container.py b/extractor-api-lib/src/extractor_api_lib/dependency_container.py index 2c5c53f..a4adfe0 100644 --- a/extractor-api-lib/src/extractor_api_lib/dependency_container.py +++ b/extractor-api-lib/src/extractor_api_lib/dependency_container.py @@ -3,12 +3,12 @@ from dependency_injector.containers import DeclarativeContainer from dependency_injector.providers import List, Singleton # noqa: WOT001 -from extractor_api_lib.impl.api_endpoints.default_extractor import DefaultExtractor +from extractor_api_lib.impl.api_endpoints.general_source_extractor import GeneralSourceExtractor from extractor_api_lib.impl.extractors.confluence_extractor import ConfluenceExtractor from extractor_api_lib.impl.extractors.file_extractors.ms_docs_extractor import MSDocsExtractor from extractor_api_lib.impl.extractors.file_extractors.pdf_extractor import PDFExtractor from extractor_api_lib.impl.extractors.file_extractors.xml_extractor import XMLExtractor -from extractor_api_lib.impl.extractors.general_file_extractor import GeneralFileExtractor +from extractor_api_lib.impl.api_endpoints.general_file_extractor import GeneralFileExtractor from extractor_api_lib.impl.file_services.s3_service import 
S3Service from extractor_api_lib.impl.mapper.confluence_langchain_document2information_piece import ( ConfluenceLangchainDocument2InformationPiece, @@ -38,11 +38,11 @@ class DependencyContainer(DeclarativeContainer): langchain_document2information_piece = Singleton(ConfluenceLangchainDocument2InformationPiece) file_extractors = List(pdf_extractor, ms_docs_extractor, xml_extractor) - general_file_extractor = Singleton(GeneralFileExtractor, file_service, file_extractors) + general_file_extractor = Singleton(GeneralFileExtractor, file_service, file_extractors,intern2external) confluence_extractor = Singleton(ConfluenceExtractor, mapper=langchain_document2information_piece) - default_extractor = Singleton( - DefaultExtractor, + source_extractor = Singleton( + GeneralSourceExtractor, mapper=intern2external, - available_extractors=List(general_file_extractor, confluence_extractor), + available_extractors=List(confluence_extractor), ) diff --git a/extractor-api-lib/src/extractor_api_lib/extractors/information_extractor.py b/extractor-api-lib/src/extractor_api_lib/extractors/information_extractor.py index eeaadf1..92c71c3 100644 --- a/extractor-api-lib/src/extractor_api_lib/extractors/information_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/extractors/information_extractor.py @@ -4,6 +4,7 @@ from typing import Optional +from extractor_api_lib.models.extraction_parameters import ExtractionParameters from fastapi import UploadFile from pydantic import StrictStr @@ -23,18 +24,15 @@ def extractor_type(self) -> ExtractorTypes: ... @abstractmethod async def aextract_content( self, - type: StrictStr, - name: StrictStr, - file: Optional[UploadFile], - kwargs: Optional[list[KeyValuePair]], + extraction_parameters: ExtractionParameters, ) -> list[InternalInformationPiece]: """ - Extract content from given file. + Extract content from source. Parameters ---------- - file_path : Path - Path to the file the information should be extracted from. 
+ extraction_parameters : ExtractionParameters + The parameters used to extract information from the source. Returns ------- diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractors/general_file_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_file_extractor.py similarity index 71% rename from extractor-api-lib/src/extractor_api_lib/impl/extractors/general_file_extractor.py rename to extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_file_extractor.py index 04abb2c..505431f 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractors/general_file_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_file_extractor.py @@ -7,9 +7,9 @@ from typing import Any, List, Optional -from pydantic import StrictStr -from fastapi import UploadFile - +from extractor_api_lib.api_endpoints.file_extractor import FileExtractor +from extractor_api_lib.impl.mapper.internal2external_information_piece import Internal2ExternalInformationPiece +from extractor_api_lib.models.extraction_request import ExtractionRequest from extractor_api_lib.file_services.file_service import FileService from extractor_api_lib.extractors.information_file_extractor import InformationFileExtractor from extractor_api_lib.extractors.information_extractor import InformationExtractor @@ -21,7 +21,7 @@ logger = logging.getLogger(__name__) -class GeneralFileExtractor(InformationExtractor): +class GeneralFileExtractor(FileExtractor): """A class to extract information from documents using available extractors. This class serves as a general extractor that utilizes a list of available @@ -29,7 +29,7 @@ class GeneralFileExtractor(InformationExtractor): appropriate extractor based on the file type of the document. 
""" - def __init__(self, file_service: FileService, available_extractors: list[InformationFileExtractor]): + def __init__(self, file_service: FileService, available_extractors: list[InformationFileExtractor], mapper: Internal2ExternalInformationPiece): """ Initialize the GeneralExtractor. @@ -42,18 +42,9 @@ def __init__(self, file_service: FileService, available_extractors: list[Informa """ self._file_service = file_service self._available_extractors = available_extractors + self._mapper = mapper - @property - def extractor_type(self) -> ExtractorTypes: - return ExtractorTypes.FILE - - async def aextract_content( - self, - type: StrictStr, - name: StrictStr, - file: Optional[UploadFile], - kwargs: Optional[List[KeyValuePair]], - ) -> list[InternalInformationPiece]: + async def aextract_information(self, extraction_request: ExtractionRequest) -> list[InformationPiece]: """ Extract content from given file. @@ -66,25 +57,22 @@ async def aextract_content( ------- list[InformationPiece] The extracted information. 
- """ - # save file on s3 - content = await file.read() - filename = file.filename + """ try: - with tempfile.TemporaryDirectory() as temp_dir: - temp_file_path = Path(temp_dir) / filename + with tempfile.TemporaryDirectory() as temp_dir: + temp_file_path = Path(temp_dir) / Path(extraction_request.path_on_s3).name with open(temp_file_path, "wb") as temp_file: + self._file_service.download_file(extraction_request.path_on_s3,temp_file) logger.debug("Temporary file created at %s.", temp_file_path) - temp_file.write(content) logger.debug("Temp file created and content written.") - self._file_service.upload_file(temp_file_path, filename) file_type = str(temp_file_path).split(".")[-1].upper() correct_extractors = [ x for x in self._available_extractors if file_type in [y.value for y in x.compatible_file_types] ] if not correct_extractors: raise ValueError(f"No extractor found for file-ending {file_type}") - return await correct_extractors[-1].aextract_content(temp_file_path, name) + results = await correct_extractors[-1].aextract_content(temp_file_path, extraction_request.document_name) + return [self._mapper.map_internal_to_external(x) for x in results if x.page_content is not None] except Exception as e: logger.error("Error during document parsing: %s %s", e, traceback.format_exc()) raise e diff --git a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/default_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py similarity index 87% rename from extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/default_extractor.py rename to extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py index b485c1e..7e135b6 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/default_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py @@ -3,6 +3,7 @@ import logging from typing import Optional +from 
extractor_api_lib.models.extraction_parameters import ExtractionParameters from pydantic import StrictStr from fastapi import UploadFile @@ -10,7 +11,7 @@ from extractor_api_lib.models.information_piece import InformationPiece from extractor_api_lib.models.key_value_pair import KeyValuePair from extractor_api_lib.impl.mapper.internal2external_information_piece import Internal2ExternalInformationPiece -from extractor_api_lib.api_endpoints.extractor import Extractor +from extractor_api_lib.api_endpoints.source_extractor import SourceExtractor from extractor_api_lib.impl.mapper.internal2external_information_piece import Internal2ExternalInformationPiece from extractor_api_lib.models.information_piece import InformationPiece from extractor_api_lib.models.key_value_pair import KeyValuePair @@ -21,7 +22,7 @@ logger = logging.getLogger(__name__) -class DefaultExtractor(Extractor): +class GeneralSourceExtractor(SourceExtractor): """A class to extract information from documents using available extractors. This class serves as a general extractor that utilizes a list of available @@ -43,10 +44,7 @@ def __init__(self, available_extractors: list[InformationExtractor], mapper: Int async def aextract_information( self, - type: StrictStr, - name: StrictStr, - file: Optional[UploadFile], - kwargs: Optional[list[KeyValuePair]], + extraction_parameters: ExtractionParameters, ) -> list[InformationPiece]: """ Extract content from given file. @@ -61,8 +59,8 @@ async def aextract_information( list[InformationPiece] The extracted information. 
""" - correct_extractors = [x for x in self._available_extractors if type == x.extractor_type] + correct_extractors = [x for x in self._available_extractors if extraction_parameters.type == x.extractor_type] if not correct_extractors: raise ValueError(f"No extractor found for type {type}") - results = await correct_extractors[-1].aextract_content(type, name, file, kwargs) + results = await correct_extractors[-1].aextract_content(extraction_parameters) return [self._mapper.map_internal_to_external(x) for x in results if x.page_content is not None] diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py b/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py index bfe9393..df8a59f 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py @@ -1,7 +1,10 @@ """Module for the implementation of the ExtractorApi interface.""" from dependency_injector.wiring import Provide, inject -from extractor_api_lib.api_endpoints.extractor import Extractor +from extractor_api_lib.api_endpoints.file_extractor import FileExtractor +from extractor_api_lib.api_endpoints.source_extractor import SourceExtractor +from extractor_api_lib.models.extraction_parameters import ExtractionParameters +from extractor_api_lib.models.extraction_request import ExtractionRequest from fastapi import Depends, UploadFile from pydantic import StrictStr @@ -18,27 +21,16 @@ class ExtractorApiImpl(BaseExtractorApi): """Implementation of the ExtractorApi interface.""" @inject - async def extract( + async def extract_from_file_post( self, - type: StrictStr, - name: StrictStr, - file: Optional[UploadFile], - kwargs: Optional[list[KeyValuePair]], - extractor: Extractor = Depends(Provide[DependencyContainer.default_extractor]), - ) -> list[InformationPiece]: - """ - Extract information from a source. 
+ extraction_request: ExtractionRequest, + extractor: FileExtractor = Depends(Provide[DependencyContainer.general_file_extractor]), + ) -> list[InformationPiece]: + return await extractor.aextract_information(extraction_request) - Parameters - ---------- - extraction_request : ExtractionRequest - The request containing details about the extraction process. - file_extractor : FileExtractor, optional - The file extractor dependency, by default Depends(Provide[DependencyContainer.file_extractor]). - - Returns - ------- - list[InformationPiece] - A list of extracted information pieces. - """ - return await extractor.aextract_information(type, name, file, kwargs) + async def extract_from_source( + self, + extraction_parameters: ExtractionParameters, + extractor: SourceExtractor = Depends(Provide[DependencyContainer.source_extractor]), + ) -> list[InformationPiece]: + return await extractor.aextract_information(extraction_parameters) diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py index 1f7c666..faf9c4e 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py @@ -3,6 +3,7 @@ from typing import Optional from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece +from extractor_api_lib.models.extraction_parameters import ExtractionParameters from pydantic import StrictStr from langchain_community.document_loaders import ConfluenceLoader from fastapi import UploadFile @@ -40,10 +41,7 @@ def extractor_type(self) -> ExtractorTypes: async def aextract_content( self, - type: StrictStr, - name: StrictStr, - file: Optional[UploadFile], - kwargs: Optional[list[KeyValuePair]], + extraction_parameters: ExtractionParameters, ) -> list[InternalInformationPiece]: """ Asynchronously extracts 
information pieces from Confluence. @@ -59,10 +57,10 @@ async def aextract_content( A list of information pieces extracted from Confluence. """ # Convert list of key value pairs to dict - confluence_loader_parameters = {x.key: x.value for x in kwargs} + confluence_loader_parameters = {x.key: int(x.value) if x.value.isdigit() else x.value for x in extraction_parameters.kwargs} # Drop the document_name parameter as it is not used by the ConfluenceLoader if "document_name" in confluence_loader_parameters: confluence_loader_parameters.pop("document_name", None) document_loader = ConfluenceLoader(**confluence_loader_parameters) documents = document_loader.load() - return [self.mapper.map_document2informationpiece(x) for x in documents] + return [self.mapper.map_document2informationpiece(x, extraction_parameters.document_name) for x in documents] diff --git a/extractor-api-lib/src/extractor_api_lib/impl/mapper/confluence_langchain_document2information_piece.py b/extractor-api-lib/src/extractor_api_lib/impl/mapper/confluence_langchain_document2information_piece.py index 96e6efe..77e5435 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/mapper/confluence_langchain_document2information_piece.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/mapper/confluence_langchain_document2information_piece.py @@ -1,5 +1,6 @@ """Module for the ConfluenceLangchainDocument2InformationPiece class.""" +from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece from langchain_core.documents import Document as LangchainDocument from extractor_api_lib.models.confluence_parameters import ConfluenceParameters @@ -35,35 +36,7 @@ class ConfluenceLangchainDocument2InformationPiece: USE_CASE_RELATED_KEY = "related" DOCUMENT_KEY = "document" - def __init__(self) -> None: - """Initialize the ConfluenceLangchainDocument2InformationPiece instance.""" - self._confluence_parameters = None - - @property - def confluence_parameters(self): - """ - 
Property that returns the Confluence parameters. - - Returns - ------- - dict - A dictionary containing the Confluence parameters. - """ - return self._confluence_parameters - - @confluence_parameters.setter - def confluence_parameters(self, confluence_parameters: ConfluenceParameters): - """ - Set the confluence parameters. - - Parameters - ---------- - confluence_parameters : ConfluenceParameters - The confluence parameters to be set. - """ - self._confluence_parameters = confluence_parameters - - def map_document2informationpiece(self, document: LangchainDocument) -> InformationPiece: + def map_document2informationpiece(self, document: LangchainDocument, document_name:str) -> InternalInformationPiece: """ Map a LangchainDocument to an InformationPiece. @@ -81,28 +54,19 @@ def map_document2informationpiece(self, document: LangchainDocument) -> Informat ------ ValueError If Confluence parameters are not set before mapping documents. - """ - if self._confluence_parameters is None: - raise ValueError("Confluence parameters must be set before mapping documents") + """ + meta = self._map_meta(document.metadata, document_name) + return InternalInformationPiece(page_content=document.page_content, type=ContentType.TEXT, metadata=meta) - meta = self._map_meta(document.metadata) - return InformationPiece(page_content=document.page_content, type=ContentType.TEXT, metadata=meta) - - def _map_meta(self, internal: dict) -> list[MetaInformationPiece]: - metadata = [] + def _map_meta(self, internal: dict, document_name:str) -> dict: + metadata = {} for key, value in internal.items(): - metadata.append( - MetaInformationPiece( - key=self.USE_CASE_DOCUMENT_URL_KEY if key == self.CONFLUENCE_LOADER_SOURCE_URL_KEY else key, - value=value, - ) - ) - page_title_matches = [m.value for m in metadata if m.key == self.CONFLUENCE_LOADER_TITLE_KEY] + metadata[self.USE_CASE_DOCUMENT_URL_KEY if key == self.CONFLUENCE_LOADER_SOURCE_URL_KEY else key]=value + + page_title_matches = [v for k,v in 
metadata.items() if k == self.CONFLUENCE_LOADER_TITLE_KEY] page_title = page_title_matches[0] if page_title_matches else "Unknown Title" - metadata.append(MetaInformationPiece(key=self.USER_CASE_PAGE_KEY, value=page_title)) - metadata.append( - MetaInformationPiece(key=self.DOCUMENT_KEY, value=self._confluence_parameters.document_name) - ) - metadata.append(MetaInformationPiece(key=self.USE_CASE_RELATED_KEY, value=[])) + metadata[self.USER_CASE_PAGE_KEY]=page_title + metadata[self.DOCUMENT_KEY]=document_name + metadata[self.USE_CASE_RELATED_KEY]=[] return metadata From a1f8feeb513025c8290a935d064c13442fbcdf80 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Fri, 16 May 2025 15:15:34 +0200 Subject: [PATCH 13/43] black --- .../api_endpoints/default_file_uploader.py | 2 -- .../api_endpoints/default_source_uploader.py | 2 +- .../api_endpoints/file_extractor.py | 2 +- .../extractor_api_lib/dependency_container.py | 4 ++-- .../extractors/information_extractor.py | 2 +- .../api_endpoints/general_file_extractor.py | 17 ++++++++++++----- .../impl/extractor_api_impl.py | 4 ++-- .../impl/extractors/confluence_extractor.py | 6 ++++-- ...nce_langchain_document2information_piece.py | 18 ++++++++++-------- 9 files changed, 33 insertions(+), 24 deletions(-) diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py index 703e3b8..37a8e28 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py @@ -10,8 +10,6 @@ from urllib.request import Request - - from admin_api_lib.file_services.file_service import FileService from pydantic import StrictBytes, StrictStr from fastapi import UploadFile, status diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py 
index deb8cac..ab1e153 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py @@ -78,7 +78,7 @@ async def _handle_source_upload( source_name: str, base_url: str, type: StrictStr, - name:str, + name: str, kwargs: list[KeyValuePair], ): try: diff --git a/extractor-api-lib/src/extractor_api_lib/api_endpoints/file_extractor.py b/extractor-api-lib/src/extractor_api_lib/api_endpoints/file_extractor.py index 499a09d..ad968a2 100644 --- a/extractor-api-lib/src/extractor_api_lib/api_endpoints/file_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/api_endpoints/file_extractor.py @@ -20,4 +20,4 @@ async def aextract_information(self, extraction_request: ExtractionRequest) -> l ------- list[InformationPiece] A list of extracted information pieces from the document. - """ \ No newline at end of file + """ diff --git a/extractor-api-lib/src/extractor_api_lib/dependency_container.py b/extractor-api-lib/src/extractor_api_lib/dependency_container.py index a4adfe0..ad671d9 100644 --- a/extractor-api-lib/src/extractor_api_lib/dependency_container.py +++ b/extractor-api-lib/src/extractor_api_lib/dependency_container.py @@ -38,11 +38,11 @@ class DependencyContainer(DeclarativeContainer): langchain_document2information_piece = Singleton(ConfluenceLangchainDocument2InformationPiece) file_extractors = List(pdf_extractor, ms_docs_extractor, xml_extractor) - general_file_extractor = Singleton(GeneralFileExtractor, file_service, file_extractors,intern2external) + general_file_extractor = Singleton(GeneralFileExtractor, file_service, file_extractors, intern2external) confluence_extractor = Singleton(ConfluenceExtractor, mapper=langchain_document2information_piece) source_extractor = Singleton( GeneralSourceExtractor, mapper=intern2external, - available_extractors=List(confluence_extractor), + available_extractors=List(confluence_extractor), ) diff --git 
a/extractor-api-lib/src/extractor_api_lib/extractors/information_extractor.py b/extractor-api-lib/src/extractor_api_lib/extractors/information_extractor.py index 92c71c3..35952cf 100644 --- a/extractor-api-lib/src/extractor_api_lib/extractors/information_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/extractors/information_extractor.py @@ -24,7 +24,7 @@ def extractor_type(self) -> ExtractorTypes: ... @abstractmethod async def aextract_content( self, - extraction_parameters: ExtractionParameters, + extraction_parameters: ExtractionParameters, ) -> list[InternalInformationPiece]: """ Extract content from source. diff --git a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_file_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_file_extractor.py index 505431f..8ed9e8b 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_file_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_file_extractor.py @@ -29,7 +29,12 @@ class GeneralFileExtractor(FileExtractor): appropriate extractor based on the file type of the document. """ - def __init__(self, file_service: FileService, available_extractors: list[InformationFileExtractor], mapper: Internal2ExternalInformationPiece): + def __init__( + self, + file_service: FileService, + available_extractors: list[InformationFileExtractor], + mapper: Internal2ExternalInformationPiece, + ): """ Initialize the GeneralExtractor. @@ -57,12 +62,12 @@ async def aextract_information(self, extraction_request: ExtractionRequest) -> l ------- list[InformationPiece] The extracted information. 
- """ + """ try: - with tempfile.TemporaryDirectory() as temp_dir: + with tempfile.TemporaryDirectory() as temp_dir: temp_file_path = Path(temp_dir) / Path(extraction_request.path_on_s3).name with open(temp_file_path, "wb") as temp_file: - self._file_service.download_file(extraction_request.path_on_s3,temp_file) + self._file_service.download_file(extraction_request.path_on_s3, temp_file) logger.debug("Temporary file created at %s.", temp_file_path) logger.debug("Temp file created and content written.") file_type = str(temp_file_path).split(".")[-1].upper() @@ -71,7 +76,9 @@ async def aextract_information(self, extraction_request: ExtractionRequest) -> l ] if not correct_extractors: raise ValueError(f"No extractor found for file-ending {file_type}") - results = await correct_extractors[-1].aextract_content(temp_file_path, extraction_request.document_name) + results = await correct_extractors[-1].aextract_content( + temp_file_path, extraction_request.document_name + ) return [self._mapper.map_internal_to_external(x) for x in results if x.page_content is not None] except Exception as e: logger.error("Error during document parsing: %s %s", e, traceback.format_exc()) diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py b/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py index df8a59f..50a8623 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py @@ -25,12 +25,12 @@ async def extract_from_file_post( self, extraction_request: ExtractionRequest, extractor: FileExtractor = Depends(Provide[DependencyContainer.general_file_extractor]), - ) -> list[InformationPiece]: + ) -> list[InformationPiece]: return await extractor.aextract_information(extraction_request) async def extract_from_source( self, extraction_parameters: ExtractionParameters, extractor: SourceExtractor = Depends(Provide[DependencyContainer.source_extractor]), - ) -> 
list[InformationPiece]: + ) -> list[InformationPiece]: return await extractor.aextract_information(extraction_parameters) diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py index faf9c4e..8b1c07e 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py @@ -41,7 +41,7 @@ def extractor_type(self) -> ExtractorTypes: async def aextract_content( self, - extraction_parameters: ExtractionParameters, + extraction_parameters: ExtractionParameters, ) -> list[InternalInformationPiece]: """ Asynchronously extracts information pieces from Confluence. @@ -57,7 +57,9 @@ async def aextract_content( A list of information pieces extracted from Confluence. """ # Convert list of key value pairs to dict - confluence_loader_parameters = {x.key: int(x.value) if x.value.isdigit() else x.value for x in extraction_parameters.kwargs} + confluence_loader_parameters = { + x.key: int(x.value) if x.value.isdigit() else x.value for x in extraction_parameters.kwargs + } # Drop the document_name parameter as it is not used by the ConfluenceLoader if "document_name" in confluence_loader_parameters: confluence_loader_parameters.pop("document_name", None) diff --git a/extractor-api-lib/src/extractor_api_lib/impl/mapper/confluence_langchain_document2information_piece.py b/extractor-api-lib/src/extractor_api_lib/impl/mapper/confluence_langchain_document2information_piece.py index 77e5435..85b92bd 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/mapper/confluence_langchain_document2information_piece.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/mapper/confluence_langchain_document2information_piece.py @@ -36,7 +36,9 @@ class ConfluenceLangchainDocument2InformationPiece: USE_CASE_RELATED_KEY = "related" DOCUMENT_KEY = "document" - def 
map_document2informationpiece(self, document: LangchainDocument, document_name:str) -> InternalInformationPiece: + def map_document2informationpiece( + self, document: LangchainDocument, document_name: str + ) -> InternalInformationPiece: """ Map a LangchainDocument to an InformationPiece. @@ -54,19 +56,19 @@ def map_document2informationpiece(self, document: LangchainDocument, document_na ------ ValueError If Confluence parameters are not set before mapping documents. - """ + """ meta = self._map_meta(document.metadata, document_name) return InternalInformationPiece(page_content=document.page_content, type=ContentType.TEXT, metadata=meta) - def _map_meta(self, internal: dict, document_name:str) -> dict: + def _map_meta(self, internal: dict, document_name: str) -> dict: metadata = {} for key, value in internal.items(): - metadata[self.USE_CASE_DOCUMENT_URL_KEY if key == self.CONFLUENCE_LOADER_SOURCE_URL_KEY else key]=value + metadata[self.USE_CASE_DOCUMENT_URL_KEY if key == self.CONFLUENCE_LOADER_SOURCE_URL_KEY else key] = value - page_title_matches = [v for k,v in metadata.items() if k == self.CONFLUENCE_LOADER_TITLE_KEY] + page_title_matches = [v for k, v in metadata.items() if k == self.CONFLUENCE_LOADER_TITLE_KEY] page_title = page_title_matches[0] if page_title_matches else "Unknown Title" - metadata[self.USER_CASE_PAGE_KEY]=page_title - metadata[self.DOCUMENT_KEY]=document_name - metadata[self.USE_CASE_RELATED_KEY]=[] + metadata[self.USER_CASE_PAGE_KEY] = page_title + metadata[self.DOCUMENT_KEY] = document_name + metadata[self.USE_CASE_RELATED_KEY] = [] return metadata From 54f3c32e7a50291956bd9db81aab059a82f91e72 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Fri, 16 May 2025 15:46:15 +0200 Subject: [PATCH 14/43] linting --- admin-api-lib/pyproject.toml | 1 - .../api_endpoints/source_uploader.py | 4 +- .../src/admin_api_lib/apis/admin_api.py | 15 +++--- .../src/admin_api_lib/apis/admin_api_base.py | 9 ++-- .../src/admin_api_lib/impl/admin_api.py | 5 +- 
.../api_endpoints/default_file_uploader.py | 15 ++---- .../api_endpoints/default_source_uploader.py | 52 ++++++------------- 7 files changed, 35 insertions(+), 66 deletions(-) diff --git a/admin-api-lib/pyproject.toml b/admin-api-lib/pyproject.toml index d7a995f..ec0de57 100644 --- a/admin-api-lib/pyproject.toml +++ b/admin-api-lib/pyproject.toml @@ -107,7 +107,6 @@ langfuse = "^2.60.4" redis = "^6.0.0" pyyaml = "^6.0.2" python-multipart = "^0.0.20" -requests-toolbelt = "^1.0.0" [tool.pytest.ini_options] log_cli = 1 diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py b/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py index 9cdd59e..3f9c15a 100644 --- a/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py @@ -1,8 +1,6 @@ from abc import ABC, abstractmethod -from typing import Optional from pydantic import StrictStr -from fastapi import UploadFile from admin_api_lib.models.key_value_pair import KeyValuePair @@ -13,7 +11,7 @@ class SourceUploader(ABC): async def upload_source( self, base_url: str, - type: StrictStr, + source_type: StrictStr, name: StrictStr, kwargs: list[KeyValuePair], ) -> None: ... 
diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api.py b/admin-api-lib/src/admin_api_lib/apis/admin_api.py index 4fe1e15..7f3eb1a 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api.py @@ -3,9 +3,7 @@ from typing import Dict, List # noqa: F401 import importlib import pkgutil - -from admin_api_lib.apis.admin_api_base import BaseAdminApi -from fastapi import APIRouter, Path, Request, Response, UploadFile, Form # noqa: F401 +from typing_extensions import Annotated import admin_api_lib.impl @@ -15,6 +13,8 @@ Cookie, Depends, Form, + UploadFile, + Request, Header, HTTPException, Path, @@ -23,15 +23,14 @@ Security, status, ) +from pydantic import Field, StrictStr -from admin_api_lib.models.extra_models import TokenModel # noqa: F401 -from pydantic import Field, StrictBytes, StrictStr -from typing import Any, List, Tuple, Union -from typing_extensions import Annotated + +from admin_api_lib.apis.admin_api_base import BaseAdminApi from admin_api_lib.models.document_status import DocumentStatus from admin_api_lib.models.http_validation_error import HTTPValidationError from admin_api_lib.models.key_value_pair import KeyValuePair - +from admin_api_lib.models.extra_models import TokenModel # noqa: F401 router = APIRouter() diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py index eb5ca84..ee1d0a4 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py @@ -1,14 +1,13 @@ # coding: utf-8 from typing import ClassVar, Dict, List, Tuple # noqa: F401 - -from pydantic import Field, StrictBytes, StrictStr -from typing import Any, List, Tuple, Union from typing_extensions import Annotated + +from pydantic import Field, StrictStr +from fastapi import Request, Response, UploadFile + from admin_api_lib.models.document_status import DocumentStatus -from 
admin_api_lib.models.http_validation_error import HTTPValidationError from admin_api_lib.models.key_value_pair import KeyValuePair -from fastapi import Request, Response, UploadFile class BaseAdminApi: diff --git a/admin-api-lib/src/admin_api_lib/impl/admin_api.py b/admin-api-lib/src/admin_api_lib/impl/admin_api.py index b05d7d7..d2e880a 100644 --- a/admin-api-lib/src/admin_api_lib/impl/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/impl/admin_api.py @@ -1,10 +1,9 @@ """Module containing the implementation of the Admin API.""" import logging -from typing import List, Optional -from pydantic import Field, StrictBytes, StrictStr +from pydantic import StrictStr from dependency_injector.wiring import Provide, inject from fastapi import Depends, Request, Response, UploadFile @@ -93,7 +92,7 @@ async def upload_source( self, type: StrictStr, name: StrictStr, - kwargs: List[KeyValuePair], + kwargs: list[KeyValuePair], request: Request, source_uploader: SourceUploader = Depends(Provide[DependencyContainer.source_uploader]), ) -> None: diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py index 37a8e28..62b6448 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py @@ -1,21 +1,17 @@ from http.client import HTTPException import logging -import os from pathlib import Path import traceback -from typing import Optional, Tuple, Union from threading import Thread import urllib import tempfile -from urllib.request import Request +from contextlib import suppress - -from admin_api_lib.file_services.file_service import FileService -from pydantic import StrictBytes, StrictStr from fastapi import UploadFile, status from langchain_core.documents import Document from asyncio import run +from admin_api_lib.file_services.file_service import FileService from 
admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ExtractionRequest from admin_api_lib.api_endpoints.file_uploader import FileUploader from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi @@ -109,11 +105,10 @@ async def _handle_source_upload( self._information_mapper.document2rag_information_piece(doc) for doc in enhanced_documents ] # Replace old document - try: + # deletion is allowed to fail + with suppress(Exception): await self._document_deleter.adelete_document(source_name) - except Exception as e: - # deletion is allowed to fail - pass + self._rag_api.upload_information_piece(rag_information_pieces) self._key_value_store.upsert(source_name, Status.READY) logger.info("Source uploaded successfully: %s", source_name) diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py index ab1e153..f843fa4 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py @@ -1,15 +1,12 @@ from http.client import HTTPException import logging -import os -from pathlib import Path -from typing import Optional, Tuple, Union -from threading import Thread -import urllib -import tempfile -from pydantic import StrictBytes, StrictStr -from fastapi import UploadFile, status -from langchain_core.documents import Document from asyncio import run +from threading import Thread +from contextlib import suppress + +from pydantic import StrictStr +from fastapi import status + from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi from admin_api_lib.extractor_api_client.openapi_client.models.extraction_parameters import ExtractionParameters @@ -51,18 +48,20 @@ def __init__( async def upload_source( self, base_url: str, - type: StrictStr, + source_type: StrictStr, 
name: StrictStr, kwargs: list[KeyValuePair], ) -> None: self._background_threads = [t for t in self._background_threads if t.is_alive()] - source_name = f"{type}:{sanitize_document_name(name)}" + source_name = f"{source_type}:{sanitize_document_name(name)}" try: # TODO: check if document already in processing state self._key_value_store.upsert( source_name, Status.PROCESSING ) # TODO: change to pipeline with timeout to error status - thread = Thread(target=lambda: run(self._handle_source_upload(source_name, base_url, type, name, kwargs))) + thread = Thread( + target=lambda: run(self._handle_source_upload(source_name, base_url, source_type, name, kwargs)) + ) thread.start() self._background_threads.append(thread) except ValueError as e: @@ -77,13 +76,13 @@ async def _handle_source_upload( self, source_name: str, base_url: str, - type: StrictStr, + source_type: StrictStr, name: str, kwargs: list[KeyValuePair], ): try: information_pieces = self._extractor_api.extract_from_source( - ExtractionParameters(type=type, document_name=source_name, kwargs=[x.to_dict() for x in kwargs]) + ExtractionParameters(type=source_type, document_name=source_name, kwargs=[x.to_dict() for x in kwargs]) ) if not information_pieces: @@ -99,32 +98,13 @@ async def _handle_source_upload( ] # Replace old document - try: + # deletion is allowed to fail + with suppress(Exception): await self._document_deleter.adelete_document(source_name) - except Exception as e: - # deletion is allowed to fail - pass + self._rag_api.upload_information_piece(rag_information_pieces) self._key_value_store.upsert(source_name, Status.READY) logger.info("Source uploaded successfully: %s", source_name) except Exception as e: self._key_value_store.upsert(source_name, Status.ERROR) logger.error("Error while uploading %s = %s", source_name, str(e)) - - def _add_file_url( - self, type: StrictStr, file: Optional[UploadFile], base_url: str, chunked_documents: list[Document] - ): - if type != "file": - return - - document_url 
= f"{base_url.rstrip('/')}/document_reference/{urllib.parse.quote_plus(file.name)}" - for idx, chunk in enumerate(chunked_documents): - if chunk.metadata["id"] in chunk.metadata["related"]: - chunk.metadata["related"].remove(chunk.metadata["id"]) - chunk.metadata.update( - { - "chunk": idx, - "chunk_length": len(chunk.page_content), - "document_url": document_url, - } - ) From 0aa4d92d75bf5e7d88bb1202c1756ac77226ec18 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Fri, 16 May 2025 15:59:48 +0200 Subject: [PATCH 15/43] wip --- admin-api-lib/openapi.yaml | 720 +++++++++--------- .../src/admin_api_lib/apis/admin_api.py | 2 +- .../src/admin_api_lib/apis/admin_api_base.py | 2 +- .../src/admin_api_lib/impl/admin_api.py | 2 +- extractor-api-lib/openapi.yaml | 292 +++---- .../api_endpoints/source_extractor.py | 5 - .../extractor_api_lib/apis/extractor_api.py | 1 - .../apis/extractor_api_base.py | 1 - .../extractors/information_extractor.py | 6 - .../extractors/information_file_extractor.py | 1 - .../api_endpoints/general_file_extractor.py | 5 - .../api_endpoints/general_source_extractor.py | 10 - .../impl/extractor_api_impl.py | 9 +- .../impl/extractors/confluence_extractor.py | 10 +- .../file_extractors/ms_docs_extractor.py | 2 - .../file_extractors/pdf_extractor.py | 1 - .../file_extractors/xml_extractor.py | 1 - ...ce_langchain_document2information_piece.py | 5 +- .../internal2external_information_piece.py | 6 +- 19 files changed, 525 insertions(+), 556 deletions(-) diff --git a/admin-api-lib/openapi.yaml b/admin-api-lib/openapi.yaml index 86d433a..986f445 100644 --- a/admin-api-lib/openapi.yaml +++ b/admin-api-lib/openapi.yaml @@ -1,364 +1,378 @@ openapi: 3.1.0 info: - description: The API is used for the communication between the admin frontend - and the admin backend in the rag project. 
- title: admin-api-lib - version: 1.0.0 + title: admin-api-lib + version: 1.0.0 + description: >- + The API is used for the communication between the admin frontend and the admin backend in the + rag project. servers: -- url: /api + - + url: /api paths: - /delete_document/{identification}: - delete: - description: |- - Asynchronously deletes a document based on the provided identification. + '/delete_document/{identification}': + delete: + tags: + - admin + parameters: + - + style: simple + explode: false + name: identification + schema: + title: Identification + description: '' + type: string + in: path + required: true + responses: + '200': + content: + application/json: + schema: {} + description: Deleted + '422': + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + description: Validation Error + '500': + description: Internal server error + operationId: delete_document + summary: Delete Document + description: |- + Asynchronously deletes a document based on the provided identification. - Parameters - ---------- - identification : str - The unique identifier of the document to be deleted. + Parameters + ---------- + identification : str + The unique identifier of the document to be deleted. - Returns - ------- - None - operationId: delete_document - parameters: - - explode: false - in: path - name: identification - required: true - schema: - description: "" - title: Identification - type: string - style: simple - responses: - "200": - content: - application/json: - schema: {} - description: Deleted - "422": - content: - application/json: - schema: - $ref: '#/components/schemas/HTTPValidationError' - description: Validation Error - "500": - description: Internal server error - summary: Delete Document - tags: - - admin - /document_reference/{identification}: - get: - description: |- - Asynchronously retrieve a document reference by its identification. 
+ Returns + ------- + None + '/document_reference/{identification}': + get: + tags: + - admin + parameters: + - + style: simple + explode: false + name: identification + description: Identifier of the document. + schema: + title: Identification + description: Identifier of the document. + type: string + in: path + required: true + responses: + '200': + content: + application/json: + schema: + format: binary + title: Response 200 Document Reference Document Reference Identification Get + type: string + description: Returns the pdf in binary form. + '400': + content: + application/json: + schema: + title: Response 400 Document Reference Document Reference Identification Get + type: string + description: Bad request + '404': + content: + application/json: + schema: + title: Response 404 Document Reference Document Reference Identification Get + type: string + description: Document not found. + '422': + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + description: Validation Error + '500': + content: + application/json: + schema: + title: Response 500 Document Reference Document Reference Identification Get + type: string + description: Internal server error + operationId: document_reference + summary: Document Reference Id Get + description: |- + Asynchronously retrieve a document reference by its identification. - Parameters - ---------- - identification : str - The unique identifier for the document reference. + Parameters + ---------- + identification : str + The unique identifier for the document reference. - Returns - ------- - Response - The response object containing the document reference details. - operationId: document_reference - parameters: - - description: Identifier of the document. - explode: false - in: path - name: identification - required: true - schema: - description: Identifier of the document. 
- title: Identification - type: string - style: simple - responses: - "200": - content: - application/json: - schema: - format: binary - title: Response 200 Document Reference Document Reference Identification Get - type: string - description: Returns the pdf in binary form. - "400": - content: - application/json: - schema: - title: Response 400 Document Reference Document Reference Identification Get - type: string - description: Bad request - "404": - content: - application/json: - schema: - title: Response 404 Document Reference Document Reference Identification Get - type: string - description: Document not found. - "422": - content: - application/json: - schema: - $ref: '#/components/schemas/HTTPValidationError' - description: Validation Error - "500": - content: - application/json: - schema: - title: Response 500 Document Reference Document Reference Identification Get - type: string - description: Internal server error - summary: Document Reference Id Get - tags: - - admin - /all_documents_status: - get: - description: |- - Asynchronously retrieves the status of all documents. + Returns + ------- + Response + The response object containing the document reference details. + /all_documents_status: + get: + tags: + - admin + responses: + '200': + content: + application/json: + schema: + type: array + items: + $ref: '#/components/schemas/DocumentStatus' + description: List of document links + '500': + description: Internal server error + operationId: get_all_documents_status + summary: Get All Documents Status + description: |- + Asynchronously retrieves the status of all documents. - Returns - ------- - list[DocumentStatus] - A list containing the status of all documents. 
- operationId: get_all_documents_status - responses: - "200": - content: - application/json: - schema: - items: - $ref: '#/components/schemas/DocumentStatus' - type: array - description: List of document links - "500": - description: Internal server error - summary: Get All Documents Status - tags: - - admin - /upload_file: - post: - description: Uploads user selected sources. - operationId: upload_file - requestBody: - content: - multipart/form-data: - schema: - $ref: '#/components/schemas/Body_upload_file_upload_file_post' - required: true - responses: - "200": - content: - application/json: - schema: {} - description: ok - "400": - description: Bad request - "422": - description: Unprocessable Content - "500": - description: Internal server error - summary: Upload File - tags: - - admin - /upload_source: - post: - description: Uploads user selected sources. - operationId: upload_source - parameters: - - explode: true - in: query - name: type - required: false - schema: - description: "" - title: Type - type: string - style: form - - explode: true - in: query - name: name - required: false - schema: - description: "" - title: Name - type: string - style: form - requestBody: - content: - application/json: - schema: - description: "" - items: - $ref: '#/components/schemas/KeyValuePair' - type: array - responses: - "200": - content: - application/json: - schema: {} - description: ok - "400": - description: Bad request - "422": - description: Unprocessable Content - "500": - description: Internal server error - summary: Upload Source - tags: - - admin + Returns + ------- + list[DocumentStatus] + A list containing the status of all documents. 
+  /upload_file:
+    post:
+      requestBody:
+        content:
+          multipart/form-data:
+            schema:
+              $ref: '#/components/schemas/Body_upload_file_upload_file_post'
+        required: true
+      tags:
+        - admin
+      responses:
+        '200':
+          content:
+            application/json:
+              schema: {}
+          description: ok
+        '400':
+          description: Bad request
+        '422':
+          description: Unprocessable Content
+        '500':
+          description: Internal server error
+      operationId: upload_file
+      summary: Upload File
+      description: Uploads user selected files.
+  /upload_source:
+    post:
+      requestBody:
+        content:
+          application/json:
+            schema:
+              description: ''
+              type: array
+              items:
+                $ref: '#/components/schemas/KeyValuePair'
+      tags:
+        - admin
+      parameters:
+        -
+          style: form
+          explode: true
+          name: type
+          schema:
+            title: Type
+            description: ''
+            type: string
+          in: query
+          required: false
+        -
+          style: form
+          explode: true
+          name: name
+          schema:
+            title: Name
+            description: ''
+            type: string
+          in: query
+          required: false
+      responses:
+        '200':
+          content:
+            application/json:
+              schema: {}
+          description: ok
+        '400':
+          description: Bad request
+        '422':
+          description: Unprocessable Content
+        '500':
+          description: Internal server error
+      operationId: upload_source
+      summary: Upload Source
+      description: Uploads user selected sources.
components: - schemas: - Body_upload_file_upload_file_post: - properties: - file: - format: binary - title: File - type: string - required: - - file - title: Body_upload_file_upload_file_post - DocumentStatus: - description: DocumentStatus - example: - name: name - status: UPLOADING - properties: - name: - title: Name - type: string - status: - $ref: '#/components/schemas/Status' - required: - - name - - status - title: DocumentStatus - HTTPValidationError: - description: HTTPValidationError - example: - detail: - - msg: msg - loc: - - anyof_schema_1_validator: anyof_schema_1_validator - actual_instance: "" - any_of_schemas: - - any_of_schemas - - any_of_schemas - anyof_schema_2_validator: 0 - - anyof_schema_1_validator: anyof_schema_1_validator - actual_instance: "" - any_of_schemas: - - any_of_schemas - - any_of_schemas - anyof_schema_2_validator: 0 - type: type - - msg: msg - loc: - - anyof_schema_1_validator: anyof_schema_1_validator - actual_instance: "" - any_of_schemas: - - any_of_schemas - - any_of_schemas - anyof_schema_2_validator: 0 - - anyof_schema_1_validator: anyof_schema_1_validator - actual_instance: "" - any_of_schemas: - - any_of_schemas - - any_of_schemas - anyof_schema_2_validator: 0 - type: type - properties: - detail: - items: - $ref: '#/components/schemas/ValidationError' - nullable: true - title: detail - type: array - title: HTTPValidationError - KeyValuePair: - description: KeyValuePair - example: - value: value - key: key - properties: - key: - title: Key - type: string - value: - title: Value - type: string - required: - - key - - value - title: KeyValuePair - Status: - description: allowed enum values - enum: - - UPLOADING - - PROCESSING - - READY - - ERROR - title: Status - type: string - ValidationError: - description: ValidationError - example: - msg: msg - loc: - - anyof_schema_1_validator: anyof_schema_1_validator - actual_instance: "" - any_of_schemas: - - any_of_schemas - - any_of_schemas - anyof_schema_2_validator: 0 - - 
anyof_schema_1_validator: anyof_schema_1_validator - actual_instance: "" - any_of_schemas: - - any_of_schemas - - any_of_schemas - anyof_schema_2_validator: 0 - type: type - properties: - loc: - items: - $ref: '#/components/schemas/ValidationErrorLocInner' - title: loc - type: array - msg: - title: Msg - type: string - type: - title: Type - type: string - required: - - loc - - msg - - type - title: ValidationError - ValidationErrorLocInner: - description: ValidationErrorLocInner - example: - anyof_schema_1_validator: anyof_schema_1_validator - actual_instance: "" - any_of_schemas: - - any_of_schemas - - any_of_schemas - anyof_schema_2_validator: 0 - properties: - anyof_schema_1_validator: - nullable: true - title: anyof_schema_1_validator - type: string - anyof_schema_2_validator: - nullable: true - title: anyof_schema_2_validator - type: integer - actual_instance: - title: actual_instance - any_of_schemas: - items: + schemas: + Body_upload_file_upload_file_post: + title: Body_upload_file_upload_file_post + required: + - file + properties: + file: + format: binary + title: File + type: string + DocumentStatus: + title: DocumentStatus + description: DocumentStatus + required: + - name + - status + properties: + name: + title: Name + type: string + status: + $ref: '#/components/schemas/Status' + example: + name: name + status: UPLOADING + HTTPValidationError: + title: HTTPValidationError + description: HTTPValidationError + properties: + detail: + nullable: true + title: detail + type: array + items: + $ref: '#/components/schemas/ValidationError' + example: + detail: + - + msg: msg + loc: + - + anyof_schema_1_validator: anyof_schema_1_validator + actual_instance: '' + any_of_schemas: + - any_of_schemas + - any_of_schemas + anyof_schema_2_validator: 0 + - + anyof_schema_1_validator: anyof_schema_1_validator + actual_instance: '' + any_of_schemas: + - any_of_schemas + - any_of_schemas + anyof_schema_2_validator: 0 + type: type + - + msg: msg + loc: + - + 
anyof_schema_1_validator: anyof_schema_1_validator + actual_instance: '' + any_of_schemas: + - any_of_schemas + - any_of_schemas + anyof_schema_2_validator: 0 + - + anyof_schema_1_validator: anyof_schema_1_validator + actual_instance: '' + any_of_schemas: + - any_of_schemas + - any_of_schemas + anyof_schema_2_validator: 0 + type: type + KeyValuePair: + title: KeyValuePair + description: KeyValuePair + required: + - key + - value + properties: + key: + title: Key + type: string + value: + title: Value + type: string + example: + value: value + key: key + Status: + title: Status + description: allowed enum values + enum: + - UPLOADING + - PROCESSING + - READY + - ERROR type: string - title: any_of_schemas - type: array - title: ValidationErrorLocInner + ValidationError: + title: ValidationError + description: ValidationError + required: + - loc + - msg + - type + properties: + loc: + title: loc + type: array + items: + $ref: '#/components/schemas/ValidationErrorLocInner' + msg: + title: Msg + type: string + type: + title: Type + type: string + example: + msg: msg + loc: + - + anyof_schema_1_validator: anyof_schema_1_validator + actual_instance: '' + any_of_schemas: + - any_of_schemas + - any_of_schemas + anyof_schema_2_validator: 0 + - + anyof_schema_1_validator: anyof_schema_1_validator + actual_instance: '' + any_of_schemas: + - any_of_schemas + - any_of_schemas + anyof_schema_2_validator: 0 + type: type + ValidationErrorLocInner: + title: ValidationErrorLocInner + description: ValidationErrorLocInner + properties: + anyof_schema_1_validator: + nullable: true + title: anyof_schema_1_validator + type: string + anyof_schema_2_validator: + nullable: true + title: anyof_schema_2_validator + type: integer + actual_instance: + title: actual_instance + any_of_schemas: + title: any_of_schemas + type: array + items: + type: string + example: + anyof_schema_1_validator: anyof_schema_1_validator + actual_instance: '' + any_of_schemas: + - any_of_schemas + - any_of_schemas + 
anyof_schema_2_validator: 0 diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api.py b/admin-api-lib/src/admin_api_lib/apis/admin_api.py index 7f3eb1a..ec95b92 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api.py @@ -166,7 +166,7 @@ async def upload_file( ) async def upload_source( request: Request, - type: StrictStr = Query(None, description="", alias="type"), + source_type: StrictStr = Query(None, description="", alias="type"), name: StrictStr = Query(None, description="", alias="name"), key_value_pair: List[KeyValuePair] = Body(None, description=""), ) -> None: diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py index ee1d0a4..e184692 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py @@ -66,7 +66,7 @@ async def get_all_documents_status( async def upload_source( self, - type: StrictStr, + source_type: StrictStr, name: StrictStr, key_value_pair: List[KeyValuePair], request: Request, diff --git a/admin-api-lib/src/admin_api_lib/impl/admin_api.py b/admin-api-lib/src/admin_api_lib/impl/admin_api.py index d2e880a..04cd6df 100644 --- a/admin-api-lib/src/admin_api_lib/impl/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/impl/admin_api.py @@ -90,7 +90,7 @@ async def get_all_documents_status( @inject async def upload_source( self, - type: StrictStr, + source_type: StrictStr, name: StrictStr, kwargs: list[KeyValuePair], request: Request, diff --git a/extractor-api-lib/openapi.yaml b/extractor-api-lib/openapi.yaml index d178a86..205d208 100644 --- a/extractor-api-lib/openapi.yaml +++ b/extractor-api-lib/openapi.yaml @@ -1,149 +1,153 @@ openapi: 3.0.2 info: - title: extractor-api-lib - version: 1.0.0 + title: extractor-api-lib + version: 1.0.0 servers: -- url: / + - + url: / paths: - /extract_from_file: - post: - operationId: extract_from_file_post - 
requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/extraction_request' - required: true - responses: - "200": - content: - application/json: - schema: - items: - $ref: '#/components/schemas/information_piece' - type: array - description: List of extracted information. - "422": - description: Body is not a valid PDF. - "500": - description: Something somewhere went terribly wrong. - tags: - - extractor - /extract_from_source: - post: - operationId: extract_from_source - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/extraction_parameters' - required: true - responses: - "200": - content: - application/json: - schema: - items: - $ref: '#/components/schemas/information_piece' - type: array - description: ok - "404": - description: not found - "422": - description: unprocessable entity - "500": - description: internal server error - tags: - - extractor + /extract_from_file: + post: + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/extraction_request' + required: true + tags: + - extractor + responses: + '200': + content: + application/json: + schema: + type: array + items: + $ref: '#/components/schemas/information_piece' + description: List of extracted information. + '422': + description: Body is not a valid PDF. + '500': + description: Something somewhere went terribly wrong. 
+ operationId: extract_from_file_post + /extract_from_source: + post: + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/extraction_parameters' + required: true + tags: + - extractor + responses: + '200': + content: + application/json: + schema: + type: array + items: + $ref: '#/components/schemas/information_piece' + description: ok + '404': + description: not found + '422': + description: unprocessable entity + '500': + description: internal server error + operationId: extract_from_source components: - schemas: - extraction_request: - description: "" - example: - path_on_s3: path on s3 - properties: - path_on_s3: - description: "" - title: PathOnS3 - type: string - document_name: - description: "" - type: string - required: - - document_name - - path_on_s3 - title: ExtractionRequest - type: object - key_value_pair: - description: "" - example: - value: value - key: key - properties: - key: - description: "" - title: Key - value: - description: "" - title: Value - title: MetaInformationPiece - type: object - content_type: - description: "" - enum: - - IMAGE - - TABLE - - TEXT - title: InformationType - type: string - information_piece: - description: A piece of information that has been extracted. - example: - metadata: - - key: key - value: value - - key: key - value: value - page_content: some text - type: TEXT - properties: - metadata: - description: "" - items: - $ref: '#/components/schemas/key_value_pair' - title: MetaInformation - type: array - page_content: - description: "" - type: string - type: - $ref: '#/components/schemas/content_type' - required: - - metadata - - page_content - - type - title: InformationPiece - type: object - extraction_parameters: - description: "" - properties: - document_name: - description: The name that will be used to store the confluence db in the - key value db and the vectordatabase (metadata.document). 
- title: document_name - type: string - type: - description: Extractortype - title: type - type: string - kwargs: - description: Kwargs for the extractor - items: - $ref: '#/components/schemas/key_value_pair' - title: confluence_kwargs - type: array - required: - - document_name - - type - title: confluence_parameters - type: object + schemas: + extraction_request: + title: ExtractionRequest + description: '' + required: + - document_name + - path_on_s3 + type: object + properties: + path_on_s3: + title: PathOnS3 + description: '' + type: string + document_name: + description: '' + type: string + example: + path_on_s3: path on s3 + key_value_pair: + title: MetaInformationPiece + description: '' + type: object + properties: + key: + title: Key + description: '' + value: + title: Value + description: '' + example: + value: value + key: key + content_type: + title: InformationType + description: '' + enum: + - IMAGE + - TABLE + - TEXT + type: string + information_piece: + title: InformationPiece + description: A piece of information that has been extracted. + required: + - metadata + - page_content + - type + type: object + properties: + metadata: + title: MetaInformation + description: '' + type: array + items: + $ref: '#/components/schemas/key_value_pair' + page_content: + description: '' + type: string + type: + $ref: '#/components/schemas/content_type' + example: + metadata: + - + key: key + value: value + - + key: key + value: value + page_content: some text + type: TEXT + extraction_parameters: + title: confluence_parameters + description: '' + required: + - document_name + - source_type + type: object + properties: + document_name: + title: document_name + description: >- + The name that will be used to store the confluence db in the key value db and the + vectordatabase (metadata.document). 
+ type: string + kwargs: + title: confluence_kwargs + description: Kwargs for the extractor + type: array + items: + $ref: '#/components/schemas/key_value_pair' + source_type: + title: type + description: Extractortype + type: string diff --git a/extractor-api-lib/src/extractor_api_lib/api_endpoints/source_extractor.py b/extractor-api-lib/src/extractor_api_lib/api_endpoints/source_extractor.py index 44b5c38..d656367 100644 --- a/extractor-api-lib/src/extractor_api_lib/api_endpoints/source_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/api_endpoints/source_extractor.py @@ -1,12 +1,7 @@ from abc import ABC, abstractmethod -from typing import Optional from extractor_api_lib.models.extraction_parameters import ExtractionParameters -from pydantic import StrictStr -from fastapi import UploadFile - from extractor_api_lib.models.information_piece import InformationPiece -from extractor_api_lib.models.key_value_pair import KeyValuePair class SourceExtractor(ABC): diff --git a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py index fc3d0ee..7d09897 100644 --- a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py +++ b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py @@ -23,7 +23,6 @@ ) from extractor_api_lib.models.extra_models import TokenModel # noqa: F401 -from typing import Any, List from extractor_api_lib.models.extraction_parameters import ExtractionParameters from extractor_api_lib.models.extraction_request import ExtractionRequest from extractor_api_lib.models.information_piece import InformationPiece diff --git a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py index b1bac98..696c60c 100644 --- a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py +++ b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py @@ -2,7 +2,6 @@ from typing import 
ClassVar, Dict, List, Tuple # noqa: F401 -from typing import Any, List from extractor_api_lib.models.extraction_parameters import ExtractionParameters from extractor_api_lib.models.extraction_request import ExtractionRequest from extractor_api_lib.models.information_piece import InformationPiece diff --git a/extractor-api-lib/src/extractor_api_lib/extractors/information_extractor.py b/extractor-api-lib/src/extractor_api_lib/extractors/information_extractor.py index 35952cf..3a6ee68 100644 --- a/extractor-api-lib/src/extractor_api_lib/extractors/information_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/extractors/information_extractor.py @@ -1,16 +1,10 @@ """Module for the Base class for Information extractors.""" from abc import ABC, abstractmethod -from typing import Optional from extractor_api_lib.models.extraction_parameters import ExtractionParameters -from fastapi import UploadFile -from pydantic import StrictStr - from extractor_api_lib.impl.types.extractor_types import ExtractorTypes -from extractor_api_lib.models.information_piece import InformationPiece -from extractor_api_lib.models.key_value_pair import KeyValuePair from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece diff --git a/extractor-api-lib/src/extractor_api_lib/extractors/information_file_extractor.py b/extractor-api-lib/src/extractor_api_lib/extractors/information_file_extractor.py index e9602d4..7897c19 100644 --- a/extractor-api-lib/src/extractor_api_lib/extractors/information_file_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/extractors/information_file_extractor.py @@ -3,7 +3,6 @@ from abc import ABC, abstractmethod from pathlib import Path -from extractor_api_lib.models.information_piece import InformationPiece from extractor_api_lib.impl.types.file_type import FileType from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece from 
extractor_api_lib.file_services.file_service import FileService diff --git a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_file_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_file_extractor.py index 8ed9e8b..fee7db2 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_file_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_file_extractor.py @@ -4,7 +4,6 @@ from pathlib import Path import tempfile import traceback -from typing import Any, List, Optional from extractor_api_lib.api_endpoints.file_extractor import FileExtractor @@ -12,11 +11,7 @@ from extractor_api_lib.models.extraction_request import ExtractionRequest from extractor_api_lib.file_services.file_service import FileService from extractor_api_lib.extractors.information_file_extractor import InformationFileExtractor -from extractor_api_lib.extractors.information_extractor import InformationExtractor -from extractor_api_lib.impl.types.extractor_types import ExtractorTypes from extractor_api_lib.models.information_piece import InformationPiece -from extractor_api_lib.models.key_value_pair import KeyValuePair -from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece logger = logging.getLogger(__name__) diff --git a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py index 7e135b6..0c5dbe4 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py @@ -1,22 +1,12 @@ """Module for the DefaultFileExtractor class.""" import logging -from typing import Optional from extractor_api_lib.models.extraction_parameters import ExtractionParameters -from pydantic import StrictStr -from fastapi import UploadFile - 
from extractor_api_lib.extractors.information_extractor import InformationExtractor from extractor_api_lib.models.information_piece import InformationPiece -from extractor_api_lib.models.key_value_pair import KeyValuePair from extractor_api_lib.impl.mapper.internal2external_information_piece import Internal2ExternalInformationPiece from extractor_api_lib.api_endpoints.source_extractor import SourceExtractor -from extractor_api_lib.impl.mapper.internal2external_information_piece import Internal2ExternalInformationPiece -from extractor_api_lib.models.information_piece import InformationPiece -from extractor_api_lib.models.key_value_pair import KeyValuePair -from extractor_api_lib.impl.types.extractor_types import ExtractorTypes -from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece logger = logging.getLogger(__name__) diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py b/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py index 50a8623..276f720 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py @@ -1,20 +1,15 @@ """Module for the implementation of the ExtractorApi interface.""" +from fastapi import Depends from dependency_injector.wiring import Provide, inject + from extractor_api_lib.api_endpoints.file_extractor import FileExtractor from extractor_api_lib.api_endpoints.source_extractor import SourceExtractor from extractor_api_lib.models.extraction_parameters import ExtractionParameters from extractor_api_lib.models.extraction_request import ExtractionRequest -from fastapi import Depends, UploadFile - -from pydantic import StrictStr -from typing import Optional from extractor_api_lib.models.information_piece import InformationPiece -from extractor_api_lib.models.key_value_pair import KeyValuePair - from extractor_api_lib.apis.extractor_api_base import BaseExtractorApi from 
extractor_api_lib.dependency_container import DependencyContainer -from extractor_api_lib.models.information_piece import InformationPiece class ExtractorApiImpl(BaseExtractorApi): diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py index 8b1c07e..3cb55f4 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py @@ -1,16 +1,10 @@ """Module for the DefaultConfluenceExtractor class.""" -from typing import Optional - -from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece -from extractor_api_lib.models.extraction_parameters import ExtractionParameters -from pydantic import StrictStr from langchain_community.document_loaders import ConfluenceLoader -from fastapi import UploadFile from extractor_api_lib.impl.types.extractor_types import ExtractorTypes -from extractor_api_lib.models.information_piece import InformationPiece -from extractor_api_lib.models.key_value_pair import KeyValuePair +from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece +from extractor_api_lib.models.extraction_parameters import ExtractionParameters from extractor_api_lib.extractors.information_extractor import InformationExtractor from extractor_api_lib.impl.mapper.confluence_langchain_document2information_piece import ( ConfluenceLangchainDocument2InformationPiece, diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/ms_docs_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/ms_docs_extractor.py index c67425d..5201c62 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/ms_docs_extractor.py +++ 
b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/ms_docs_extractor.py @@ -6,14 +6,12 @@ from typing import Any, Optional import pandas as pd - from unstructured.documents.elements import Element from unstructured.partition.docx import partition_docx from unstructured.partition.pptx import partition_pptx from extractor_api_lib.file_services.file_service import FileService -from extractor_api_lib.models.information_piece import InformationPiece from extractor_api_lib.extractors.information_file_extractor import InformationFileExtractor from extractor_api_lib.impl.types.content_type import ContentType from extractor_api_lib.impl.types.file_type import FileType diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/pdf_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/pdf_extractor.py index 8d5bd35..928998f 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/pdf_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/pdf_extractor.py @@ -21,7 +21,6 @@ from extractor_api_lib.impl.utils.utils import hash_datetime from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece from extractor_api_lib.table_converter.dataframe_converter import DataframeConverter -from extractor_api_lib.models.information_piece import InformationPiece from extractor_api_lib.file_services.file_service import FileService from extractor_api_lib.extractors.information_file_extractor import InformationFileExtractor diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/xml_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/xml_extractor.py index e7523b6..d72292a 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/xml_extractor.py +++ 
b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/xml_extractor.py @@ -11,7 +11,6 @@ from extractor_api_lib.file_services.file_service import FileService from extractor_api_lib.extractors.information_file_extractor import InformationFileExtractor -from extractor_api_lib.models.information_piece import InformationPiece from extractor_api_lib.impl.types.content_type import ContentType from extractor_api_lib.impl.types.file_type import FileType from extractor_api_lib.impl.utils.utils import hash_datetime diff --git a/extractor-api-lib/src/extractor_api_lib/impl/mapper/confluence_langchain_document2information_piece.py b/extractor-api-lib/src/extractor_api_lib/impl/mapper/confluence_langchain_document2information_piece.py index 85b92bd..a7bcb0d 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/mapper/confluence_langchain_document2information_piece.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/mapper/confluence_langchain_document2information_piece.py @@ -1,12 +1,9 @@ """Module for the ConfluenceLangchainDocument2InformationPiece class.""" -from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece from langchain_core.documents import Document as LangchainDocument -from extractor_api_lib.models.confluence_parameters import ConfluenceParameters +from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece from extractor_api_lib.models.content_type import ContentType -from extractor_api_lib.models.information_piece import InformationPiece -from extractor_api_lib.models.key_value_pair import KeyValuePair as MetaInformationPiece class ConfluenceLangchainDocument2InformationPiece: diff --git a/extractor-api-lib/src/extractor_api_lib/impl/mapper/internal2external_information_piece.py b/extractor-api-lib/src/extractor_api_lib/impl/mapper/internal2external_information_piece.py index 11f57b4..ee611cb 100644 --- 
a/extractor-api-lib/src/extractor_api_lib/impl/mapper/internal2external_information_piece.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/mapper/internal2external_information_piece.py @@ -5,9 +5,7 @@ from extractor_api_lib.models.dataclasses.internal_information_piece import ( InternalInformationPiece as InternalInformationPiece, ) -from extractor_api_lib.models.information_piece import ( - InformationPiece as ExternalInformationPiece, -) +from extractor_api_lib.models.information_piece import InformationPiece from extractor_api_lib.models.key_value_pair import KeyValuePair as MetaInformationPiece @@ -27,7 +25,7 @@ class Internal2ExternalInformationPiece: InternalContentType.TABLE: ExternalContentType.TABLE, } - def map_internal_to_external(self, internal: InternalInformationPiece) -> ExternalInformationPiece: + def map_internal_to_external(self, internal: InternalInformationPiece) -> InformationPiece: """Map an InternalInformationPiece object to an ExternalInformationPiece object. 
Parameters From 9f99eebfce0525387f41b2899edbf8411e2364e4 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Fri, 16 May 2025 16:01:09 +0200 Subject: [PATCH 16/43] name change --- .../openapi_client/models/extraction_parameters.py | 6 +++--- .../src/extractor_api_lib/models/extraction_parameters.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_parameters.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_parameters.py index 37db1e8..13ba2ea 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_parameters.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_parameters.py @@ -30,9 +30,9 @@ class ExtractionParameters(BaseModel): document_name: StrictStr = Field( description="The name that will be used to store the confluence db in the key value db and the vectordatabase (metadata.document)." 
) - type: StrictStr = Field(description="Extractortype") kwargs: Optional[List[KeyValuePair]] = Field(default=None, description="Kwargs for the extractor") - __properties: ClassVar[List[str]] = ["document_name", "type", "kwargs"] + source_type: StrictStr = Field(description="Extractortype") + __properties: ClassVar[List[str]] = ["document_name", "kwargs", "source_type"] model_config = ConfigDict( populate_by_name=True, @@ -92,12 +92,12 @@ def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]: _obj = cls.model_validate( { "document_name": obj.get("document_name"), - "type": obj.get("type"), "kwargs": ( [KeyValuePair.from_dict(_item) for _item in obj["kwargs"]] if obj.get("kwargs") is not None else None ), + "source_type": obj.get("source_type"), } ) return _obj diff --git a/extractor-api-lib/src/extractor_api_lib/models/extraction_parameters.py b/extractor-api-lib/src/extractor_api_lib/models/extraction_parameters.py index d701978..e18a452 100644 --- a/extractor-api-lib/src/extractor_api_lib/models/extraction_parameters.py +++ b/extractor-api-lib/src/extractor_api_lib/models/extraction_parameters.py @@ -34,9 +34,9 @@ class ExtractionParameters(BaseModel): document_name: StrictStr = Field( description="The name that will be used to store the confluence db in the key value db and the vectordatabase (metadata.document)." 
) - type: StrictStr = Field(description="Extractortype") kwargs: Optional[List[KeyValuePair]] = Field(default=None, description="Kwargs for the extractor") - __properties: ClassVar[List[str]] = ["document_name", "type", "kwargs"] + source_type: StrictStr = Field(description="Extractortype") + __properties: ClassVar[List[str]] = ["document_name", "kwargs", "source_type"] model_config = { "populate_by_name": True, @@ -94,12 +94,12 @@ def from_dict(cls, obj: Dict) -> Self: _obj = cls.model_validate( { "document_name": obj.get("document_name"), - "type": obj.get("type"), "kwargs": ( [KeyValuePair.from_dict(_item) for _item in obj.get("kwargs")] if obj.get("kwargs") is not None else None ), + "source_type": obj.get("source_type"), } ) return _obj From 82d27d11d75287d2dc1230e92564011d4f308ee2 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Mon, 19 May 2025 07:51:58 +0200 Subject: [PATCH 17/43] lint --- .../impl/mapper/internal2external_information_piece.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/extractor-api-lib/src/extractor_api_lib/impl/mapper/internal2external_information_piece.py b/extractor-api-lib/src/extractor_api_lib/impl/mapper/internal2external_information_piece.py index ee611cb..6c4d6b8 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/mapper/internal2external_information_piece.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/mapper/internal2external_information_piece.py @@ -2,9 +2,7 @@ from extractor_api_lib.impl.types.content_type import ContentType as InternalContentType from extractor_api_lib.models.content_type import ContentType as ExternalContentType -from extractor_api_lib.models.dataclasses.internal_information_piece import ( - InternalInformationPiece as InternalInformationPiece, -) +from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece from extractor_api_lib.models.information_piece import InformationPiece from extractor_api_lib.models.key_value_pair 
import KeyValuePair as MetaInformationPiece @@ -40,7 +38,7 @@ def map_internal_to_external(self, internal: InternalInformationPiece) -> Inform """ information_type = self._map_information_type(internal.type) meta = self._map_meta(internal.metadata) - return ExternalInformationPiece(page_content=internal.page_content, type=information_type, metadata=meta) + return InformationPiece(page_content=internal.page_content, type=information_type, metadata=meta) def _map_information_type(self, internal: InternalContentType) -> ExternalContentType: return self.TYPE_LOOKUP_TABLE[internal] From c752478055a1dbb19f3d72d5011c4019033b1922 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Mon, 19 May 2025 08:02:46 +0200 Subject: [PATCH 18/43] reset poetry.lock --- admin-api-lib/poetry.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/admin-api-lib/poetry.lock b/admin-api-lib/poetry.lock index bd12f09..223c2a5 100644 --- a/admin-api-lib/poetry.lock +++ b/admin-api-lib/poetry.lock @@ -3693,4 +3693,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.1" python-versions = "^3.13" -content-hash = "f34effb5fa2b12b05da69ca28c62764dc2017a2a2a9336b5265428005004e7ec" +content-hash = "99eff6a6ab91512602e8e3094b71bdba096ccf58746d47afd92dff99b24da487" \ No newline at end of file From ee8f3c723f291f984d359c706873424fe6e72a01 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Tue, 20 May 2025 08:05:41 +0200 Subject: [PATCH 19/43] fix tests --- admin-api-lib/tests/settings/__init__.py | 0 .../settings/confluence_settings_test.py | 108 ------------------ rag-core-api/src/rag_core_api/apis/rag_api.py | 30 +---- .../src/rag_core_api/apis/rag_api_base.py | 8 +- .../src/rag_core_api/models/chat_history.py | 10 +- .../models/chat_history_message.py | 10 +- .../src/rag_core_api/models/chat_request.py | 12 +- .../src/rag_core_api/models/chat_response.py | 12 +- .../src/rag_core_api/models/chat_role.py | 2 +- .../src/rag_core_api/models/content_type.py | 2 +- 
.../src/rag_core_api/models/delete_request.py | 10 +- .../rag_core_api/models/information_piece.py | 10 +- .../src/rag_core_api/models/key_value_pair.py | 9 +- 13 files changed, 47 insertions(+), 176 deletions(-) delete mode 100644 admin-api-lib/tests/settings/__init__.py delete mode 100644 admin-api-lib/tests/settings/confluence_settings_test.py diff --git a/admin-api-lib/tests/settings/__init__.py b/admin-api-lib/tests/settings/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/admin-api-lib/tests/settings/confluence_settings_test.py b/admin-api-lib/tests/settings/confluence_settings_test.py deleted file mode 100644 index a98fe7b..0000000 --- a/admin-api-lib/tests/settings/confluence_settings_test.py +++ /dev/null @@ -1,108 +0,0 @@ -import pytest -from admin_api_lib.impl.settings.confluence_settings import ConfluenceSettings -from admin_api_lib.impl.utils.comma_separated_str_list import CommaSeparatedStrList -from admin_api_lib.impl.utils.comma_separated_bool_list import CommaSeparatedBoolList - - -def test_default_values(): - # When no settings are provided, all lists default to empty lists. - settings = ConfluenceSettings() - assert settings.url == CommaSeparatedStrList() - assert settings.token == CommaSeparatedStrList() - assert settings.space_key == CommaSeparatedStrList() - assert settings.document_name == CommaSeparatedStrList() - # Bool lists are empty by default if no url is provided. - assert settings.verify_ssl == CommaSeparatedBoolList() - assert settings.include_attachments == CommaSeparatedBoolList() - assert settings.keep_markdown_format == CommaSeparatedBoolList() - assert settings.keep_newlines == CommaSeparatedBoolList() - - -def test_valid_initialization_matching_lengths(): - # Provide all settings with matching lengths. 
- urls = "http://confluence1, http://confluence2" - tokens = "token1, token2" - space_keys = "SPACE1, SPACE2" - document_names = "Doc1, Doc2" - verify_ssl = "True, False" - include_attachments = "False, True" - keep_markdown_format = "True, True" - keep_newlines = "False, False" - - settings = ConfluenceSettings( - url=urls, - token=tokens, - space_key=space_keys, - document_name=document_names, - verify_ssl=verify_ssl, - include_attachments=include_attachments, - keep_markdown_format=keep_markdown_format, - keep_newlines=keep_newlines, - ) - - # Verify that the comma separated lists have been properly parsed. - assert settings.url == CommaSeparatedStrList(["http://confluence1", "http://confluence2"]) - assert settings.token == CommaSeparatedStrList(["token1", "token2"]) - assert settings.space_key == CommaSeparatedStrList(["SPACE1", "SPACE2"]) - assert settings.document_name == CommaSeparatedStrList(["Doc1", "Doc2"]) - assert settings.verify_ssl == CommaSeparatedBoolList([True, False]) - assert settings.include_attachments == CommaSeparatedBoolList([False, True]) - assert settings.keep_markdown_format == CommaSeparatedBoolList([True, True]) - assert settings.keep_newlines == CommaSeparatedBoolList([False, False]) - - -def test_mismatched_list_lengths(): - # Provide mismatched lengths for comma separated fields, should raise ValueError. - urls = "http://confluence1, http://confluence2, http://confluence3" - tokens = "token1, token2" # shorter than url list - with pytest.raises(ValueError): - ConfluenceSettings( - url=urls, - token=tokens, - space_key="SPACE1, SPACE2, SPACE3", - document_name="Doc1, Doc2, Doc3", - ) - - -def test_default_bool_values_when_missing(): - # Provide only url and leave bool fields empty to see if they are set to defaults. 
- urls = "http://confluence1, http://confluence2, http://confluence3" - settings = ConfluenceSettings( - url=urls, - token="token1, token2, token3", - space_key="SPACE1, SPACE2, SPACE3", - document_name="Doc1, Doc2, Doc3", - ) - # Defaults for bool fields: verify_ssl True, include_attachments False, - # keep_markdown_format True, keep_newlines True, for each entry. - expected_verify_ssl = CommaSeparatedBoolList([True, True, True]) - expected_include_attachments = CommaSeparatedBoolList([False, False, False]) - expected_keep_markdown_format = CommaSeparatedBoolList([True, True, True]) - expected_keep_newlines = CommaSeparatedBoolList([True, True, True]) - assert settings.verify_ssl == expected_verify_ssl - assert settings.include_attachments == expected_include_attachments - assert settings.keep_markdown_format == expected_keep_markdown_format - assert settings.keep_newlines == expected_keep_newlines - - -def test_bool_fields_not_overwritten_when_provided(): - # Provide bool fields explicitly; they should not be overwritten by defaults. 
- urls = "http://confluence1, http://confluence2" - settings = ConfluenceSettings( - url=urls, - token="token1, token2", - space_key="SPACE1, SPACE2", - document_name="Doc1, Doc2", - verify_ssl="False, False", - include_attachments="True, True", - keep_markdown_format="False, False", - keep_newlines="False, True", - ) - expected_verify_ssl = CommaSeparatedBoolList([False, False]) - expected_include_attachments = CommaSeparatedBoolList([True, True]) - expected_keep_markdown_format = CommaSeparatedBoolList([False, False]) - expected_keep_newlines = CommaSeparatedBoolList([False, True]) - assert settings.verify_ssl == expected_verify_ssl - assert settings.include_attachments == expected_include_attachments - assert settings.keep_markdown_format == expected_keep_markdown_format - assert settings.keep_newlines == expected_keep_newlines diff --git a/rag-core-api/src/rag_core_api/apis/rag_api.py b/rag-core-api/src/rag_core_api/apis/rag_api.py index fb432c6..dda92db 100644 --- a/rag-core-api/src/rag_core_api/apis/rag_api.py +++ b/rag-core-api/src/rag_core_api/apis/rag_api.py @@ -3,16 +3,12 @@ # coding: utf-8 # flake8: noqa: D105 -from asyncio import FIRST_COMPLETED, CancelledError, create_task, wait -from contextlib import suppress -import logging -from time import sleep -from typing import Dict, List # noqa: F401 import importlib +import logging import pkgutil - -from rag_core_api.apis.rag_api_base import BaseRagApi -import openapi_server.impl +from asyncio import FIRST_COMPLETED, CancelledError, create_task, sleep, wait +from contextlib import suppress +from typing import Any, Awaitable, List # noqa: F401 from fastapi import ( # noqa: F401 APIRouter, @@ -33,17 +29,11 @@ import rag_core_api.impl from rag_core_api.apis.rag_api_base import BaseRagApi -from rag_core_api.models.extra_models import TokenModel # noqa: F401 -from pydantic import Field, StrictStr -from typing import Any, List -import logging -from typing_extensions import Annotated from 
rag_core_api.models.chat_request import ChatRequest from rag_core_api.models.chat_response import ChatResponse from rag_core_api.models.delete_request import DeleteRequest from rag_core_api.models.information_piece import InformationPiece - logger = logging.getLogger(__name__) router = APIRouter() @@ -74,10 +64,8 @@ async def _disconnected(request: Request) -> None: ) async def chat( request: Request, - session_id: StrictStr = Path(..., description=""), - chat_request: Annotated[ChatRequest, Field(description="Chat with RAG.")] = Body( - None, description="Chat with RAG." - ), + session_id: str = Path(..., description=""), + chat_request: ChatRequest = Body(None, description="Chat with RAG."), ) -> ChatResponse | None: """ Asynchronously handles the chat endpoint for the RAG API. @@ -141,8 +129,6 @@ async def evaluate() -> None: ------- None """ - if not BaseRagApi.subclasses: - raise HTTPException(status_code=500, detail="Not implemented") return await BaseRagApi.subclasses[0]().evaluate() @@ -175,8 +161,6 @@ async def remove_information_piece( ------- None """ - if not BaseRagApi.subclasses: - raise HTTPException(status_code=500, detail="Not implemented") return await BaseRagApi.subclasses[0]().remove_information_piece(delete_request) @@ -208,6 +192,4 @@ async def upload_information_piece( ------- None """ - if not BaseRagApi.subclasses: - raise HTTPException(status_code=500, detail="Not implemented") return await BaseRagApi.subclasses[0]().upload_information_piece(information_piece) diff --git a/rag-core-api/src/rag_core_api/apis/rag_api_base.py b/rag-core-api/src/rag_core_api/apis/rag_api_base.py index 0b53f4b..615230d 100644 --- a/rag-core-api/src/rag_core_api/apis/rag_api_base.py +++ b/rag-core-api/src/rag_core_api/apis/rag_api_base.py @@ -2,11 +2,9 @@ # coding: utf-8 # flake8: noqa: D105 + from typing import ClassVar, Dict, List, Tuple # noqa: F401 -from pydantic import Field, StrictStr -from typing import Any, List -from typing_extensions import Annotated 
from rag_core_api.models.chat_request import ChatRequest from rag_core_api.models.chat_response import ChatResponse from rag_core_api.models.delete_request import DeleteRequest @@ -33,8 +31,8 @@ def __init_subclass__(cls, **kwargs): async def chat( self, - session_id: StrictStr, - chat_request: Annotated[ChatRequest, Field(description="Chat with RAG.")], + session_id: str, + chat_request: ChatRequest, ) -> ChatResponse: """ Asynchronously handles the chat endpoint for the RAG API. diff --git a/rag-core-api/src/rag_core_api/models/chat_history.py b/rag-core-api/src/rag_core_api/models/chat_history.py index 9087afe..5980dca 100644 --- a/rag-core-api/src/rag_core_api/models/chat_history.py +++ b/rag-core-api/src/rag_core_api/models/chat_history.py @@ -13,13 +13,14 @@ from __future__ import annotations + +import json import pprint import re # noqa: F401 -import json - +from typing import Any, ClassVar, Dict, List from pydantic import BaseModel, ConfigDict -from typing import Any, ClassVar, Dict, List + from rag_core_api.models.chat_history_message import ChatHistoryMessage try: @@ -46,8 +47,7 @@ def to_str(self) -> str: def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead - return json.dumps(self.to_dict()) + return self.model_dump_json(by_alias=True, exclude_unset=True) @classmethod def from_json(cls, json_str: str) -> Self: diff --git a/rag-core-api/src/rag_core_api/models/chat_history_message.py b/rag-core-api/src/rag_core_api/models/chat_history_message.py index c9d782b..c664092 100644 --- a/rag-core-api/src/rag_core_api/models/chat_history_message.py +++ b/rag-core-api/src/rag_core_api/models/chat_history_message.py @@ -13,13 +13,14 @@ from __future__ import annotations + +import json import pprint import re # noqa: F401 -import json - +from typing import Any, ClassVar, Dict, List from pydantic import BaseModel, ConfigDict, StrictStr -from 
typing import Any, ClassVar, Dict, List + from rag_core_api.models.chat_role import ChatRole try: @@ -47,8 +48,7 @@ def to_str(self) -> str: def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead - return json.dumps(self.to_dict()) + return self.model_dump_json(by_alias=True, exclude_unset=True) @classmethod def from_json(cls, json_str: str) -> Self: diff --git a/rag-core-api/src/rag_core_api/models/chat_request.py b/rag-core-api/src/rag_core_api/models/chat_request.py index 66090ef..1e0b135 100644 --- a/rag-core-api/src/rag_core_api/models/chat_request.py +++ b/rag-core-api/src/rag_core_api/models/chat_request.py @@ -13,13 +13,14 @@ from __future__ import annotations + +import json import pprint import re # noqa: F401 -import json - +from typing import Any, ClassVar, Dict, List, Optional from pydantic import BaseModel, ConfigDict, StrictStr -from typing import Any, ClassVar, Dict, List, Optional + from rag_core_api.models.chat_history import ChatHistory try: @@ -47,8 +48,7 @@ def to_str(self) -> str: def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead - return json.dumps(self.to_dict()) + return self.model_dump_json(by_alias=True, exclude_unset=True) @classmethod def from_json(cls, json_str: str) -> Self: @@ -86,7 +86,7 @@ def from_dict(cls, obj: Dict) -> Self: _obj = cls.model_validate( { - "history": ChatHistory.from_dict(obj.get("history")) if obj.get("history") is not None else None, + "history": (ChatHistory.from_dict(obj.get("history")) if obj.get("history") is not None else None), "message": obj.get("message"), } ) diff --git a/rag-core-api/src/rag_core_api/models/chat_response.py b/rag-core-api/src/rag_core_api/models/chat_response.py index ba8c6b1..a0fcf44 100644 --- 
a/rag-core-api/src/rag_core_api/models/chat_response.py +++ b/rag-core-api/src/rag_core_api/models/chat_response.py @@ -13,13 +13,14 @@ from __future__ import annotations + +import json import pprint import re # noqa: F401 -import json - +from typing import Any, ClassVar, Dict, List from pydantic import BaseModel, ConfigDict, Field, StrictStr -from typing import Any, ClassVar, Dict, List + from rag_core_api.models.information_piece import InformationPiece try: @@ -48,8 +49,7 @@ def to_str(self) -> str: def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead - return json.dumps(self.to_dict()) + return self.model_dump_json(by_alias=True, exclude_unset=True) @classmethod def from_json(cls, json_str: str) -> Self: @@ -94,7 +94,7 @@ def from_dict(cls, obj: Dict) -> Self: "answer": obj.get("answer"), "finish_reason": obj.get("finish_reason"), "citations": ( - [InformationPiece.from_dict(_item) for _item in obj.get("citations")] + [SourceDocument.from_dict(_item) for _item in obj.get("citations")] if obj.get("citations") is not None else None ), diff --git a/rag-core-api/src/rag_core_api/models/chat_role.py b/rag-core-api/src/rag_core_api/models/chat_role.py index 7e1c88d..cd2ff17 100644 --- a/rag-core-api/src/rag_core_api/models/chat_role.py +++ b/rag-core-api/src/rag_core_api/models/chat_role.py @@ -13,12 +13,12 @@ from __future__ import annotations + import json import pprint import re # noqa: F401 from enum import Enum - try: from typing import Self except ImportError: diff --git a/rag-core-api/src/rag_core_api/models/content_type.py b/rag-core-api/src/rag_core_api/models/content_type.py index 7f4d874..3d39928 100644 --- a/rag-core-api/src/rag_core_api/models/content_type.py +++ b/rag-core-api/src/rag_core_api/models/content_type.py @@ -13,12 +13,12 @@ from __future__ import annotations + import json import pprint import re # noqa: F401 from enum 
import Enum - try: from typing import Self except ImportError: diff --git a/rag-core-api/src/rag_core_api/models/delete_request.py b/rag-core-api/src/rag_core_api/models/delete_request.py index 8b40339..797dcf2 100644 --- a/rag-core-api/src/rag_core_api/models/delete_request.py +++ b/rag-core-api/src/rag_core_api/models/delete_request.py @@ -13,13 +13,14 @@ from __future__ import annotations + +import json import pprint import re # noqa: F401 -import json - +from typing import Any, ClassVar, Dict, List, Optional from pydantic import BaseModel, ConfigDict -from typing import Any, ClassVar, Dict, List, Optional + from rag_core_api.models.key_value_pair import KeyValuePair try: @@ -46,8 +47,7 @@ def to_str(self) -> str: def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead - return json.dumps(self.to_dict()) + return self.model_dump_json(by_alias=True, exclude_unset=True) @classmethod def from_json(cls, json_str: str) -> Self: diff --git a/rag-core-api/src/rag_core_api/models/information_piece.py b/rag-core-api/src/rag_core_api/models/information_piece.py index dfe8a42..b85092f 100644 --- a/rag-core-api/src/rag_core_api/models/information_piece.py +++ b/rag-core-api/src/rag_core_api/models/information_piece.py @@ -13,13 +13,14 @@ from __future__ import annotations + +import json import pprint import re # noqa: F401 -import json - +from typing import Any, ClassVar, Dict, List from pydantic import BaseModel, ConfigDict, Field, StrictStr -from typing import Any, ClassVar, Dict, List + from rag_core_api.models.content_type import ContentType from rag_core_api.models.key_value_pair import KeyValuePair @@ -53,8 +54,7 @@ def to_str(self) -> str: def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead - return json.dumps(self.to_dict()) + 
return self.model_dump_json(by_alias=True, exclude_unset=True) @classmethod def from_json(cls, json_str: str) -> Self: diff --git a/rag-core-api/src/rag_core_api/models/key_value_pair.py b/rag-core-api/src/rag_core_api/models/key_value_pair.py index 3079959..abf0986 100644 --- a/rag-core-api/src/rag_core_api/models/key_value_pair.py +++ b/rag-core-api/src/rag_core_api/models/key_value_pair.py @@ -13,13 +13,13 @@ from __future__ import annotations + +import json import pprint import re # noqa: F401 -import json - +from typing import Any, ClassVar, Dict, List from pydantic import BaseModel, ConfigDict, Field, StrictStr -from typing import Any, ClassVar, Dict, List try: from typing import Self @@ -48,8 +48,7 @@ def to_str(self) -> str: def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead - return json.dumps(self.to_dict()) + return self.model_dump_json(by_alias=True, exclude_unset=True) @classmethod def from_json(cls, json_str: str) -> Self: From ef8dd202c5f94ffe1cfdfc7bdd9272b9d2dd2df9 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Tue, 20 May 2025 09:07:35 +0200 Subject: [PATCH 20/43] update doc for admin api --- README.md | 15 ++++---- .../api_endpoints/file_uploader.py | 17 ++++++++- .../api_endpoints/source_uploader.py | 23 ++++++++++- .../api_endpoints/default_file_uploader.py | 38 ++++++++++++++++++- .../api_endpoints/default_source_uploader.py | 38 +++++++++++++++++++ 5 files changed, 120 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 13d9231..159833c 100644 --- a/README.md +++ b/README.md @@ -105,8 +105,8 @@ The following endpoints are provided by the *admin-api-lib*: - `/delete_document/{identification}`: Deletes the file from storage (if applicable) and vector database. The `identification` can be retrieved from the `/all_documents_status` endpoint. 
- `/document_reference/{identification}`: Returns the document. - `/all_documents_status`: Return the `identification` and status of all available sources. -- `/upload_documents`: Endpoint to upload files. -- `/load_confluence`: Endpoint to load a confluence space +- `/upload_file`: Endpoint to upload files. +- `/upload_source`: Endpoint to upload non-file sources. ### 2.1 Requirements @@ -135,14 +135,15 @@ Will return the source document stored in the connected storage system. Will return a list of all sources for the chat and their current status. -#### `/upload_documents` +#### `/upload_file` Files can be uploaded here. This endpoint will process the document in a background and will extract information using the [document-extractor](#3-extractor-api-lib). The extracted information will be summarized using a LLM. The summary, as well as the unrefined extracted document, will be uploaded to the [rag-core-api](#1-rag-core-api). -#### `/load_confluence` +#### `/upload_source` -Loads all the content of a confluence space using the [document-extractor](#3-extractor-api-lib). +Loads all the content from an abritrary non-file source using the [document-extractor](#3-extractor-api-lib). +The `type`of the source needs to correspond to an extractor in the [document-extractor](#3-extractor-api-lib). The extracted information will be summarized using LLM. The summary, as well as the unrefined extracted document, will be uploaded to the [rag-core-api](#1-rag-core-api). ### 2.3 Replaceable parts @@ -162,9 +163,9 @@ The extracted information will be summarized using LLM. The summary, as well as | information_enhancer | [`rag_core_lib.chains.async_chain.AsyncChain[Any, Any]`](./rag-core-lib/src/rag_core_lib/chains/async_chain.py)| [`rag_core_lib.impl.tracers.langfuse_traced_chain.LangfuseTracedGraph`](./rag-core-lib/src/rag_core_lib/impl/tracers/langfuse_traced_chain.py) |Wraps around the *untraced_information_enhancer* and adds langfuse tracing. 
| | document_deleter |[`admin_api_lib.api_endpoints.document_deleter.DocumentDeleter`](./admin-api-lib/src/admin_api_lib/api_endpoints/document_deleter.py) | [`admin_api_lib.impl.api_endpoints.default_document_deleter.DefaultDocumentDeleter`](./admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_document_deleter.py) | Handles deletion of sources. | | documents_status_retriever | [`admin_api_lib.api_endpoints.documents_status_retriever.DocumentsStatusRetriever`](./admin-api-lib/src/admin_api_lib/api_endpoints/documents_status_retriever.py) | [`admin_api_lib.impl.api_endpoints.default_documents_status_retriever.DefaultDocumentsStatusRetriever`](./admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_documents_status_retriever.py) |Handles return of source status. | -| confluence_loader | [`admin_api_lib.api_endpoints.confluence_loader.ConfluenceLoader`](./admin-api-lib/src/admin_api_lib/api_endpoints/confluence_loader.py) | [`admin_api_lib.impl.api_endpoints.default_confluence_loader.DefaultConfluenceLoader`](./admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_confluence_loader.py)| Handles data loading and extraction from confluence. | +| source_uploader | [`admin_api_lib.api_endpoints.source_uploader.SourceUploader`](./admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py) | [`admin_api_lib.impl.api_endpoints.default_source_uploader.DefaultSourceUploader`](./admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py)| Handles data loading and extraction from various non-file sources. 
| | document_reference_retriever | [`admin_api_lib.api_endpoints.document_reference_retriever.DocumentReferenceRetriever`](./admin-api-lib/src/admin_api_lib/api_endpoints/document_reference_retriever.py) | [`admin_api_lib.impl.api_endpoints.default_document_reference_retriever.DefaultDocumentReferenceRetriever`](./admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_document_reference_retriever.py) | Handles return of files from connected storage. | -| document_uploader | [`admin_api_lib.api_endpoints.document_uploader.DocumentUploader`](./admin-api-lib/src/admin_api_lib/api_endpoints/document_uploader.py) | [`admin_api_lib.impl.api_endpoints.default_document_uploader.DefaultDocumentUploader`](./admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_document_uploader.py) | Handles upload and extraction of files. | +| file_uploader | [`admin_api_lib.api_endpoints.file_uploader.FileUploader`](./admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py) | [`admin_api_lib.impl.api_endpoints.default_file_uploader.DefaultFileUploader`](./admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py) | Handles upload and extraction of files. | ## 3. Extractor API Lib diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py b/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py index 2a33545..3dad40c 100644 --- a/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py +++ b/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py @@ -1,3 +1,4 @@ +"""Module for the upload file endpoint.""" from abc import ABC, abstractmethod from fastapi import UploadFile @@ -10,4 +11,18 @@ async def upload_file( self, base_url: str, file: UploadFile, - ) -> None: ... + ) -> None: + """ + Uploads a source file for content extraction. + + Parameters + ---------- + base_url : str + The base url of the service. Is used to determine the download link of the file. + file : UploadFile + The file to process. 
+ + Returns + ------- + None + """ \ No newline at end of file diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py b/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py index 3f9c15a..f135b54 100644 --- a/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py @@ -1,3 +1,4 @@ +"""Module for the upload source endpoint.""" from abc import ABC, abstractmethod from pydantic import StrictStr @@ -6,7 +7,7 @@ class SourceUploader(ABC): - + """Abstract base class for source upload.""" @abstractmethod async def upload_source( self, @@ -14,4 +15,22 @@ async def upload_source( source_type: StrictStr, name: StrictStr, kwargs: list[KeyValuePair], - ) -> None: ... + ) -> None: + """ + Uploads the parameters for source content extraction. + + Parameters + ---------- + base_url : str + The base url of the service. Is used to determine the download link of the source. + source_type : str + The type of the source. Is used by the extractor service to determine the correct extraction method. + name : str + Display name of the source. + kwargs : list[KeyValuePair] + List of KeyValuePair with parameters used for the extraction. 
+ + Returns + ------- + None + """ \ No newline at end of file diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py index 62b6448..80db150 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py @@ -28,7 +28,7 @@ class DefaultFileUploader(FileUploader): - + """The DefaultFileUploader is responsible for adding a new source file document to the available content.""" def __init__( self, extractor_api: ExtractorApi, @@ -40,6 +40,28 @@ def __init__( information_mapper: InformationPiece2Document, file_service: FileService, ): + """ + Initialize the DefaultFileUploader. + + Parameters + ---------- + extractor_api : ExtractorApi + Client for the Extraction service. + key_value_store : FileStatusKeyValueStore + The key-value store for storing filename and the corresponding status. + information_enhancer : InformationEnhancer + The service for enhancing information. + chunker : Chunker + The service for chunking documents into chunks. + document_deleter : DocumentDeleter + The service for deleting documents. + rag_api : RagApi + The API for RAG backend. + information_mapper : InformationPiece2Document + The mapper for converting information pieces to langchain documents. + file_service : FileService + The service for handling file operations on the S3 storage + """ self._extractor_api = extractor_api self._rag_api = rag_api self._key_value_store = key_value_store @@ -55,6 +77,20 @@ async def upload_file( base_url: str, file: UploadFile, ) -> None: + """ + Uploads a source file for content extraction. + + Parameters + ---------- + base_url : str + The base url of the service. Is used to determine the download link of the file. + file : UploadFile + The file to process. 
+ + Returns + ------- + None + """ self._background_threads = [t for t in self._background_threads if t.is_alive()] try: diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py index f843fa4..db9fe6c 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py @@ -36,6 +36,26 @@ def __init__( rag_api: RagApi, information_mapper: InformationPiece2Document, ): + """ + Initialize the DefaultSourceUploader. + + Parameters + ---------- + extractor_api : ExtractorApi + Client for the Extraction service. + key_value_store : FileStatusKeyValueStore + The key-value store for storing filename and the corresponding status. + information_enhancer : InformationEnhancer + The service for enhancing information. + chunker : Chunker + The service for chunking documents into chunks. + document_deleter : DocumentDeleter + The service for deleting documents. + rag_api : RagApi + The API for RAG backend. + information_mapper : InformationPiece2Document + The mapper for converting information pieces to langchain documents. + """ self._extractor_api = extractor_api self._rag_api = rag_api self._key_value_store = key_value_store @@ -52,6 +72,24 @@ async def upload_source( name: StrictStr, kwargs: list[KeyValuePair], ) -> None: + """ + Uploads the parameters for source content extraction. + + Parameters + ---------- + base_url : str + The base url of the service. Is used to determine the download link of the source. + source_type : str + The type of the source. Is used by the extractor service to determine the correct extraction method. + name : str + Display name of the source. + kwargs : list[KeyValuePair] + List of KeyValuePair with parameters used for the extraction. 
+ + Returns + ------- + None + """ self._background_threads = [t for t in self._background_threads if t.is_alive()] source_name = f"{source_type}:{sanitize_document_name(name)}" try: From a86f76c2b9851cc92c54574d6b6d5ce44c4e4598 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Tue, 20 May 2025 09:08:04 +0200 Subject: [PATCH 21/43] black --- .../src/admin_api_lib/api_endpoints/file_uploader.py | 5 +++-- .../src/admin_api_lib/api_endpoints/source_uploader.py | 6 ++++-- .../impl/api_endpoints/default_file_uploader.py | 1 + .../impl/api_endpoints/default_source_uploader.py | 2 +- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py b/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py index 3dad40c..b8594c7 100644 --- a/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py +++ b/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py @@ -1,4 +1,5 @@ """Module for the upload file endpoint.""" + from abc import ABC, abstractmethod from fastapi import UploadFile @@ -11,7 +12,7 @@ async def upload_file( self, base_url: str, file: UploadFile, - ) -> None: + ) -> None: """ Uploads a source file for content extraction. 
@@ -25,4 +26,4 @@ async def upload_file( Returns ------- None - """ \ No newline at end of file + """ diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py b/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py index f135b54..f4b4e03 100644 --- a/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py @@ -1,4 +1,5 @@ """Module for the upload source endpoint.""" + from abc import ABC, abstractmethod from pydantic import StrictStr @@ -8,6 +9,7 @@ class SourceUploader(ABC): """Abstract base class for source upload.""" + @abstractmethod async def upload_source( self, @@ -15,7 +17,7 @@ async def upload_source( source_type: StrictStr, name: StrictStr, kwargs: list[KeyValuePair], - ) -> None: + ) -> None: """ Uploads the parameters for source content extraction. @@ -33,4 +35,4 @@ async def upload_source( Returns ------- None - """ \ No newline at end of file + """ diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py index 80db150..b9b367f 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py @@ -29,6 +29,7 @@ class DefaultFileUploader(FileUploader): """The DefaultFileUploader is responsible for adding a new source file document to the available content.""" + def __init__( self, extractor_api: ExtractorApi, diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py index db9fe6c..4fc7ff3 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py @@ -54,7 +54,7 @@ def __init__( rag_api : RagApi The API for RAG 
backend. information_mapper : InformationPiece2Document - The mapper for converting information pieces to langchain documents. + The mapper for converting information pieces to langchain documents. """ self._extractor_api = extractor_api self._rag_api = rag_api From acde7e53e541fce52a524e344cab260b45c27284 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Tue, 20 May 2025 09:29:26 +0200 Subject: [PATCH 22/43] extractor comments --- README.md | 13 ++++++++----- .../api_endpoints/file_extractor.py | 2 +- .../api_endpoints/source_extractor.py | 1 + .../impl/api_endpoints/general_source_extractor.py | 10 ++++++---- .../impl/extractors/confluence_extractor.py | 6 +++--- 5 files changed, 19 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 159833c..7becbcd 100644 --- a/README.md +++ b/README.md @@ -176,7 +176,7 @@ This API should not be exposed by ingress and only used for internally. The following endpoints are provided by the *extractor-api-lib*: - `/extract_from_file`: This endpoint extracts the information from files. -- `/extract_from_confluence`: This endpoint extracts the information from a confluence space. +- `/extract_from_source`: This endpoint extracts the information from a non-file source. ### 3.1 Requirements @@ -203,12 +203,14 @@ The following types of information will be extracted: - `TEXT`: plain text - `TABLE`: data in tabular form found in the document -#### `/extract_from_confluence` +#### `/extract_from_source` -The extract from confluence endpoint will extract the information from a confluence space. -The following types of information will be extracted: +This endpoint will extract data for non-file source. 
+The type of information that is extracted will vary depending on the source, the following types of information can be extracted: - `TEXT`: plain text +- `TABLE`: data in tabular form found in the document +- `IMAGE`: data in tabular form found in the document ### 3.3 Replaceable parts @@ -222,7 +224,8 @@ The following types of information will be extracted: | all_extractors | `dependency_injector.providers.List[extractor_api_lib.document_parser.information_extractor.InformationExtractor]` | `dependency_injector.providers.List(pdf_extractor, ms_docs_extractor, xml_extractor)` | List of all available extractors. If you add a new type of extractor you would have to add it to this list. | | general_extractor | [`extractor_api_lib.document_parser.information_extractor.InformationExtractor`](./extractor-api-lib/src/extractor_api_lib/document_parser/information_extractor.py) |[`extractor_api_lib.document_parser.general_extractor.GeneralExtractor`](./extractor-api-lib/src/extractor_api_lib/document_parser/general_extractor.py) | Combines multiple extractors and decides which one to use for the given file format. | | file_extractor | [`extractor_api_lib.api_endpoints.file_extractor.FileExtractor`](./extractor-api-lib/src/extractor_api_lib/api_endpoints/file_extractor.py) | [`extractor_api_lib.impl.api_endpoints.default_file_extractor.DefaultFileExtractor`](./extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/default_file_extractor.py) | Implementation of the `/extract_from_file` endpoint. Uses *general_extractor*. | -| confluence_extractor | [`extractor_api_lib.api_endpoints.confluence_extractor.ConfluenceExtractor`](./extractor-api-lib/src/extractor_api_lib/api_endpoints/confluence_extractor.py) | [`extractor_api_lib.impl.api_endpoints.default_confluence_extractor.DefaultConfluenceExtractor`](./extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/default_confluence_extractor.py) | Implementation of the `/extract_from_confluence` endpoint. 
| +| general_source_extractor | [`extractor_api_lib.api_endpoints.source_extractor.SourceExtractor`](./extractor-api-lib/src/extractor_api_lib/api_endpoints/source_extractor.py) | [`extractor_api_lib.impl.api_endpoints.general_source_extractor.GeneralSourceExtractor`](./extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py) | Implementation of the `/extract_from_source` endpoint. Will decide the correct extractor for the source. | +| confluence_extractor | [`extractor_api_lib.extractors.information_extractor.InformationExtractor`](./extractor-api-lib/src/extractor_api_lib/extractors/information_extractor.py) | [`extractor_api_lib.impl.extractors.confluence_extractor.ConfluenceExtractor`](./extractor-api-lib/src/extractor_api_lib/extractors/confluence_extractor.py) | Implementation of an esxtractor for the source `confluence`. | ## 4. RAG Core Lib diff --git a/extractor-api-lib/src/extractor_api_lib/api_endpoints/file_extractor.py b/extractor-api-lib/src/extractor_api_lib/api_endpoints/file_extractor.py index ad968a2..2c9a645 100644 --- a/extractor-api-lib/src/extractor_api_lib/api_endpoints/file_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/api_endpoints/file_extractor.py @@ -4,7 +4,7 @@ class FileExtractor(ABC): - """Abstract base class for extract_information endpoint.""" + """Abstract base class for extract__from_file endpoint.""" @abstractmethod async def aextract_information(self, extraction_request: ExtractionRequest) -> list[InformationPiece]: diff --git a/extractor-api-lib/src/extractor_api_lib/api_endpoints/source_extractor.py b/extractor-api-lib/src/extractor_api_lib/api_endpoints/source_extractor.py index d656367..4071322 100644 --- a/extractor-api-lib/src/extractor_api_lib/api_endpoints/source_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/api_endpoints/source_extractor.py @@ -5,6 +5,7 @@ class SourceExtractor(ABC): + """Abstract base class for extract_from_source endpoint.""" @abstractmethod 
async def aextract_information( diff --git a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py index 0c5dbe4..70bfab8 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py @@ -28,6 +28,8 @@ def __init__(self, available_extractors: list[InformationExtractor], mapper: Int ---------- available_extractors : list of InformationExtractor A list of available information extractors to be used by the GeneralExtractor. + mapper : Internal2ExternalInformationPiece + Mapper for mapping the internal representation to the external one. """ self._mapper = mapper self._available_extractors = available_extractors @@ -37,17 +39,17 @@ async def aextract_information( extraction_parameters: ExtractionParameters, ) -> list[InformationPiece]: """ - Extract content from given file. + Extract information from source, using the given parameters. Parameters ---------- - file_path : Path - Path to the file the information should be extracted from. + extraction_parameters : ExtractionParameters + The parameters used to extract information from the source. Returns ------- list[InformationPiece] - The extracted information. + A list of extracted information pieces.
""" correct_extractors = [x for x in self._available_extractors if extraction_parameters.type == x.extractor_type] if not correct_extractors: diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py index 3cb55f4..f1c15a6 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py @@ -12,14 +12,14 @@ class ConfluenceExtractor(InformationExtractor): - """Default implementation of the FileExtractor interface.""" + """Implementation of the InformationExtractor interface for confluence.""" def __init__( self, mapper: ConfluenceLangchainDocument2InformationPiece, ): """ - Initialize the DefaultConfluenceExtractor. + Initialize the ConfluenceExtractor. Parameters ---------- @@ -42,7 +42,7 @@ async def aextract_content( Parameters ---------- - confluence_parameters : ConfluenceParameters + extraction_parameters : ExtractionParameters The parameters required to connect to and extract data from Confluence. 
Returns From 5e6ca9ba2911e6d21150f9b0b8164f8699749747 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Fri, 23 May 2025 10:13:16 +0200 Subject: [PATCH 23/43] fix: minor bugs --- admin-api-lib/src/admin_api_lib/apis/admin_api.py | 2 +- admin-api-lib/src/admin_api_lib/impl/admin_api.py | 2 +- .../impl/api_endpoints/default_source_uploader.py | 2 +- .../impl/api_endpoints/general_source_extractor.py | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api.py b/admin-api-lib/src/admin_api_lib/apis/admin_api.py index ec95b92..fc8d867 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api.py @@ -173,4 +173,4 @@ async def upload_source( """Uploads user selected sources.""" if not BaseAdminApi.subclasses: raise HTTPException(status_code=500, detail="Not implemented") - return await BaseAdminApi.subclasses[0]().upload_source(type, name, key_value_pair, request) + return await BaseAdminApi.subclasses[0]().upload_source(source_type, name, key_value_pair, request) diff --git a/admin-api-lib/src/admin_api_lib/impl/admin_api.py b/admin-api-lib/src/admin_api_lib/impl/admin_api.py index 04cd6df..fbc62eb 100644 --- a/admin-api-lib/src/admin_api_lib/impl/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/impl/admin_api.py @@ -96,7 +96,7 @@ async def upload_source( request: Request, source_uploader: SourceUploader = Depends(Provide[DependencyContainer.source_uploader]), ) -> None: - await source_uploader.upload_source(str(request.base_url), type, name, kwargs) + await source_uploader.upload_source(str(request.base_url), source_type, name, kwargs) @inject async def upload_file( diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py index 4fc7ff3..2770553 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py +++ 
b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py @@ -120,7 +120,7 @@ async def _handle_source_upload( ): try: information_pieces = self._extractor_api.extract_from_source( - ExtractionParameters(type=source_type, document_name=source_name, kwargs=[x.to_dict() for x in kwargs]) + ExtractionParameters(source_type=source_type, document_name=source_name, kwargs=[x.to_dict() for x in kwargs]) ) if not information_pieces: diff --git a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py index 70bfab8..8e08ad6 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py @@ -51,8 +51,8 @@ async def aextract_information( list[InformationPiece] A list of extracted information pieces. """ - correct_extractors = [x for x in self._available_extractors if extraction_parameters.type == x.extractor_type] + correct_extractors = [x for x in self._available_extractors if extraction_parameters.source_type == x.extractor_type] if not correct_extractors: - raise ValueError(f"No extractor found for type {type}") + raise ValueError(f"No extractor found for type {extraction_parameters.source_type}") results = await correct_extractors[-1].aextract_content(extraction_parameters) return [self._mapper.map_internal_to_external(x) for x in results if x.page_content is not None] From c5c537b8a2be2a1f8c669c9ec8bbb3a812375e02 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Fri, 23 May 2025 14:35:26 +0200 Subject: [PATCH 24/43] refactor: remove unused utility modules and tests --- .../src/admin_api_lib/impl/utils/__init__.py | 0 .../impl/utils/comma_separated_bool_list.py | 65 ---------------- .../impl/utils/comma_separated_str_list.py | 74 ------------------- .../tests/comma_separated_bool_list_test.py | 55 
-------------- .../tests/comma_separated_str_list_test.py | 49 ------------ admin-api-lib/tests/dummy_test.py | 3 + 6 files changed, 3 insertions(+), 243 deletions(-) delete mode 100644 admin-api-lib/src/admin_api_lib/impl/utils/__init__.py delete mode 100644 admin-api-lib/src/admin_api_lib/impl/utils/comma_separated_bool_list.py delete mode 100644 admin-api-lib/src/admin_api_lib/impl/utils/comma_separated_str_list.py delete mode 100644 admin-api-lib/tests/comma_separated_bool_list_test.py delete mode 100644 admin-api-lib/tests/comma_separated_str_list_test.py create mode 100644 admin-api-lib/tests/dummy_test.py diff --git a/admin-api-lib/src/admin_api_lib/impl/utils/__init__.py b/admin-api-lib/src/admin_api_lib/impl/utils/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/admin-api-lib/src/admin_api_lib/impl/utils/comma_separated_bool_list.py b/admin-api-lib/src/admin_api_lib/impl/utils/comma_separated_bool_list.py deleted file mode 100644 index df23553..0000000 --- a/admin-api-lib/src/admin_api_lib/impl/utils/comma_separated_bool_list.py +++ /dev/null @@ -1,65 +0,0 @@ -"""Utility module to handle comma separated string input that represents boolean values.""" - -from typing import Any - - -class CommaSeparatedBoolList(list): - """ - A subclass of list that converts comma-separated strings or lists into a list of booleans. - - Notes - ----- - - For string inputs, splits the string by commas and converts recognized true values ("true", "1", "yes") to True. - - An empty or whitespace-only string returns an empty list. - - For list inputs, each element is converted to a boolean. - """ - - @classmethod - def validate(cls, v: Any, info) -> list[bool]: - """ - Validate and convert the input into a list of booleans. - - Parameters - ---------- - v : Any - Input value, either a comma separated string or a list. - info : Any - Additional context information (unused). - - Returns - ------- - list of bool - List of booleans parsed from the input. 
An empty string returns an empty list. - - Raises - ------ - ValueError - If v is not a string or list. - """ - - def str_to_bool(s: str) -> bool: - return s.lower() in ("true", "1", "yes") - - if isinstance(v, str): - if v.strip() == "": - return [] - return [str_to_bool(item.strip()) for item in v.split(",") if item.strip()] - elif isinstance(v, list): - return [bool(item) for item in v] - raise ValueError("Not a valid comma separated boolean list") - - @classmethod - def __get_validators__(cls): - """ - Get validator functions for Pydantic to use with this data type. - - This method is called by Pydantic during model initialization to collect - validator functions for fields using this custom data type. - - Returns - ------- - generator - A generator yielding validator functions, specifically `cls.validate`, - which will be applied to validate and convert input values. - """ - yield cls.validate diff --git a/admin-api-lib/src/admin_api_lib/impl/utils/comma_separated_str_list.py b/admin-api-lib/src/admin_api_lib/impl/utils/comma_separated_str_list.py deleted file mode 100644 index 7b3a2a9..0000000 --- a/admin-api-lib/src/admin_api_lib/impl/utils/comma_separated_str_list.py +++ /dev/null @@ -1,74 +0,0 @@ -""" -Comma Separated String List Utility Module. - -This module provides a custom list type to validate and convert inputs into -a list of strings. It splits comma separated strings and converts list elements -to strings. - -Raises ------- -ValueError - If the provided input is neither a string nor a list. -""" - -from typing import Any - - -class CommaSeparatedStrList(list): - """ - Custom list type that validates comma separated strings. - - - If input is a string: splits by commas and strips whitespace. - - If input is a list: converts all elements to strings. - - Raises - ------ - ValueError - For invalid input type. - """ - - @classmethod - def validate(cls, v: Any, info) -> list[str]: - """ - Convert input to a validated list of strings. 
- - Parameters - ---------- - v : Any - A comma-separated string or a list containing items to be converted. - info : Any - Additional contextual information (not used in current implementation). - - Returns - ------- - list of str - A list of trimmed strings. Returns an empty list for an empty or whitespace-only string. - - Raises - ------ - ValueError - If the input v is neither a string nor a list. - """ - if isinstance(v, str): - if v.strip() == "": - return [] - return [item.strip() for item in v.split(",") if item.strip()] - elif isinstance(v, list): - return [str(item) for item in v] - raise ValueError("Not a valid comma separated string list") - - @classmethod - def __get_validators__(cls): - """ - Get validator functions for Pydantic to use with this data type. - - This method is called by Pydantic during model initialization to collect - validator functions for fields using this custom data type. - - Returns - ------- - generator - A generator yielding validator functions, specifically `cls.validate`, - which will be applied to validate and convert input values. - """ - yield cls.validate diff --git a/admin-api-lib/tests/comma_separated_bool_list_test.py b/admin-api-lib/tests/comma_separated_bool_list_test.py deleted file mode 100644 index d6a72d3..0000000 --- a/admin-api-lib/tests/comma_separated_bool_list_test.py +++ /dev/null @@ -1,55 +0,0 @@ -import pytest -from admin_api_lib.impl.utils.comma_separated_bool_list import CommaSeparatedBoolList - - -def test_validate_empty_string(): - # An empty string should return an empty list. - assert CommaSeparatedBoolList.validate("", None) == [] - - -def test_validate_string_input(): - # Test a typical comma separated string. - # "true", "yes", and "1" are considered True, all others are False. 
- input_str = "true, false, yes, no, 1, 0, ,TRUE, YeS" - expected = [ - True, # "true" - False, # "false" - True, # "yes" - False, # "no" - True, # "1" - False, # "0" - True, # "TRUE" - True, # "YeS" - ] - # Note: extra whitespace items are ignored. - result = CommaSeparatedBoolList.validate(input_str, None) - assert result == expected - - -def test_validate_string_with_extra_commas(): - # Test string with extra commas and spaces. - input_str = "true,, yes, ,false" - expected = [True, True, False] - result = CommaSeparatedBoolList.validate(input_str, None) - assert result == expected - - -def test_validate_list_input(): - # When input is a list, each element is cast to bool. - input_list = [0, 1, True, False, "non-empty", ""] - expected = [ - False, # bool(0) - True, # bool(1) - True, # bool(True) - False, # bool(False) - True, # bool("non-empty") - False, # bool("") - ] - result = CommaSeparatedBoolList.validate(input_list, None) - assert result == expected - - -def test_invalid_input_type(): - # Passing a non-string and non-list should raise a ValueError. 
- with pytest.raises(ValueError): - CommaSeparatedBoolList.validate(123, None) diff --git a/admin-api-lib/tests/comma_separated_str_list_test.py b/admin-api-lib/tests/comma_separated_str_list_test.py deleted file mode 100644 index a86c048..0000000 --- a/admin-api-lib/tests/comma_separated_str_list_test.py +++ /dev/null @@ -1,49 +0,0 @@ -import pytest -from admin_api_lib.impl.utils.comma_separated_str_list import CommaSeparatedStrList - - -def test_validate_string(): - # simple comma separated string - input_str = "a, b, c" - expected = ["a", "b", "c"] - result = CommaSeparatedStrList.validate(input_str, None) - assert result == expected - - input_str = "a" - expected = ["a"] - result = CommaSeparatedStrList.validate(input_str, None) - assert result == expected - - -def test_validate_string_with_extra_spaces(): - # string with extra spaces and empty items - input_str = " apple , banana , , cherry , " - expected = ["apple", "banana", "cherry"] - result = CommaSeparatedStrList.validate(input_str, None) - assert result == expected - - -def test_validate_empty_string(): - input_str = "" - expected = [] - result = CommaSeparatedStrList.validate(input_str, None) - assert result == expected - - -def test_validate_string_only_spaces(): - input_str = " " - expected = [] - result = CommaSeparatedStrList.validate(input_str, None) - assert result == expected - - -def test_validate_list(): - input_list = [1, "2", 3.0, " test "] - expected = ["1", "2", "3.0", " test "] - result = CommaSeparatedStrList.validate(input_list, None) - assert result == expected - - -def test_invalid_input_type(): - with pytest.raises(ValueError): - CommaSeparatedStrList.validate(12345, None) diff --git a/admin-api-lib/tests/dummy_test.py b/admin-api-lib/tests/dummy_test.py new file mode 100644 index 0000000..1428394 --- /dev/null +++ b/admin-api-lib/tests/dummy_test.py @@ -0,0 +1,3 @@ +def test_dummy() -> None: + print("Dummy test.") + assert True From 0133c00d2847990c936e966d1d1d233bf1a5a862 Mon Sep 
17 00:00:00 2001 From: Andreas Klos Date: Fri, 23 May 2025 15:06:03 +0200 Subject: [PATCH 25/43] docs: enhance module docstrings and method descriptions across the admin and extractor APIs --- .../src/admin_api_lib/apis/admin_api.py | 2 + .../src/admin_api_lib/apis/admin_api_base.py | 36 ++++++++++++++-- .../src/admin_api_lib/impl/admin_api.py | 36 ++++++++++++++++ .../api_endpoints/default_source_uploader.py | 7 +--- .../admin_api_lib/models/document_status.py | 5 +-- .../models/http_validation_error.py | 4 +- .../admin_api_lib/models/key_value_pair.py | 3 +- .../src/admin_api_lib/models/status.py | 2 +- .../admin_api_lib/models/validation_error.py | 3 +- .../models/validation_error_loc_inner.py | 3 +- .../extractor_api_lib/apis/extractor_api.py | 33 +++++++++++++++ .../apis/extractor_api_base.py | 41 ++++++++++++++++++- 12 files changed, 152 insertions(+), 23 deletions(-) diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api.py b/admin-api-lib/src/admin_api_lib/apis/admin_api.py index fc8d867..5a332be 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api.py @@ -1,3 +1,5 @@ +"""Module for the Admin API.""" + # coding: utf-8 from typing import Dict, List # noqa: F401 diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py index e184692..432c457 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py @@ -1,4 +1,7 @@ +"""Module for the base AdminApi interface.""" + # coding: utf-8 +# flake8: noqa: D105 from typing import ClassVar, Dict, List, Tuple # noqa: F401 from typing_extensions import Annotated @@ -11,6 +14,15 @@ class BaseAdminApi: + """ + The base AdminApi interface. + + Attributes + ---------- + subclasses : ClassVar[Tuple] + A tuple that holds all subclasses of BaseAdminApi. 
+ """ + subclasses: ClassVar[Tuple] = () def __init_subclass__(cls, **kwargs): @@ -71,12 +83,30 @@ async def upload_source( key_value_pair: List[KeyValuePair], request: Request, ) -> None: - """Uploads user selected source.""" + """ + Asynchronously uploads user selected source. + + Returns + ------- + None + """ async def upload_file( self, file: UploadFile, request: Request, ) -> None: - """Uploads user selected file.""" - ... + """ + Asynchronously uploads user-selected documents. + + Parameters + ---------- + file : UploadFile + The file object containing the source documents to be uploaded. + request : Request + The request object containing metadata about the upload request. + + Returns + ------- + None + """ diff --git a/admin-api-lib/src/admin_api_lib/impl/admin_api.py b/admin-api-lib/src/admin_api_lib/impl/admin_api.py index fbc62eb..08cc550 100644 --- a/admin-api-lib/src/admin_api_lib/impl/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/impl/admin_api.py @@ -96,6 +96,26 @@ async def upload_source( request: Request, source_uploader: SourceUploader = Depends(Provide[DependencyContainer.source_uploader]), ) -> None: + """ + Asynchronously uploads user-selected source documents. + + Parameters + ---------- + source_type : StrictStr + The type of the source document to be uploaded. + name : StrictStr + The name of the source document to be uploaded. + kwargs : list[KeyValuePair] + Additional parameters required for the extractor. + request : Request + The HTTP request object containing metadata about the upload request. + source_uploader : SourceUploader + An instance of SourceUploader to handle the upload process. + + Returns + ------- + None + """ await source_uploader.upload_source(str(request.base_url), source_type, name, kwargs) @inject @@ -105,6 +125,22 @@ async def upload_file( request: Request, file_uploader: FileUploader = Depends(Provide[DependencyContainer.file_uploader]), ) -> None: + """ + Asynchronously uploads a file to the server. 
+ + Parameters + ---------- + file : UploadFile + The file object to be uploaded. + request : Request + The HTTP request object containing metadata about the upload request. + file_uploader : FileUploader, optional + An instance of FileUploader to handle the upload process. + + Returns + ------- + None + """ await file_uploader.upload_file(str(request.base_url), file) @inject diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py index 2770553..1e1ed33 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py @@ -98,7 +98,7 @@ async def upload_source( source_name, Status.PROCESSING ) # TODO: change to pipeline with timeout to error status thread = Thread( - target=lambda: run(self._handle_source_upload(source_name, base_url, source_type, name, kwargs)) + target=lambda: run(self._handle_source_upload(source_name, source_type, kwargs)) ) thread.start() self._background_threads.append(thread) @@ -113,9 +113,7 @@ async def upload_source( async def _handle_source_upload( self, source_name: str, - base_url: str, source_type: StrictStr, - name: str, kwargs: list[KeyValuePair], ): try: @@ -135,8 +133,7 @@ async def _handle_source_upload( self._information_mapper.document2rag_information_piece(doc) for doc in enhanced_documents ] - # Replace old document - # deletion is allowed to fail + # Replace old document, deletion is allowed to fail with suppress(Exception): await self._document_deleter.adelete_document(source_name) diff --git a/admin-api-lib/src/admin_api_lib/models/document_status.py b/admin-api-lib/src/admin_api_lib/models/document_status.py index ff2f94a..89b09d8 100644 --- a/admin-api-lib/src/admin_api_lib/models/document_status.py +++ b/admin-api-lib/src/admin_api_lib/models/document_status.py @@ -3,7 +3,7 @@ """ admin-api-lib -The 
API is used for the communication between the admin frontend and the admin backend in the rag project. +The API is used for the communication between the admin frontend and the admin backend in the rag project. The version of the OpenAPI document: 1.0.0 Generated by OpenAPI Generator (https://openapi-generator.tech) @@ -49,8 +49,7 @@ def to_str(self) -> str: def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead - return json.dumps(self.to_dict()) + return self.model_dump_json(by_alias=True, exclude_unset=True) @classmethod def from_json(cls, json_str: str) -> Self: diff --git a/admin-api-lib/src/admin_api_lib/models/http_validation_error.py b/admin-api-lib/src/admin_api_lib/models/http_validation_error.py index 7e288e1..28c83f0 100644 --- a/admin-api-lib/src/admin_api_lib/models/http_validation_error.py +++ b/admin-api-lib/src/admin_api_lib/models/http_validation_error.py @@ -48,9 +48,7 @@ def to_str(self) -> str: def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead - return json.dumps(self.to_dict()) - + return self.model_dump_json(by_alias=True, exclude_unset=True) @classmethod def from_json(cls, json_str: str) -> Self: """Create an instance of HTTPValidationError from a JSON string""" diff --git a/admin-api-lib/src/admin_api_lib/models/key_value_pair.py b/admin-api-lib/src/admin_api_lib/models/key_value_pair.py index 82c0c37..3d46e01 100644 --- a/admin-api-lib/src/admin_api_lib/models/key_value_pair.py +++ b/admin-api-lib/src/admin_api_lib/models/key_value_pair.py @@ -48,8 +48,7 @@ def to_str(self) -> str: def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead - return json.dumps(self.to_dict()) + 
return self.model_dump_json(by_alias=True, exclude_unset=True) @classmethod def from_json(cls, json_str: str) -> Self: diff --git a/admin-api-lib/src/admin_api_lib/models/status.py b/admin-api-lib/src/admin_api_lib/models/status.py index 0ab750b..3b24b73 100644 --- a/admin-api-lib/src/admin_api_lib/models/status.py +++ b/admin-api-lib/src/admin_api_lib/models/status.py @@ -3,7 +3,7 @@ """ admin-api-lib -The API is used for the communication between the admin frontend and the admin backend in the rag project. +The API is used for the communication between the admin frontend and the admin backend in the rag project. The version of the OpenAPI document: 1.0.0 Generated by OpenAPI Generator (https://openapi-generator.tech) diff --git a/admin-api-lib/src/admin_api_lib/models/validation_error.py b/admin-api-lib/src/admin_api_lib/models/validation_error.py index f922b21..ac389ab 100644 --- a/admin-api-lib/src/admin_api_lib/models/validation_error.py +++ b/admin-api-lib/src/admin_api_lib/models/validation_error.py @@ -50,8 +50,7 @@ def to_str(self) -> str: def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead - return json.dumps(self.to_dict()) + return self.model_dump_json(by_alias=True, exclude_unset=True) @classmethod def from_json(cls, json_str: str) -> Self: diff --git a/admin-api-lib/src/admin_api_lib/models/validation_error_loc_inner.py b/admin-api-lib/src/admin_api_lib/models/validation_error_loc_inner.py index 0100c88..e487669 100644 --- a/admin-api-lib/src/admin_api_lib/models/validation_error_loc_inner.py +++ b/admin-api-lib/src/admin_api_lib/models/validation_error_loc_inner.py @@ -55,8 +55,7 @@ def to_str(self) -> str: def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead - return json.dumps(self.to_dict()) + return 
self.model_dump_json(by_alias=True, exclude_unset=True) @classmethod def from_json(cls, json_str: str) -> Self: diff --git a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py index 7d09897..4f9e4e5 100644 --- a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py +++ b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py @@ -1,3 +1,5 @@ +"""Module for the Extractor API.""" + # coding: utf-8 from typing import Dict, List # noqa: F401 @@ -48,6 +50,19 @@ async def extract_from_file_post( extraction_request: ExtractionRequest = Body(None, description=""), ) -> List[InformationPiece]: + """ + Extract information from a file based on the provided extraction request. + + Parameters + ---------- + extraction_request : ExtractionRequest + The request object containing details about the extraction process. + + Returns + ------- + List[InformationPiece] + A list of extracted information pieces. + """ if not BaseExtractorApi.subclasses: raise HTTPException(status_code=500, detail="Not implemented") return await BaseExtractorApi.subclasses[0]().extract_from_file_post(extraction_request) @@ -67,6 +82,24 @@ async def extract_from_file_post( async def extract_from_source( extraction_parameters: ExtractionParameters = Body(None, description=""), ) -> List[InformationPiece]: + """ + Extract information from a source based on the provided extraction parameters. + + Parameters + ---------- + extraction_parameters : ExtractionParameters, optional + The request object containing details about the extraction process. + + Returns + ------- + List[InformationPiece] + A list of extracted information pieces. + + Raises + ------ + HTTPException + If the extraction process fails or encounters an error. 
+ """ if not BaseExtractorApi.subclasses: raise HTTPException(status_code=500, detail="Not implemented") return await BaseExtractorApi.subclasses[0]().extract_from_source(extraction_parameters) diff --git a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py index 696c60c..acb6022 100644 --- a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py +++ b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py @@ -1,3 +1,5 @@ +"""Module for the base ExtractorApi interface.""" + # coding: utf-8 from typing import ClassVar, Dict, List, Tuple # noqa: F401 @@ -8,6 +10,15 @@ class BaseExtractorApi: + """ + The base ExtractorApi interface. + + Attributes + ---------- + subclasses : ClassVar[Tuple] + A tuple containing all subclasses of BaseExtractorApi. + """ + subclasses: ClassVar[Tuple] = () def __init_subclass__(cls, **kwargs): @@ -17,9 +28,35 @@ def __init_subclass__(cls, **kwargs): async def extract_from_file_post( self, extraction_request: ExtractionRequest, - ) -> List[InformationPiece]: ... + ) -> List[InformationPiece]: + """ + Extract information from a file based on the provided extraction request. + + Parameters + ---------- + extraction_request : ExtractionRequest + The request object containing details about the extraction process. + + Returns + ------- + List[InformationPiece] + A list of extracted information pieces. + """ async def extract_from_source( self, extraction_parameters: ExtractionParameters, - ) -> List[InformationPiece]: ... + ) -> List[InformationPiece]: + """ + Extract information from a source based on the provided extraction request. + + Parameters + ---------- + extraction_parameters : ExtractionParameters + The parameters required to access and extract information from the source. + + Returns + ------- + List[InformationPiece] + A list of extracted information pieces. 
+ """ From 4bfd3f171f290b53e732ec61885cfafdf15fc279 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Wed, 28 May 2025 08:26:11 +0200 Subject: [PATCH 26/43] working sample --- .../api_endpoints/default_file_uploader.py | 24 ++- .../api_endpoints/default_source_uploader.py | 170 +++++++++++++----- .../tests/default_source_uploader_test.py | 145 +++++++++++++++ admin-api-lib/tests/dummy_test.py | 3 - .../tests/test_default_source_uploader.py | 0 .../tests/{dummy_test.py => dummy5_test.py} | 0 rag-core-api/tests/rag_api_test.py | 26 +-- .../tests/{dummy_test.py => dummy6_test.py} | 0 8 files changed, 309 insertions(+), 59 deletions(-) create mode 100644 admin-api-lib/tests/default_source_uploader_test.py delete mode 100644 admin-api-lib/tests/dummy_test.py create mode 100644 admin-api-lib/tests/test_default_source_uploader.py rename extractor-api-lib/tests/{dummy_test.py => dummy5_test.py} (100%) rename rag-core-lib/tests/{dummy_test.py => dummy6_test.py} (100%) diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py index b9b367f..fed469d 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py @@ -98,7 +98,7 @@ async def upload_file( content = await file.read() file.filename = sanitize_document_name(file.filename) source_name = f"file:{sanitize_document_name(file.filename)}" - # TODO: check if document already in processing state + self._check_if_already_in_processing(source_name) self._key_value_store.upsert( source_name, Status.PROCESSING ) # TODO: change to pipeline with timeout to error status @@ -116,6 +116,28 @@ async def upload_file( logger.error("Error while uploading %s = %s", source_name, str(e)) raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e)) + def _check_if_already_in_processing(self, 
source_name: str) -> None: + """ + Checks if the source is already in processing state. + + Parameters + ---------- + source_name : str + The name of the source. + + Returns + ------- + None + + Raises + ------ + ValueError + If the source is already in processing state. + """ + existing = [s for name, s in self._key_value_store.get_all() if name == source_name] + if any(s == Status.PROCESSING for s in existing): + raise ValueError(f"Document {source_name} is already in processing state") + async def _handle_source_upload( self, s3_path: Path, diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py index 1e1ed33..d6ecd5e 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py @@ -1,12 +1,13 @@ -from http.client import HTTPException + +from concurrent.futures import ThreadPoolExecutor import logging -from asyncio import run -from threading import Thread +import asyncio +from threading import Thread, Event from contextlib import suppress from pydantic import StrictStr -from fastapi import status - +from fastapi import status, HTTPException +from langchain_core.documents import Document from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi from admin_api_lib.extractor_api_client.openapi_client.models.extraction_parameters import ExtractionParameters @@ -20,9 +21,14 @@ from admin_api_lib.impl.key_db.file_status_key_value_store import FileStatusKeyValueStore from admin_api_lib.information_enhancer.information_enhancer import InformationEnhancer from admin_api_lib.utils.utils import sanitize_document_name +from admin_api_lib.rag_backend_client.openapi_client.models.information_piece import ( + InformationPiece as RagInformationPiece, +) logger = logging.getLogger(__name__) +class 
UploadCancelled(Exception): + pass class DefaultSourceUploader(SourceUploader): @@ -63,7 +69,7 @@ def __init__( self._information_enhancer = information_enhancer self._chunker = chunker self._document_deleter = document_deleter - self._background_threads = [] + self._background_tasks = [] async def upload_source( self, @@ -71,75 +77,155 @@ async def upload_source( source_type: StrictStr, name: StrictStr, kwargs: list[KeyValuePair], + timeout: float = 300.0, ) -> None: - """ - Uploads the parameters for source content extraction. - - Parameters - ---------- - base_url : str - The base url of the service. Is used to determine the download link of the source. - source_type : str - The type of the source. Is used by the extractor service to determine the correct extraction method. - name : str - Display name of the source. - kwargs : list[KeyValuePair] - List of KeyValuePair with parameters used for the extraction. + # 1) prune finished tasks + self._background_tasks = [ + (fut, ev) for fut, ev in self._background_tasks + if not fut.done() + ] - Returns - ------- - None - """ - self._background_threads = [t for t in self._background_threads if t.is_alive()] source_name = f"{source_type}:{sanitize_document_name(name)}" try: - # TODO: check if document already in processing state - self._key_value_store.upsert( - source_name, Status.PROCESSING - ) # TODO: change to pipeline with timeout to error status - thread = Thread( - target=lambda: run(self._handle_source_upload(source_name, source_type, kwargs)) + self._check_if_already_in_processing(source_name) + self._key_value_store.upsert(source_name, Status.PROCESSING) + + # 1) make a stop‐event for cooperative cancellation + stop_event = Event() + + # 2) submit the real work to a ThreadPoolExecutor + loop = asyncio.get_running_loop() + # you can reuse one executor or make a new one + executor = ThreadPoolExecutor(max_workers=1) + future = loop.run_in_executor( + executor, + lambda: asyncio.run( + self._handle_source_upload( 
+ source_name, source_type, kwargs, stop_event + ) + ) ) - thread.start() - self._background_threads.append(thread) + # track both thread‐future and its stop‐event + self._background_tasks.append((future, stop_event)) + + # 3) await with a timeout, *without* blocking the loop + try: + await asyncio.wait_for(future, timeout) + except asyncio.TimeoutError: + # mark error, signal the thread, and move on + self._key_value_store.upsert(source_name, Status.ERROR) + stop_event.set() + logger.error("Upload of %s timed out; signaled stop_event", source_name) + except ValueError as e: self._key_value_store.upsert(source_name, Status.ERROR) - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(e)) + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, detail=str(e) + ) except Exception as e: self._key_value_store.upsert(source_name, Status.ERROR) logger.error("Error while uploading %s = %s", source_name, str(e)) - raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e)) + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e) + ) + + + def _on_upload_timeout(self, source_name: str, thread: Thread) -> None: + """ + Called by the event loop after `timeout` seconds. + Sets the stop_event so that the worker can exit cleanly. + """ + if thread.is_alive(): + logger.error("Upload of %s timed out; signaling thread to stop", source_name) + # mark as error in your store + self._key_value_store.upsert(source_name, Status.ERROR) + # signal the worker to bail out at next checkpoint + thread.stop_event.set() + + + def _check_if_already_in_processing(self, source_name: str) -> None: + """ + Checks if the source is already in processing state. + + Parameters + ---------- + source_name : str + The name of the source. + + Returns + ------- + None + + Raises + ------ + ValueError + If the source is already in processing state. 
+ """ + existing = [s for name, s in self._key_value_store.get_all() if name == source_name] + if any(s == Status.PROCESSING for s in existing): + raise ValueError(f"Document {source_name} is already in processing state") + + @staticmethod + def _ensure_not_cancelled(stop_event, source_name, store): + if stop_event.is_set(): + # mark as error or cancelled if you like + store.upsert(source_name, Status.ERROR) + raise UploadCancelled() async def _handle_source_upload( self, source_name: str, source_type: StrictStr, kwargs: list[KeyValuePair], + stop_event: Event ): try: information_pieces = self._extractor_api.extract_from_source( - ExtractionParameters(source_type=source_type, document_name=source_name, kwargs=[x.to_dict() for x in kwargs]) + ExtractionParameters( + source_type=source_type, + document_name=source_name, + kwargs=[x.to_dict() for x in kwargs] + ) ) if not information_pieces: self._key_value_store.upsert(source_name, Status.ERROR) logger.error("No information pieces found in the document: %s", source_name) - documents = [self._information_mapper.extractor_information_piece2document(x) for x in information_pieces] + return + DefaultSourceUploader._ensure_not_cancelled(stop_event, source_name, self._key_value_store) + documents: list[Document] = [] + for piece in information_pieces: + documents.append(self._information_mapper.extractor_information_piece2document(piece)) + DefaultSourceUploader._ensure_not_cancelled(stop_event, source_name, self._key_value_store) chunked_documents = self._chunker.chunk(documents) + DefaultSourceUploader._ensure_not_cancelled(stop_event, source_name, self._key_value_store) enhanced_documents = await self._information_enhancer.ainvoke(chunked_documents) - rag_information_pieces = [ - self._information_mapper.document2rag_information_piece(doc) for doc in enhanced_documents - ] - # Replace old document, deletion is allowed to fail + DefaultSourceUploader._ensure_not_cancelled(stop_event, source_name, self._key_value_store) + 
rag_information_pieces: list[RagInformationPiece] = [] + for doc in enhanced_documents: + rag_information_pieces.append( + self._information_mapper.document2rag_information_piece(doc) + ) + + DefaultSourceUploader._ensure_not_cancelled(stop_event, source_name, self._key_value_store) with suppress(Exception): await self._document_deleter.adelete_document(source_name) self._rag_api.upload_information_piece(rag_information_pieces) + self._key_value_store.upsert(source_name, Status.READY) logger.info("Source uploaded successfully: %s", source_name) + + except UploadCancelled: + logger.info("Upload of %s aborted by timeout", source_name) + return except Exception as e: - self._key_value_store.upsert(source_name, Status.ERROR) - logger.error("Error while uploading %s = %s", source_name, str(e)) + # If it wasn’t our own cancellation, record the error + if stop_event.is_set(): + logger.info("Upload of %s aborted due to timeout", source_name) + else: + self._key_value_store.upsert(source_name, Status.ERROR) + logger.error("Error while uploading %s = %s", source_name, str(e)) diff --git a/admin-api-lib/tests/default_source_uploader_test.py b/admin-api-lib/tests/default_source_uploader_test.py new file mode 100644 index 0000000..edfa823 --- /dev/null +++ b/admin-api-lib/tests/default_source_uploader_test.py @@ -0,0 +1,145 @@ +import pytest +from unittest.mock import AsyncMock, MagicMock +from fastapi import HTTPException +import threading, time + +from admin_api_lib.impl.api_endpoints.default_source_uploader import DefaultSourceUploader +from admin_api_lib.models.status import Status +from admin_api_lib.utils.utils import sanitize_document_name +from admin_api_lib.impl.api_endpoints import default_source_uploader + +@pytest.fixture +def mocks(): + extractor_api = MagicMock() + key_value_store = MagicMock() + key_value_store.get_all.return_value = [] + information_enhancer = MagicMock() + information_enhancer.ainvoke = AsyncMock() + chunker = MagicMock() + document_deleter = 
MagicMock() + document_deleter.adelete_document = AsyncMock() + rag_api = MagicMock() + information_mapper = MagicMock() + return extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper + + +@pytest.mark.asyncio +async def test_handle_source_upload_success(mocks): + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks + # Setup mocks + dummy_piece = MagicMock() + extractor_api.extract_from_source.return_value = [dummy_piece] + dummy_doc = MagicMock() + information_mapper.extractor_information_piece2document.return_value = dummy_doc + chunker.chunk.return_value = [dummy_doc] + information_enhancer.ainvoke.return_value = [dummy_doc] + dummy_rag_piece = {"p": "v"} + information_mapper.document2rag_information_piece.return_value = dummy_rag_piece + + uploader = DefaultSourceUploader( + extractor_api, + key_value_store, + information_enhancer, + chunker, + document_deleter, + rag_api, + information_mapper, + ) + + await uploader._handle_source_upload("source1", "type1", []) + + key_value_store.upsert.assert_any_call("source1", Status.READY) + rag_api.upload_information_piece.assert_called_once_with([dummy_rag_piece]) + document_deleter.adelete_document.assert_awaited_once_with("source1") + + +@pytest.mark.asyncio +async def test_handle_source_upload_no_info_pieces(mocks): + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks + extractor_api.extract_from_source.return_value = [] + + uploader = DefaultSourceUploader( + extractor_api, + key_value_store, + information_enhancer, + chunker, + document_deleter, + rag_api, + information_mapper, + ) + await uploader._handle_source_upload("source2", "type2", []) + + key_value_store.upsert.assert_any_call("source2", Status.ERROR) + information_mapper.extractor_information_piece2document.assert_not_called() + 
rag_api.upload_information_piece.assert_not_called() + + +@pytest.mark.asyncio +async def test_upload_source_already_processing_raises_error(mocks): + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks + source_type = "typeX" + name = "Doc Name" + source_name = f"{source_type}:{sanitize_document_name(name)}" + key_value_store.get_all.return_value = [(source_name, Status.PROCESSING)] + uploader = DefaultSourceUploader( + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper + ) + with pytest.raises(HTTPException): + await uploader.upload_source("http://base", source_type, name, []) + key_value_store.upsert.assert_any_call(source_name, Status.ERROR) + + +@pytest.mark.asyncio +async def test_upload_source_not_processing_starts_thread(mocks, monkeypatch): + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks + key_value_store.get_all.return_value = [] + dummy_thread = MagicMock() + monkeypatch.setattr('admin_api_lib.impl.api_endpoints.default_source_uploader.Thread', lambda *args, **kwargs: dummy_thread) + uploader = DefaultSourceUploader( + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper + ) + await uploader.upload_source("http://base", "typeY", "nameY", []) + key_value_store.upsert.assert_any_call(f"typeY:{sanitize_document_name('nameY')}", Status.PROCESSING) + dummy_thread.start.assert_called_once() + +@pytest.mark.asyncio +async def test_upload_source_no_timeout(mocks, monkeypatch): + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks + key_value_store.get_all.return_value = [] + source_type = "typeZ" + name = "quick" + source_name = f"{source_type}:{sanitize_document_name(name)}" + # dummy thread that finishes before timeout + dummy_thread = MagicMock() + 
dummy_thread.is_alive.return_value = False + monkeypatch.setattr( + 'admin_api_lib.impl.api_endpoints.default_source_uploader.Thread', + lambda *args, **kwargs: dummy_thread + ) + uploader = DefaultSourceUploader( + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper + ) + # should not raise + await uploader.upload_source("http://base", source_type, name, []) + # only PROCESSING status upserted, no ERROR + key_value_store.upsert.assert_any_call(source_name, Status.PROCESSING) + assert not any(call.args[1] == Status.ERROR for call in key_value_store.upsert.call_args_list) + +@pytest.mark.asyncio +async def test_upload_source_timeout_error(mocks, monkeypatch): + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks + key_value_store.get_all.return_value = [] + source_type = "typeTimeout" + name = "slow" + source_name = f"{source_type}:{sanitize_document_name(name)}" + # simulate slow thread sleeping 2s; patch timeout to 1s + def slow_thread_factory(*args, **kwargs): + return threading.Thread(target=lambda: time.sleep(2), daemon=True) + monkeypatch.setattr(default_source_uploader, 'Thread', slow_thread_factory) + uploader = DefaultSourceUploader( + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper + ) + with pytest.raises(HTTPException) as exc: + await uploader.upload_source("http://base", source_type, name, [], timeout=1.0) + assert "timed out" in exc.value.detail + key_value_store.upsert.assert_any_call(source_name, Status.ERROR) diff --git a/admin-api-lib/tests/dummy_test.py b/admin-api-lib/tests/dummy_test.py deleted file mode 100644 index 1428394..0000000 --- a/admin-api-lib/tests/dummy_test.py +++ /dev/null @@ -1,3 +0,0 @@ -def test_dummy() -> None: - print("Dummy test.") - assert True diff --git a/admin-api-lib/tests/test_default_source_uploader.py 
b/admin-api-lib/tests/test_default_source_uploader.py new file mode 100644 index 0000000..e69de29 diff --git a/extractor-api-lib/tests/dummy_test.py b/extractor-api-lib/tests/dummy5_test.py similarity index 100% rename from extractor-api-lib/tests/dummy_test.py rename to extractor-api-lib/tests/dummy5_test.py diff --git a/rag-core-api/tests/rag_api_test.py b/rag-core-api/tests/rag_api_test.py index 372709c..2cbdf8e 100644 --- a/rag-core-api/tests/rag_api_test.py +++ b/rag-core-api/tests/rag_api_test.py @@ -14,23 +14,23 @@ from qdrant_client import QdrantClient from qdrant_client.http import models -from .mock_environment_variables import mock_environment_variables -from .mock_logging_directory import mock_logging_config +from mock_environment_variables import mock_environment_variables +from mock_logging_directory import mock_logging_config mock_environment_variables() mock_logging_config() -from src.rag_core_api.main import app -from src.rag_core_api.models.chat_request import ChatRequest -from src.rag_core_api.models.chat_history import ChatHistory -from src.rag_core_api.models.chat_history_message import ChatHistoryMessage -from src.rag_core_api.models.chat_role import ChatRole -from src.rag_core_api.models.information_piece import InformationPiece -from src.rag_core_api.models.content_type import ContentType -from src.rag_core_api.models.key_value_pair import KeyValuePair -from src.rag_core_api.models.delete_request import DeleteRequest -from src.rag_core_api.impl.settings.fake_embedder_settings import FakeEmbedderSettings -from src.rag_core_api.impl.settings.error_messages import ErrorMessages +from rag_core_api.main import app +from rag_core_api.models.chat_request import ChatRequest +from rag_core_api.models.chat_history import ChatHistory +from rag_core_api.models.chat_history_message import ChatHistoryMessage +from rag_core_api.models.chat_role import ChatRole +from rag_core_api.models.information_piece import InformationPiece +from 
rag_core_api.models.content_type import ContentType +from rag_core_api.models.key_value_pair import KeyValuePair +from rag_core_api.models.delete_request import DeleteRequest +from rag_core_api.impl.settings.fake_embedder_settings import FakeEmbedderSettings +from rag_core_api.impl.settings.error_messages import ErrorMessages @pytest_asyncio.fixture diff --git a/rag-core-lib/tests/dummy_test.py b/rag-core-lib/tests/dummy6_test.py similarity index 100% rename from rag-core-lib/tests/dummy_test.py rename to rag-core-lib/tests/dummy6_test.py From c07d93957e16def55c146c2240d0e69e0582c5e0 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Wed, 28 May 2025 11:43:33 +0200 Subject: [PATCH 27/43] refactor: improve threading model in DefaultSourceUploader and update timeout handling --- .../api_endpoints/default_source_uploader.py | 100 +++++------------- .../tests/default_source_uploader_test.py | 62 +++++------ 2 files changed, 59 insertions(+), 103 deletions(-) diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py index d6ecd5e..c0d1389 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py @@ -1,5 +1,5 @@ -from concurrent.futures import ThreadPoolExecutor +from concurrent.futures import ThreadPoolExecutor, as_completed, TimeoutError import logging import asyncio from threading import Thread, Event @@ -27,9 +27,6 @@ logger = logging.getLogger(__name__) -class UploadCancelled(Exception): - pass - class DefaultSourceUploader(SourceUploader): def __init__( @@ -69,7 +66,7 @@ def __init__( self._information_enhancer = information_enhancer self._chunker = chunker self._document_deleter = document_deleter - self._background_tasks = [] + self._background_threads = [] async def upload_source( self, @@ -77,46 +74,18 @@ async def upload_source( 
source_type: StrictStr, name: StrictStr, kwargs: list[KeyValuePair], - timeout: float = 300.0, + timeout: float = 3600.0, ) -> None: - # 1) prune finished tasks - self._background_tasks = [ - (fut, ev) for fut, ev in self._background_tasks - if not fut.done() - ] + self._background_threads = [t for t in self._background_threads if t.is_alive()] source_name = f"{source_type}:{sanitize_document_name(name)}" try: self._check_if_already_in_processing(source_name) self._key_value_store.upsert(source_name, Status.PROCESSING) - # 1) make a stop‐event for cooperative cancellation - stop_event = Event() - - # 2) submit the real work to a ThreadPoolExecutor - loop = asyncio.get_running_loop() - # you can reuse one executor or make a new one - executor = ThreadPoolExecutor(max_workers=1) - future = loop.run_in_executor( - executor, - lambda: asyncio.run( - self._handle_source_upload( - source_name, source_type, kwargs, stop_event - ) - ) - ) - # track both thread‐future and its stop‐event - self._background_tasks.append((future, stop_event)) - - # 3) await with a timeout, *without* blocking the loop - try: - await asyncio.wait_for(future, timeout) - except asyncio.TimeoutError: - # mark error, signal the thread, and move on - self._key_value_store.upsert(source_name, Status.ERROR) - stop_event.set() - logger.error("Upload of %s timed out; signaled stop_event", source_name) - + thread = Thread(target=self._thread_worker, args=(source_name, source_type, kwargs, timeout)) + thread.start() + self._background_threads.append(thread) except ValueError as e: self._key_value_store.upsert(source_name, Status.ERROR) raise HTTPException( @@ -130,19 +99,6 @@ async def upload_source( ) - def _on_upload_timeout(self, source_name: str, thread: Thread) -> None: - """ - Called by the event loop after `timeout` seconds. - Sets the stop_event so that the worker can exit cleanly. 
- """ - if thread.is_alive(): - logger.error("Upload of %s timed out; signaling thread to stop", source_name) - # mark as error in your store - self._key_value_store.upsert(source_name, Status.ERROR) - # signal the worker to bail out at next checkpoint - thread.stop_event.set() - - def _check_if_already_in_processing(self, source_name: str) -> None: """ Checks if the source is already in processing state. @@ -165,19 +121,30 @@ def _check_if_already_in_processing(self, source_name: str) -> None: if any(s == Status.PROCESSING for s in existing): raise ValueError(f"Document {source_name} is already in processing state") - @staticmethod - def _ensure_not_cancelled(stop_event, source_name, store): - if stop_event.is_set(): - # mark as error or cancelled if you like - store.upsert(source_name, Status.ERROR) - raise UploadCancelled() + def _thread_worker(self,source_name, source_type, kwargs, timeout): + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + try: + loop.run_until_complete( + asyncio.wait_for( + self._handle_source_upload(source_name=source_name, source_type=source_type, kwargs=kwargs), + timeout=timeout + ) + ) + except asyncio.TimeoutError: + logger.error("Upload of %s timed out after %s seconds", source_name, timeout) + self._key_value_store.upsert(source_name, Status.ERROR) + except Exception as e: + logger.exception("Error while uploading %s", source_name) + self._key_value_store.upsert(source_name, Status.ERROR) + finally: + loop.close() async def _handle_source_upload( self, source_name: str, source_type: StrictStr, kwargs: list[KeyValuePair], - stop_event: Event ): try: information_pieces = self._extractor_api.extract_from_source( @@ -192,25 +159,20 @@ async def _handle_source_upload( self._key_value_store.upsert(source_name, Status.ERROR) logger.error("No information pieces found in the document: %s", source_name) return - DefaultSourceUploader._ensure_not_cancelled(stop_event, source_name, self._key_value_store) documents: 
list[Document] = [] for piece in information_pieces: documents.append(self._information_mapper.extractor_information_piece2document(piece)) - DefaultSourceUploader._ensure_not_cancelled(stop_event, source_name, self._key_value_store) chunked_documents = self._chunker.chunk(documents) - DefaultSourceUploader._ensure_not_cancelled(stop_event, source_name, self._key_value_store) enhanced_documents = await self._information_enhancer.ainvoke(chunked_documents) - DefaultSourceUploader._ensure_not_cancelled(stop_event, source_name, self._key_value_store) rag_information_pieces: list[RagInformationPiece] = [] for doc in enhanced_documents: rag_information_pieces.append( self._information_mapper.document2rag_information_piece(doc) ) - DefaultSourceUploader._ensure_not_cancelled(stop_event, source_name, self._key_value_store) with suppress(Exception): await self._document_deleter.adelete_document(source_name) @@ -218,14 +180,6 @@ async def _handle_source_upload( self._key_value_store.upsert(source_name, Status.READY) logger.info("Source uploaded successfully: %s", source_name) - - except UploadCancelled: - logger.info("Upload of %s aborted by timeout", source_name) - return except Exception as e: - # If it wasn’t our own cancellation, record the error - if stop_event.is_set(): - logger.info("Upload of %s aborted due to timeout", source_name) - else: - self._key_value_store.upsert(source_name, Status.ERROR) - logger.error("Error while uploading %s = %s", source_name, str(e)) + self._key_value_store.upsert(source_name, Status.ERROR) + logger.error("Error while uploading %s = %s", source_name, str(e)) diff --git a/admin-api-lib/tests/default_source_uploader_test.py b/admin-api-lib/tests/default_source_uploader_test.py index edfa823..51a2963 100644 --- a/admin-api-lib/tests/default_source_uploader_test.py +++ b/admin-api-lib/tests/default_source_uploader_test.py @@ -1,3 +1,4 @@ +import asyncio import pytest from unittest.mock import AsyncMock, MagicMock from fastapi import 
HTTPException @@ -88,20 +89,6 @@ async def test_upload_source_already_processing_raises_error(mocks): await uploader.upload_source("http://base", source_type, name, []) key_value_store.upsert.assert_any_call(source_name, Status.ERROR) - -@pytest.mark.asyncio -async def test_upload_source_not_processing_starts_thread(mocks, monkeypatch): - extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks - key_value_store.get_all.return_value = [] - dummy_thread = MagicMock() - monkeypatch.setattr('admin_api_lib.impl.api_endpoints.default_source_uploader.Thread', lambda *args, **kwargs: dummy_thread) - uploader = DefaultSourceUploader( - extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper - ) - await uploader.upload_source("http://base", "typeY", "nameY", []) - key_value_store.upsert.assert_any_call(f"typeY:{sanitize_document_name('nameY')}", Status.PROCESSING) - dummy_thread.start.assert_called_once() - @pytest.mark.asyncio async def test_upload_source_no_timeout(mocks, monkeypatch): extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks @@ -109,21 +96,19 @@ async def test_upload_source_no_timeout(mocks, monkeypatch): source_type = "typeZ" name = "quick" source_name = f"{source_type}:{sanitize_document_name(name)}" - # dummy thread that finishes before timeout + # patch Thread so no actual background work is done dummy_thread = MagicMock() - dummy_thread.is_alive.return_value = False - monkeypatch.setattr( - 'admin_api_lib.impl.api_endpoints.default_source_uploader.Thread', - lambda *args, **kwargs: dummy_thread - ) + monkeypatch.setattr(default_source_uploader, 'Thread', lambda *args, **kwargs: dummy_thread) uploader = DefaultSourceUploader( extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper ) # should not raise - await 
uploader.upload_source("http://base", source_type, name, []) + await uploader.upload_source("http://base", source_type, name, [], timeout=1.0) # only PROCESSING status upserted, no ERROR - key_value_store.upsert.assert_any_call(source_name, Status.PROCESSING) + assert any(call.args[1] == Status.PROCESSING for call in key_value_store.upsert.call_args_list) assert not any(call.args[1] == Status.ERROR for call in key_value_store.upsert.call_args_list) + dummy_thread.start.assert_called_once() + @pytest.mark.asyncio async def test_upload_source_timeout_error(mocks, monkeypatch): @@ -132,14 +117,31 @@ async def test_upload_source_timeout_error(mocks, monkeypatch): source_type = "typeTimeout" name = "slow" source_name = f"{source_type}:{sanitize_document_name(name)}" - # simulate slow thread sleeping 2s; patch timeout to 1s - def slow_thread_factory(*args, **kwargs): - return threading.Thread(target=lambda: time.sleep(2), daemon=True) - monkeypatch.setattr(default_source_uploader, 'Thread', slow_thread_factory) + # monkey-patch the handler to sleep so that timeout triggers + async def fake_handle(self, source_name_arg, source_type_arg, kwargs_arg): + await asyncio.sleep(3600) + # patch handler and Thread to trigger timeout synchronously + monkeypatch.setattr( + default_source_uploader.DefaultSourceUploader, + '_handle_source_upload', + fake_handle + ) + def FakeThread(target, args=(), **kwargs): + # this ensures serial execution, so that the error status can be checked + class T: + def start(self_inner): + target(*args) + def is_alive(self_inner): + return False + return T() + monkeypatch.setattr(default_source_uploader, 'Thread', FakeThread) uploader = DefaultSourceUploader( extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper ) - with pytest.raises(HTTPException) as exc: - await uploader.upload_source("http://base", source_type, name, [], timeout=1.0) - assert "timed out" in exc.value.detail - 
key_value_store.upsert.assert_any_call(source_name, Status.ERROR) + # no exception should be raised; timeout path sets ERROR status + + await uploader.upload_source("http://base", source_type, name, [], timeout=1.0) + # first call marks PROCESSING, second marks ERROR + calls = [call.args for call in key_value_store.upsert.call_args_list] + assert (source_name, Status.PROCESSING) in calls + assert (source_name, Status.ERROR) in calls From a46b4fdb5c0f3681d65c6e6424788803f9d407d4 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Wed, 28 May 2025 12:55:36 +0200 Subject: [PATCH 28/43] feat: add timeout parameter to file and source upload methods and enhance documentation --- .../api_endpoints/file_uploader.py | 4 + .../api_endpoints/source_uploader.py | 7 +- .../src/admin_api_lib/apis/admin_api.py | 27 +++- .../src/admin_api_lib/apis/admin_api_base.py | 10 +- .../src/admin_api_lib/impl/admin_api.py | 5 +- .../api_endpoints/default_file_uploader.py | 34 ++++- .../api_endpoints/default_source_uploader.py | 22 ++- .../tests/default_file_uploader_test.py | 142 ++++++++++++++++++ .../tests/default_source_uploader_test.py | 7 +- 9 files changed, 233 insertions(+), 25 deletions(-) create mode 100644 admin-api-lib/tests/default_file_uploader_test.py diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py b/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py index b8594c7..d146a1b 100644 --- a/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py +++ b/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py @@ -1,6 +1,7 @@ """Module for the upload file endpoint.""" from abc import ABC, abstractmethod +from typing import Optional from fastapi import UploadFile @@ -12,6 +13,7 @@ async def upload_file( self, base_url: str, file: UploadFile, + timeout: Optional[float], ) -> None: """ Uploads a source file for content extraction. @@ -22,6 +24,8 @@ async def upload_file( The base url of the service. 
Is used to determine the download link of the file. file : UploadFile The file to process. + timeout : float, optional + Timeout for the operation. Returns ------- diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py b/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py index f4b4e03..95c9d6e 100644 --- a/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py @@ -1,6 +1,7 @@ """Module for the upload source endpoint.""" from abc import ABC, abstractmethod +from typing import Optional from pydantic import StrictStr @@ -13,24 +14,24 @@ class SourceUploader(ABC): @abstractmethod async def upload_source( self, - base_url: str, source_type: StrictStr, name: StrictStr, kwargs: list[KeyValuePair], + timeout: Optional[float], ) -> None: """ Uploads the parameters for source content extraction. Parameters ---------- - base_url : str - The base url of the service. Is used to determine the download link of the source. source_type : str The type of the source. Is used by the extractor service to determine the correct extraction method. name : str Display name of the source. kwargs : list[KeyValuePair] List of KeyValuePair with parameters used for the extraction. + timeout : float, optional + Timeout for the operation. Returns ------- diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api.py b/admin-api-lib/src/admin_api_lib/apis/admin_api.py index 5a332be..a323bd6 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api.py @@ -148,7 +148,16 @@ async def upload_file( file: UploadFile, request: Request, ) -> None: - """Uploads user selected sources.""" + """ + Uploads user selected sources. + + Parameters + ---------- + file : UploadFile + The file to be uploaded. + request : Request + The HTTP request object containing metadata about the upload request. 
+ """ if not BaseAdminApi.subclasses: raise HTTPException(status_code=500, detail="Not implemented") return await BaseAdminApi.subclasses[0]().upload_file(file, request) @@ -167,12 +176,22 @@ async def upload_file( response_model_by_alias=True, ) async def upload_source( - request: Request, source_type: StrictStr = Query(None, description="", alias="type"), name: StrictStr = Query(None, description="", alias="name"), key_value_pair: List[KeyValuePair] = Body(None, description=""), ) -> None: - """Uploads user selected sources.""" + """ + Uploads user selected sources. + + Parameters + ---------- + source_type : str + The type of the source. Is used by the extractor service to determine the correct extractor to use. + name : str + Display name of the source. + key_value_pair : List[KeyValuePair] + List of KeyValuePair with parameters used for the extraction. + """ if not BaseAdminApi.subclasses: raise HTTPException(status_code=500, detail="Not implemented") - return await BaseAdminApi.subclasses[0]().upload_source(source_type, name, key_value_pair, request) + return await BaseAdminApi.subclasses[0]().upload_source(source_type, name, key_value_pair) diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py index 432c457..e3841b9 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py @@ -81,11 +81,19 @@ async def upload_source( source_type: StrictStr, name: StrictStr, key_value_pair: List[KeyValuePair], - request: Request, ) -> None: """ Asynchronously uploads user selected source. + Parameters + ---------- + source_type : str + The type of the source. Is used by the extractor service to determine the correct extractor to use. + name : str + Display name of the source. + key_value_pair : list[KeyValuePair] + List of KeyValuePair with parameters used for the extraction. 
+ Returns ------- None diff --git a/admin-api-lib/src/admin_api_lib/impl/admin_api.py b/admin-api-lib/src/admin_api_lib/impl/admin_api.py index 08cc550..4ecdd4c 100644 --- a/admin-api-lib/src/admin_api_lib/impl/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/impl/admin_api.py @@ -93,7 +93,6 @@ async def upload_source( source_type: StrictStr, name: StrictStr, kwargs: list[KeyValuePair], - request: Request, source_uploader: SourceUploader = Depends(Provide[DependencyContainer.source_uploader]), ) -> None: """ @@ -107,8 +106,6 @@ async def upload_source( The name of the source document to be uploaded. kwargs : list[KeyValuePair] Additional parameters required for the extractor. - request : Request - The HTTP request object containing metadata about the upload request. source_uploader : SourceUploader An instance of SourceUploader to handle the upload process. @@ -116,7 +113,7 @@ async def upload_source( ------- None """ - await source_uploader.upload_source(str(request.base_url), source_type, name, kwargs) + await source_uploader.upload_source(source_type, name, kwargs) @inject async def upload_file( diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py index fed469d..5a61b02 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py @@ -1,13 +1,14 @@ -from http.client import HTTPException +import asyncio import logging from pathlib import Path import traceback from threading import Thread +from typing import Optional import urllib import tempfile from contextlib import suppress -from fastapi import UploadFile, status +from fastapi import UploadFile, status, HTTPException from langchain_core.documents import Document from asyncio import run @@ -77,6 +78,7 @@ async def upload_file( self, base_url: str, file: UploadFile, + timeout: 
Optional[float] = 3600.0, ) -> None: """ Uploads a source file for content extraction. @@ -99,13 +101,9 @@ async def upload_file( file.filename = sanitize_document_name(file.filename) source_name = f"file:{sanitize_document_name(file.filename)}" self._check_if_already_in_processing(source_name) - self._key_value_store.upsert( - source_name, Status.PROCESSING - ) # TODO: change to pipeline with timeout to error status + self._key_value_store.upsert(source_name, Status.PROCESSING) s3_path = await self._asave_new_document(content, file.filename, source_name) - thread = Thread( - target=lambda: run(self._handle_source_upload(s3_path, source_name, file.filename, base_url)) - ) + thread = Thread(target=self._thread_worker, args=(s3_path, source_name, file.filename, base_url, timeout)) thread.start() self._background_threads.append(thread) except ValueError as e: @@ -138,6 +136,25 @@ def _check_if_already_in_processing(self, source_name: str) -> None: if any(s == Status.PROCESSING for s in existing): raise ValueError(f"Document {source_name} is already in processing state") + def _thread_worker(self,s3_path, source_name, filename, base_url, timeout): + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + try: + loop.run_until_complete( + asyncio.wait_for( + self._handle_source_upload(s3_path, source_name, filename, base_url), + timeout=timeout + ) + ) + except asyncio.TimeoutError: + logger.error("Upload of %s timed out after %s seconds", source_name, timeout) + self._key_value_store.upsert(source_name, Status.ERROR) + except Exception as e: + logger.exception("Error while uploading %s", source_name) + self._key_value_store.upsert(source_name, Status.ERROR) + finally: + loop.close() + async def _handle_source_upload( self, s3_path: Path, @@ -153,6 +170,7 @@ async def _handle_source_upload( if not information_pieces: self._key_value_store.upsert(source_name, Status.ERROR) logger.error("No information pieces found in the document: %s", source_name) + raise 
Exception("No information pieces found") documents = [self._information_mapper.extractor_information_piece2document(x) for x in information_pieces] chunked_documents = self._chunker.chunk(documents) diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py index c0d1389..c91fd75 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py @@ -70,12 +70,30 @@ def __init__( async def upload_source( self, - base_url: str, source_type: StrictStr, name: StrictStr, kwargs: list[KeyValuePair], timeout: float = 3600.0, ) -> None: + """ + Uploads the parameters for source content extraction. + + Parameters + ---------- + source_type : str + The type of the source. Is used by the extractor service to determine the correct extraction method. + name : str + Display name of the source. + kwargs : list[KeyValuePair] + List of KeyValuePair with parameters used for the extraction. + timeout : float, optional + Timeout for the operation, by default 3600.0 seconds (1 hour). 
+ + Returns + ------- + None + """ + self._background_threads = [t for t in self._background_threads if t.is_alive()] source_name = f"{source_type}:{sanitize_document_name(name)}" @@ -158,7 +176,7 @@ async def _handle_source_upload( if not information_pieces: self._key_value_store.upsert(source_name, Status.ERROR) logger.error("No information pieces found in the document: %s", source_name) - return + raise Exception("No information pieces found") documents: list[Document] = [] for piece in information_pieces: documents.append(self._information_mapper.extractor_information_piece2document(piece)) diff --git a/admin-api-lib/tests/default_file_uploader_test.py b/admin-api-lib/tests/default_file_uploader_test.py new file mode 100644 index 0000000..8cceb14 --- /dev/null +++ b/admin-api-lib/tests/default_file_uploader_test.py @@ -0,0 +1,142 @@ +import asyncio +import pytest +from unittest.mock import AsyncMock, MagicMock +from fastapi import HTTPException +from fastapi import UploadFile +import threading, time + +from admin_api_lib.impl.api_endpoints.default_file_uploader import DefaultFileUploader +from admin_api_lib.models.status import Status +from admin_api_lib.utils.utils import sanitize_document_name +from admin_api_lib.impl.api_endpoints import default_file_uploader + +@ pytest.fixture +def mocks(): + extractor_api = MagicMock() + key_value_store = MagicMock() + key_value_store.get_all.return_value = [] + information_enhancer = MagicMock() + information_enhancer.ainvoke = AsyncMock() + chunker = MagicMock() + document_deleter = MagicMock() + document_deleter.adelete_document = AsyncMock() + rag_api = MagicMock() + information_mapper = MagicMock() + return extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper + +@ pytest.mark.asyncio +async def test_handle_file_upload_success(mocks): + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks + # setup 
mocks + dummy_piece = MagicMock() + extractor_api.extract_from_file_post.return_value = [dummy_piece] + dummy_doc = MagicMock() + information_mapper.extractor_information_piece2document.return_value = dummy_doc + chunker.chunk.return_value = [dummy_doc] + information_enhancer.ainvoke.return_value = [dummy_doc] + dummy_rag = {"foo": "bar"} + information_mapper.document2rag_information_piece.return_value = dummy_rag + + uploader = DefaultFileUploader( + extractor_api, key_value_store, information_enhancer, chunker, + document_deleter, rag_api, information_mapper, file_service=MagicMock() + ) + + await uploader._handle_source_upload("s3path", "file:doc1", "doc1.txt", "http://base") + + key_value_store.upsert.assert_any_call("file:doc1", Status.READY) + rag_api.upload_information_piece.assert_called_once_with([dummy_rag]) + document_deleter.adelete_document.assert_awaited_once_with("file:doc1") + +@ pytest.mark.asyncio +async def test_handle_file_upload_no_info_pieces(mocks): + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks + extractor_api.extract_from_file_post.return_value = [] + + uploader = DefaultFileUploader( + extractor_api, key_value_store, information_enhancer, chunker, + document_deleter, rag_api, information_mapper, file_service=MagicMock() + ) + filename = "file:doc2" + await uploader._handle_source_upload("s3path", filename, "doc2.txt", "http://base") + + key_value_store.upsert.assert_any_call(filename, Status.ERROR) + information_mapper.extractor_information_piece2document.assert_not_called() + rag_api.upload_information_piece.assert_not_called() + +@ pytest.mark.asyncio +async def test_upload_file_already_processing_raises_error(mocks): + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks + base_url = "http://base" + file = MagicMock(spec=UploadFile) + file.filename = "doc3.txt" + file.read = AsyncMock(return_value=b"") 
+ source_name = f"file:{sanitize_document_name(file.filename)}" + key_value_store.get_all.return_value = [(source_name, Status.PROCESSING)] + + uploader = DefaultFileUploader( + extractor_api, key_value_store, information_enhancer, chunker, + document_deleter, rag_api, information_mapper, file_service=MagicMock() + ) + + with pytest.raises(HTTPException): + await uploader.upload_file(base_url, file) + key_value_store.upsert.assert_any_call(source_name, Status.ERROR) + +@ pytest.mark.asyncio +async def test_upload_file_starts_thread(mocks, monkeypatch): + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks + base_url = "http://base" + file = MagicMock(spec=UploadFile) + file.filename = "doc4.txt" + file.read = AsyncMock(return_value=b"content") + key_value_store.get_all.return_value = [] + source_name = f"file:{sanitize_document_name(file.filename)}" + + dummy_thread = MagicMock() + monkeypatch.setattr(default_file_uploader, 'Thread', lambda *args, **kwargs: dummy_thread) + + uploader = DefaultFileUploader( + extractor_api, key_value_store, information_enhancer, chunker, + document_deleter, rag_api, information_mapper, file_service=MagicMock() + ) + + await uploader.upload_file(base_url, file) + + key_value_store.upsert.assert_any_call(source_name, Status.PROCESSING) + dummy_thread.start.assert_called_once() + +@ pytest.mark.asyncio +async def test_upload_file_timeout_error(mocks, monkeypatch): + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks + base_url = "http://base" + file = MagicMock(spec=UploadFile) + file.filename = "slow.txt" + file.read = AsyncMock(return_value=b"") + key_value_store.get_all.return_value = [] + source_name = f"file:{sanitize_document_name(file.filename)}" + + # fast fake handler that sleeps long + async def fake_handle(self, s3_path, source_name_arg, filename, base_url_arg): + await asyncio.sleep(3600) + 
monkeypatch.setattr( + default_file_uploader.DefaultFileUploader, + '_handle_source_upload', + fake_handle + ) + def FakeThread(target, args=(), **kwargs): + class T: + def start(self_inner): target(*args) + def is_alive(self_inner): return False + return T() + monkeypatch.setattr(default_file_uploader, 'Thread', FakeThread) + + uploader = DefaultFileUploader( + extractor_api, key_value_store, information_enhancer, chunker, + document_deleter, rag_api, information_mapper, file_service=MagicMock() + ) + + await uploader.upload_file(base_url, file, timeout=0.1) + calls = [c.args for c in key_value_store.upsert.call_args_list] + assert (source_name, Status.PROCESSING) in calls + assert (source_name, Status.ERROR) in calls diff --git a/admin-api-lib/tests/default_source_uploader_test.py b/admin-api-lib/tests/default_source_uploader_test.py index 51a2963..9210a0c 100644 --- a/admin-api-lib/tests/default_source_uploader_test.py +++ b/admin-api-lib/tests/default_source_uploader_test.py @@ -86,7 +86,8 @@ async def test_upload_source_already_processing_raises_error(mocks): extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper ) with pytest.raises(HTTPException): - await uploader.upload_source("http://base", source_type, name, []) + # use default timeout + await uploader.upload_source(source_type, name, []) key_value_store.upsert.assert_any_call(source_name, Status.ERROR) @pytest.mark.asyncio @@ -103,7 +104,7 @@ async def test_upload_source_no_timeout(mocks, monkeypatch): extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper ) # should not raise - await uploader.upload_source("http://base", source_type, name, [], timeout=1.0) + await uploader.upload_source(source_type, name, [], timeout=1.0) # only PROCESSING status upserted, no ERROR assert any(call.args[1] == Status.PROCESSING for call in key_value_store.upsert.call_args_list) assert not any(call.args[1] == 
Status.ERROR for call in key_value_store.upsert.call_args_list) @@ -140,7 +141,7 @@ def is_alive(self_inner): ) # no exception should be raised; timeout path sets ERROR status - await uploader.upload_source("http://base", source_type, name, [], timeout=1.0) + await uploader.upload_source(source_type, name, [], timeout=1.0) # first call marks PROCESSING, second marks ERROR calls = [call.args for call in key_value_store.upsert.call_args_list] assert (source_name, Status.PROCESSING) in calls From e7599d16bd239fe1e7036da3f611523395736000 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Mon, 2 Jun 2025 07:36:05 +0200 Subject: [PATCH 29/43] feat: implement UploaderBase class and enhance document deletion logic with optional key-value store removal --- admin-api-lib/docs/thread_management.md | 0 .../examples/thread_management_example.py | 0 admin-api-lib/pyproject.toml | 2 +- .../api_endpoints/document_deleter.py | 4 +- .../api_endpoints/file_uploader.py | 6 +- .../api_endpoints/source_uploader.py | 7 +- .../api_endpoints/uploader_base.py | 30 ++++++ .../api_endpoints/default_document_deleter.py | 7 +- .../api_endpoints/default_file_uploader.py | 35 ++---- .../api_endpoints/default_source_uploader.py | 41 +++---- .../managed_page_summary_enhancer.py | 0 .../models/http_validation_error.py | 1 + .../admin_api_lib/utils/thread_diagnostics.py | 0 .../tests/default_file_uploader_test.py | 100 +++++++++--------- .../tests/default_source_uploader_test.py | 28 ++--- .../tests/test_confluence_integration.py | 0 admin-api-lib/tests/test_thread_management.py | 0 extractor-api-lib/poetry.lock | 14 +-- extractor-api-lib/pyproject.toml | 2 +- .../apis/extractor_api_base.py | 2 +- .../api_endpoints/general_source_extractor.py | 4 +- rag-core-lib/poetry.lock | 14 +-- rag-core-lib/pyproject.toml | 1 + 23 files changed, 159 insertions(+), 139 deletions(-) create mode 100644 admin-api-lib/docs/thread_management.md create mode 100644 admin-api-lib/examples/thread_management_example.py 
create mode 100644 admin-api-lib/src/admin_api_lib/api_endpoints/uploader_base.py create mode 100644 admin-api-lib/src/admin_api_lib/impl/information_enhancer/managed_page_summary_enhancer.py create mode 100644 admin-api-lib/src/admin_api_lib/utils/thread_diagnostics.py create mode 100644 admin-api-lib/tests/test_confluence_integration.py create mode 100644 admin-api-lib/tests/test_thread_management.py diff --git a/admin-api-lib/docs/thread_management.md b/admin-api-lib/docs/thread_management.md new file mode 100644 index 0000000..e69de29 diff --git a/admin-api-lib/examples/thread_management_example.py b/admin-api-lib/examples/thread_management_example.py new file mode 100644 index 0000000..e69de29 diff --git a/admin-api-lib/pyproject.toml b/admin-api-lib/pyproject.toml index ec0de57..2668032 100644 --- a/admin-api-lib/pyproject.toml +++ b/admin-api-lib/pyproject.toml @@ -29,7 +29,7 @@ per-file-ignores = """ ./src/admin_api_lib/impl/admin_api.py: B008, ./src/admin_api_lib/dependency_container.py: CCE002,CCE001, ./src/admin_api_lib/apis/admin_api_base.py: WOT001, - ./tests/*: S101,S106,D100,D103,PT011 + ./tests/*: S101,S106,D100,D103,PT011,N802 ./src/admin_api_lib/impl/settings/confluence_settings.py: C901,N805, ./src/admin_api_lib/impl/utils/comma_separated_bool_list.py: R505, ./src/admin_api_lib/impl/utils/comma_separated_str_list.py: R505, diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/document_deleter.py b/admin-api-lib/src/admin_api_lib/api_endpoints/document_deleter.py index 155baf0..3f222bc 100644 --- a/admin-api-lib/src/admin_api_lib/api_endpoints/document_deleter.py +++ b/admin-api-lib/src/admin_api_lib/api_endpoints/document_deleter.py @@ -7,7 +7,7 @@ class DocumentDeleter(ABC): """Abstract base class for document deletion endpoint.""" @abstractmethod - async def adelete_document(self, identification: str) -> None: + async def adelete_document(self, identification: str, remove_from_key_value_store: bool = True) -> None: """ Delete a document by 
its identification asynchronously. @@ -15,6 +15,8 @@ async def adelete_document(self, identification: str) -> None: ---------- identification : str The unique identifier of the document to be deleted. + remove_from_key_value_store : bool, optional + If True, the document will also be removed from the key-value store (default is True). Returns ------- diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py b/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py index d146a1b..f45636e 100644 --- a/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py +++ b/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py @@ -1,12 +1,14 @@ """Module for the upload file endpoint.""" -from abc import ABC, abstractmethod +from abc import abstractmethod from typing import Optional from fastapi import UploadFile +from admin_api_lib.api_endpoints.uploader_base import UploaderBase -class FileUploader(ABC): + +class FileUploader(UploaderBase): @abstractmethod async def upload_file( diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py b/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py index 95c9d6e..5a1c50a 100644 --- a/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py @@ -1,15 +1,16 @@ """Module for the upload source endpoint.""" -from abc import ABC, abstractmethod +from abc import abstractmethod from typing import Optional from pydantic import StrictStr +from admin_api_lib.api_endpoints.uploader_base import UploaderBase from admin_api_lib.models.key_value_pair import KeyValuePair -class SourceUploader(ABC): - """Abstract base class for source upload.""" +class SourceUploader(UploaderBase): + """Abstract base class for source uploader API endpoints.""" @abstractmethod async def upload_source( diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/uploader_base.py 
b/admin-api-lib/src/admin_api_lib/api_endpoints/uploader_base.py new file mode 100644 index 0000000..a344dcc --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/api_endpoints/uploader_base.py @@ -0,0 +1,30 @@ +"""Module for the base class of uploader API endpoints.""" + +from threading import Thread + + +class UploaderBase: + """Base class for uploader API endpoints.""" + + def __init__(self): + """ + Initialize the UploaderBase. + """ + self._background_threads = [] + + def _prune_background_threads(self) -> list[Thread]: + """ + Prune background threads that are no longer running. + + Returns + ------- + list[Thread] + A list of background threads that are still alive. + """ + tmp_background_threads = [] + for thread in self._background_threads: + if not thread.is_alive(): + thread.join() + else: + tmp_background_threads.append(thread) + self._background_threads = tmp_background_threads diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_document_deleter.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_document_deleter.py index 9f3c414..3cf671f 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_document_deleter.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_document_deleter.py @@ -41,7 +41,7 @@ def __init__(self, file_service: FileService, rag_api: RagApi, key_value_store: self._rag_api = rag_api self._key_value_store = key_value_store - async def adelete_document(self, identification: str) -> None: + async def adelete_document(self, identification: str, remove_from_key_value_store: bool = True) -> None: """ Asynchronously delete a document identified by the given identification string. @@ -55,6 +55,8 @@ async def adelete_document(self, identification: str) -> None: ---------- identification : str The unique identifier of the document to be deleted. + remove_from_key_value_store : bool, optional + If True, the document will also be removed from the key-value store (default is True). 
Raises ------ @@ -66,7 +68,8 @@ async def adelete_document(self, identification: str) -> None: # Delete the document from file service and vector database logger.debug("Deleting existing document: %s", identification) try: - self._key_value_store.remove(identification) + if remove_from_key_value_store: + self._key_value_store.remove(identification) self._file_service.delete_file(identification) except Exception as e: error_messages += f"Error while deleting {identification} from file storage\n {str(e)}\n" diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py index 5a61b02..b558f11 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py @@ -1,4 +1,3 @@ -import asyncio import logging from pathlib import Path import traceback @@ -64,6 +63,7 @@ def __init__( file_service : FileService The service for handling file operations on the S3 storage """ + super().__init__() self._extractor_api = extractor_api self._rag_api = rag_api self._key_value_store = key_value_store @@ -94,16 +94,18 @@ async def upload_file( ------- None """ - self._background_threads = [t for t in self._background_threads if t.is_alive()] + self._prune_background_threads() try: - content = await file.read() file.filename = sanitize_document_name(file.filename) source_name = f"file:{sanitize_document_name(file.filename)}" self._check_if_already_in_processing(source_name) self._key_value_store.upsert(source_name, Status.PROCESSING) + content = await file.read() s3_path = await self._asave_new_document(content, file.filename, source_name) - thread = Thread(target=self._thread_worker, args=(s3_path, source_name, file.filename, base_url, timeout)) + thread = Thread( + target=lambda: run(self._handle_source_upload(s3_path, source_name, file.filename, base_url)) + ) #TODO: add timeout. 
same logic like in default_source_uploader leaded to strange behavior thread.start() self._background_threads.append(thread) except ValueError as e: @@ -136,25 +138,6 @@ def _check_if_already_in_processing(self, source_name: str) -> None: if any(s == Status.PROCESSING for s in existing): raise ValueError(f"Document {source_name} is already in processing state") - def _thread_worker(self,s3_path, source_name, filename, base_url, timeout): - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - try: - loop.run_until_complete( - asyncio.wait_for( - self._handle_source_upload(s3_path, source_name, filename, base_url), - timeout=timeout - ) - ) - except asyncio.TimeoutError: - logger.error("Upload of %s timed out after %s seconds", source_name, timeout) - self._key_value_store.upsert(source_name, Status.ERROR) - except Exception as e: - logger.exception("Error while uploading %s", source_name) - self._key_value_store.upsert(source_name, Status.ERROR) - finally: - loop.close() - async def _handle_source_upload( self, s3_path: Path, @@ -171,7 +154,9 @@ async def _handle_source_upload( self._key_value_store.upsert(source_name, Status.ERROR) logger.error("No information pieces found in the document: %s", source_name) raise Exception("No information pieces found") - documents = [self._information_mapper.extractor_information_piece2document(x) for x in information_pieces] + documents: list[Document] = [] + for piece in information_pieces: + documents.append(self._information_mapper.extractor_information_piece2document(piece)) chunked_documents = self._chunker.chunk(documents) @@ -184,7 +169,7 @@ async def _handle_source_upload( # Replace old document # deletion is allowed to fail with suppress(Exception): - await self._document_deleter.adelete_document(source_name) + await self._document_deleter.adelete_document(source_name, remove_from_key_value_store=False) self._rag_api.upload_information_piece(rag_information_pieces) self._key_value_store.upsert(source_name, 
Status.READY) diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py index c91fd75..71b09de 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py @@ -1,8 +1,6 @@ - -from concurrent.futures import ThreadPoolExecutor, as_completed, TimeoutError import logging import asyncio -from threading import Thread, Event +from threading import Thread from contextlib import suppress from pydantic import StrictStr @@ -27,6 +25,7 @@ logger = logging.getLogger(__name__) + class DefaultSourceUploader(SourceUploader): def __init__( @@ -59,6 +58,7 @@ def __init__( information_mapper : InformationPiece2Document The mapper for converting information pieces to langchain documents. """ + super().__init__() self._extractor_api = extractor_api self._rag_api = rag_api self._key_value_store = key_value_store @@ -94,7 +94,7 @@ async def upload_source( None """ - self._background_threads = [t for t in self._background_threads if t.is_alive()] + self._prune_background_threads() source_name = f"{source_type}:{sanitize_document_name(name)}" try: @@ -106,16 +106,11 @@ async def upload_source( self._background_threads.append(thread) except ValueError as e: self._key_value_store.upsert(source_name, Status.ERROR) - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, detail=str(e) - ) + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(e)) except Exception as e: self._key_value_store.upsert(source_name, Status.ERROR) logger.error("Error while uploading %s = %s", source_name, str(e)) - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e) - ) - + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e)) def _check_if_already_in_processing(self, source_name: str) -> None: """ 
@@ -139,21 +134,21 @@ def _check_if_already_in_processing(self, source_name: str) -> None: if any(s == Status.PROCESSING for s in existing): raise ValueError(f"Document {source_name} is already in processing state") - def _thread_worker(self,source_name, source_type, kwargs, timeout): + def _thread_worker(self, source_name, source_type, kwargs, timeout): loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) try: loop.run_until_complete( asyncio.wait_for( self._handle_source_upload(source_name=source_name, source_type=source_type, kwargs=kwargs), - timeout=timeout + timeout=timeout, ) ) except asyncio.TimeoutError: logger.error("Upload of %s timed out after %s seconds", source_name, timeout) self._key_value_store.upsert(source_name, Status.ERROR) - except Exception as e: - logger.exception("Error while uploading %s", source_name) + except Exception: + logger.error("Error while uploading %s", source_name) self._key_value_store.upsert(source_name, Status.ERROR) finally: loop.close() @@ -167,9 +162,7 @@ async def _handle_source_upload( try: information_pieces = self._extractor_api.extract_from_source( ExtractionParameters( - source_type=source_type, - document_name=source_name, - kwargs=[x.to_dict() for x in kwargs] + source_type=source_type, document_name=source_name, kwargs=[x.to_dict() for x in kwargs] ) ) @@ -183,19 +176,19 @@ async def _handle_source_upload( chunked_documents = self._chunker.chunk(documents) - enhanced_documents = await self._information_enhancer.ainvoke(chunked_documents) + # limit concurrency to avoid spawning multiple threads per call + enhanced_documents = await self._information_enhancer.ainvoke( + chunked_documents, config={"max_concurrency": 1} + ) rag_information_pieces: list[RagInformationPiece] = [] for doc in enhanced_documents: - rag_information_pieces.append( - self._information_mapper.document2rag_information_piece(doc) - ) + rag_information_pieces.append(self._information_mapper.document2rag_information_piece(doc)) with 
suppress(Exception): - await self._document_deleter.adelete_document(source_name) + await self._document_deleter.adelete_document(source_name, remove_from_key_value_store=False) self._rag_api.upload_information_piece(rag_information_pieces) - self._key_value_store.upsert(source_name, Status.READY) logger.info("Source uploaded successfully: %s", source_name) except Exception as e: diff --git a/admin-api-lib/src/admin_api_lib/impl/information_enhancer/managed_page_summary_enhancer.py b/admin-api-lib/src/admin_api_lib/impl/information_enhancer/managed_page_summary_enhancer.py new file mode 100644 index 0000000..e69de29 diff --git a/admin-api-lib/src/admin_api_lib/models/http_validation_error.py b/admin-api-lib/src/admin_api_lib/models/http_validation_error.py index 28c83f0..7d5feeb 100644 --- a/admin-api-lib/src/admin_api_lib/models/http_validation_error.py +++ b/admin-api-lib/src/admin_api_lib/models/http_validation_error.py @@ -49,6 +49,7 @@ def to_str(self) -> str: def to_json(self) -> str: """Returns the JSON representation of the model using alias""" return self.model_dump_json(by_alias=True, exclude_unset=True) + @classmethod def from_json(cls, json_str: str) -> Self: """Create an instance of HTTPValidationError from a JSON string""" diff --git a/admin-api-lib/src/admin_api_lib/utils/thread_diagnostics.py b/admin-api-lib/src/admin_api_lib/utils/thread_diagnostics.py new file mode 100644 index 0000000..e69de29 diff --git a/admin-api-lib/tests/default_file_uploader_test.py b/admin-api-lib/tests/default_file_uploader_test.py index 8cceb14..19318e9 100644 --- a/admin-api-lib/tests/default_file_uploader_test.py +++ b/admin-api-lib/tests/default_file_uploader_test.py @@ -3,14 +3,14 @@ from unittest.mock import AsyncMock, MagicMock from fastapi import HTTPException from fastapi import UploadFile -import threading, time from admin_api_lib.impl.api_endpoints.default_file_uploader import DefaultFileUploader from admin_api_lib.models.status import Status from 
admin_api_lib.utils.utils import sanitize_document_name from admin_api_lib.impl.api_endpoints import default_file_uploader -@ pytest.fixture + +@pytest.fixture def mocks(): extractor_api = MagicMock() key_value_store = MagicMock() @@ -24,7 +24,8 @@ def mocks(): information_mapper = MagicMock() return extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper -@ pytest.mark.asyncio + +@pytest.mark.asyncio async def test_handle_file_upload_success(mocks): extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks # setup mocks @@ -38,24 +39,39 @@ async def test_handle_file_upload_success(mocks): information_mapper.document2rag_information_piece.return_value = dummy_rag uploader = DefaultFileUploader( - extractor_api, key_value_store, information_enhancer, chunker, - document_deleter, rag_api, information_mapper, file_service=MagicMock() + extractor_api, + key_value_store, + information_enhancer, + chunker, + document_deleter, + rag_api, + information_mapper, + file_service=MagicMock(), ) - await uploader._handle_source_upload("s3path", "file:doc1", "doc1.txt", "http://base") + upload_filename = "file:doc1" + + await uploader._handle_source_upload("s3path", upload_filename, "doc1.txt", "http://base") - key_value_store.upsert.assert_any_call("file:doc1", Status.READY) + key_value_store.upsert.assert_any_call(upload_filename, Status.READY) rag_api.upload_information_piece.assert_called_once_with([dummy_rag]) - document_deleter.adelete_document.assert_awaited_once_with("file:doc1") + document_deleter.adelete_document.assert_awaited_once_with(upload_filename, remove_from_key_value_store=False) -@ pytest.mark.asyncio + +@pytest.mark.asyncio async def test_handle_file_upload_no_info_pieces(mocks): extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks extractor_api.extract_from_file_post.return_value = [] 
uploader = DefaultFileUploader( - extractor_api, key_value_store, information_enhancer, chunker, - document_deleter, rag_api, information_mapper, file_service=MagicMock() + extractor_api, + key_value_store, + information_enhancer, + chunker, + document_deleter, + rag_api, + information_mapper, + file_service=MagicMock(), ) filename = "file:doc2" await uploader._handle_source_upload("s3path", filename, "doc2.txt", "http://base") @@ -64,7 +80,8 @@ async def test_handle_file_upload_no_info_pieces(mocks): information_mapper.extractor_information_piece2document.assert_not_called() rag_api.upload_information_piece.assert_not_called() -@ pytest.mark.asyncio + +@pytest.mark.asyncio async def test_upload_file_already_processing_raises_error(mocks): extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks base_url = "http://base" @@ -75,15 +92,22 @@ async def test_upload_file_already_processing_raises_error(mocks): key_value_store.get_all.return_value = [(source_name, Status.PROCESSING)] uploader = DefaultFileUploader( - extractor_api, key_value_store, information_enhancer, chunker, - document_deleter, rag_api, information_mapper, file_service=MagicMock() + extractor_api, + key_value_store, + information_enhancer, + chunker, + document_deleter, + rag_api, + information_mapper, + file_service=MagicMock(), ) with pytest.raises(HTTPException): await uploader.upload_file(base_url, file) key_value_store.upsert.assert_any_call(source_name, Status.ERROR) -@ pytest.mark.asyncio + +@pytest.mark.asyncio async def test_upload_file_starts_thread(mocks, monkeypatch): extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks base_url = "http://base" @@ -94,11 +118,17 @@ async def test_upload_file_starts_thread(mocks, monkeypatch): source_name = f"file:{sanitize_document_name(file.filename)}" dummy_thread = MagicMock() - monkeypatch.setattr(default_file_uploader, 'Thread', 
lambda *args, **kwargs: dummy_thread) + monkeypatch.setattr(default_file_uploader, "Thread", lambda *args, **kwargs: dummy_thread) uploader = DefaultFileUploader( - extractor_api, key_value_store, information_enhancer, chunker, - document_deleter, rag_api, information_mapper, file_service=MagicMock() + extractor_api, + key_value_store, + information_enhancer, + chunker, + document_deleter, + rag_api, + information_mapper, + file_service=MagicMock(), ) await uploader.upload_file(base_url, file) @@ -106,37 +136,3 @@ async def test_upload_file_starts_thread(mocks, monkeypatch): key_value_store.upsert.assert_any_call(source_name, Status.PROCESSING) dummy_thread.start.assert_called_once() -@ pytest.mark.asyncio -async def test_upload_file_timeout_error(mocks, monkeypatch): - extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks - base_url = "http://base" - file = MagicMock(spec=UploadFile) - file.filename = "slow.txt" - file.read = AsyncMock(return_value=b"") - key_value_store.get_all.return_value = [] - source_name = f"file:{sanitize_document_name(file.filename)}" - - # fast fake handler that sleeps long - async def fake_handle(self, s3_path, source_name_arg, filename, base_url_arg): - await asyncio.sleep(3600) - monkeypatch.setattr( - default_file_uploader.DefaultFileUploader, - '_handle_source_upload', - fake_handle - ) - def FakeThread(target, args=(), **kwargs): - class T: - def start(self_inner): target(*args) - def is_alive(self_inner): return False - return T() - monkeypatch.setattr(default_file_uploader, 'Thread', FakeThread) - - uploader = DefaultFileUploader( - extractor_api, key_value_store, information_enhancer, chunker, - document_deleter, rag_api, information_mapper, file_service=MagicMock() - ) - - await uploader.upload_file(base_url, file, timeout=0.1) - calls = [c.args for c in key_value_store.upsert.call_args_list] - assert (source_name, Status.PROCESSING) in calls - assert (source_name, 
Status.ERROR) in calls diff --git a/admin-api-lib/tests/default_source_uploader_test.py b/admin-api-lib/tests/default_source_uploader_test.py index 9210a0c..9c47416 100644 --- a/admin-api-lib/tests/default_source_uploader_test.py +++ b/admin-api-lib/tests/default_source_uploader_test.py @@ -1,14 +1,16 @@ +# ignore: + import asyncio import pytest from unittest.mock import AsyncMock, MagicMock from fastapi import HTTPException -import threading, time from admin_api_lib.impl.api_endpoints.default_source_uploader import DefaultSourceUploader from admin_api_lib.models.status import Status from admin_api_lib.utils.utils import sanitize_document_name from admin_api_lib.impl.api_endpoints import default_source_uploader + @pytest.fixture def mocks(): extractor_api = MagicMock() @@ -51,7 +53,7 @@ async def test_handle_source_upload_success(mocks): key_value_store.upsert.assert_any_call("source1", Status.READY) rag_api.upload_information_piece.assert_called_once_with([dummy_rag_piece]) - document_deleter.adelete_document.assert_awaited_once_with("source1") + document_deleter.adelete_document.assert_awaited_once_with("source1", remove_from_key_value_store=False) @pytest.mark.asyncio @@ -90,16 +92,16 @@ async def test_upload_source_already_processing_raises_error(mocks): await uploader.upload_source(source_type, name, []) key_value_store.upsert.assert_any_call(source_name, Status.ERROR) + @pytest.mark.asyncio async def test_upload_source_no_timeout(mocks, monkeypatch): extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks key_value_store.get_all.return_value = [] source_type = "typeZ" name = "quick" - source_name = f"{source_type}:{sanitize_document_name(name)}" # patch Thread so no actual background work is done dummy_thread = MagicMock() - monkeypatch.setattr(default_source_uploader, 'Thread', lambda *args, **kwargs: dummy_thread) + monkeypatch.setattr(default_source_uploader, "Thread", lambda *args, **kwargs: 
dummy_thread) uploader = DefaultSourceUploader( extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper ) @@ -118,24 +120,26 @@ async def test_upload_source_timeout_error(mocks, monkeypatch): source_type = "typeTimeout" name = "slow" source_name = f"{source_type}:{sanitize_document_name(name)}" + # monkey-patch the handler to sleep so that timeout triggers async def fake_handle(self, source_name_arg, source_type_arg, kwargs_arg): await asyncio.sleep(3600) + # patch handler and Thread to trigger timeout synchronously - monkeypatch.setattr( - default_source_uploader.DefaultSourceUploader, - '_handle_source_upload', - fake_handle - ) + monkeypatch.setattr(default_source_uploader.DefaultSourceUploader, "_handle_source_upload", fake_handle) + def FakeThread(target, args=(), **kwargs): # this ensures serial execution, so that the error status can be checked class T: - def start(self_inner): + def start(self): target(*args) - def is_alive(self_inner): + + def is_alive(self): return False + return T() - monkeypatch.setattr(default_source_uploader, 'Thread', FakeThread) + + monkeypatch.setattr(default_source_uploader, "Thread", FakeThread) uploader = DefaultSourceUploader( extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper ) diff --git a/admin-api-lib/tests/test_confluence_integration.py b/admin-api-lib/tests/test_confluence_integration.py new file mode 100644 index 0000000..e69de29 diff --git a/admin-api-lib/tests/test_thread_management.py b/admin-api-lib/tests/test_thread_management.py new file mode 100644 index 0000000..e69de29 diff --git a/extractor-api-lib/poetry.lock b/extractor-api-lib/poetry.lock index 0da6009..c750e96 100644 --- a/extractor-api-lib/poetry.lock +++ b/extractor-api-lib/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand. 
+# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand. [[package]] name = "aiofiles" @@ -1933,21 +1933,21 @@ tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<10" [[package]] name = "langchain-core" -version = "0.3.58" +version = "0.3.63" description = "Building applications with LLMs through composability" optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "langchain_core-0.3.58-py3-none-any.whl", hash = "sha256:266f90d2a079fe9510190ad3be88bd993baad43e6cee0f822a883767a4bfdd5b"}, - {file = "langchain_core-0.3.58.tar.gz", hash = "sha256:6ee2282b02fa65bf4ee1afa869d431505536757ff2f1f9f0b432d8ca755d66c6"}, + {file = "langchain_core-0.3.63-py3-none-any.whl", hash = "sha256:f91db8221b1bc6808f70b2e72fded1a94d50ee3f1dff1636fb5a5a514c64b7f5"}, + {file = "langchain_core-0.3.63.tar.gz", hash = "sha256:e2e30cfbb7684a5a0319f6cbf065fc3c438bfd1060302f085a122527890fb01e"}, ] [package.dependencies] jsonpatch = ">=1.33,<2.0" -langsmith = ">=0.1.125,<0.4" +langsmith = ">=0.1.126,<0.4" packaging = ">=23.2,<25" -pydantic = {version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""} +pydantic = ">=2.7.4" PyYAML = ">=5.3" tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<10.0.0" typing-extensions = ">=4.7" @@ -4877,4 +4877,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.1" python-versions = "^3.13" -content-hash = "9dd34ca058d74aea96a5ebfc2d712ec2a36521b310858dcb5e5569bb2dd16333" +content-hash = "a25945d5914b2ad6c32bcd50f8b787c00e41df7e09fdb3c991f48cb9e9c15c72" diff --git a/extractor-api-lib/pyproject.toml b/extractor-api-lib/pyproject.toml index 4d6ac63..a648858 100644 --- a/extractor-api-lib/pyproject.toml +++ b/extractor-api-lib/pyproject.toml @@ -92,7 +92,7 @@ html5lib = "^1.1" langchain-community = "^0.3.23" atlassian-python-api = "^4.0.3" markdownify = "^1.1.0" -langchain-core = "^0.3.58" +langchain-core = "0.3.63" [tool.poetry.group.dev.dependencies] pytest = "^8.3.5" diff --git 
a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py index acb6022..800c214 100644 --- a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py +++ b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py @@ -29,7 +29,7 @@ async def extract_from_file_post( self, extraction_request: ExtractionRequest, ) -> List[InformationPiece]: - """ + """ Extract information from a file based on the provided extraction request. Parameters diff --git a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py index 8e08ad6..10d8cd5 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py @@ -51,7 +51,9 @@ async def aextract_information( list[InformationPiece] A list of extracted information pieces. """ - correct_extractors = [x for x in self._available_extractors if extraction_parameters.source_type == x.extractor_type] + correct_extractors = [ + x for x in self._available_extractors if extraction_parameters.source_type == x.extractor_type + ] if not correct_extractors: raise ValueError(f"No extractor found for type {extraction_parameters.source_type}") results = await correct_extractors[-1].aextract_content(extraction_parameters) diff --git a/rag-core-lib/poetry.lock b/rag-core-lib/poetry.lock index 90b3fb9..4487b8e 100644 --- a/rag-core-lib/poetry.lock +++ b/rag-core-lib/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand. 
[[package]] name = "aiohappyeyeballs" @@ -1623,21 +1623,21 @@ tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<10" [[package]] name = "langchain-core" -version = "0.3.58" +version = "0.3.63" description = "Building applications with LLMs through composability" optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "langchain_core-0.3.58-py3-none-any.whl", hash = "sha256:266f90d2a079fe9510190ad3be88bd993baad43e6cee0f822a883767a4bfdd5b"}, - {file = "langchain_core-0.3.58.tar.gz", hash = "sha256:6ee2282b02fa65bf4ee1afa869d431505536757ff2f1f9f0b432d8ca755d66c6"}, + {file = "langchain_core-0.3.63-py3-none-any.whl", hash = "sha256:f91db8221b1bc6808f70b2e72fded1a94d50ee3f1dff1636fb5a5a514c64b7f5"}, + {file = "langchain_core-0.3.63.tar.gz", hash = "sha256:e2e30cfbb7684a5a0319f6cbf065fc3c438bfd1060302f085a122527890fb01e"}, ] [package.dependencies] jsonpatch = ">=1.33,<2.0" -langsmith = ">=0.1.125,<0.4" +langsmith = ">=0.1.126,<0.4" packaging = ">=23.2,<25" -pydantic = {version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""} +pydantic = ">=2.7.4" PyYAML = ">=5.3" tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<10.0.0" typing-extensions = ">=4.7" @@ -3384,4 +3384,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.1" python-versions = "^3.13" -content-hash = "2aa5df2f5304dfb56d7adfeeb4f8817ecf9d7eaaadc5af9127875a5aa442c7d0" +content-hash = "265d9eb8b910f4831f5e5e7e78a0e9b3b010793fed03d30a96393a2f8c1792db" diff --git a/rag-core-lib/pyproject.toml b/rag-core-lib/pyproject.toml index c63b316..2ca85e3 100644 --- a/rag-core-lib/pyproject.toml +++ b/rag-core-lib/pyproject.toml @@ -21,6 +21,7 @@ requests-oauthlib = "^2.0.0" langfuse = "^2.60.4" deprecated = "^1.2.18" openai = "^1.77.0" +langchain-core = "0.3.63" [tool.poetry.group.dev.dependencies] From 7f1df26ed0db4b8f0fe439b22be86a984bb5853f Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Mon, 2 Jun 2025 07:40:18 +0200 Subject: [PATCH 30/43] refactor: add TODO for implementing timeout in thread 
handling for file uploads --- .../admin_api_lib/impl/api_endpoints/default_file_uploader.py | 2 +- admin-api-lib/tests/default_file_uploader_test.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py index b558f11..2c6f868 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py @@ -105,7 +105,7 @@ async def upload_file( s3_path = await self._asave_new_document(content, file.filename, source_name) thread = Thread( target=lambda: run(self._handle_source_upload(s3_path, source_name, file.filename, base_url)) - ) #TODO: add timeout. same logic like in default_source_uploader leaded to strange behavior + ) # TODO: add timeout. same logic like in default_source_uploader leaded to strange behavior thread.start() self._background_threads.append(thread) except ValueError as e: diff --git a/admin-api-lib/tests/default_file_uploader_test.py b/admin-api-lib/tests/default_file_uploader_test.py index 19318e9..e76b9b5 100644 --- a/admin-api-lib/tests/default_file_uploader_test.py +++ b/admin-api-lib/tests/default_file_uploader_test.py @@ -135,4 +135,3 @@ async def test_upload_file_starts_thread(mocks, monkeypatch): key_value_store.upsert.assert_any_call(source_name, Status.PROCESSING) dummy_thread.start.assert_called_once() - From 8a6d4f16f4b624ac3d816e74470aef4a25791cec Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Mon, 2 Jun 2025 07:40:46 +0200 Subject: [PATCH 31/43] refactor: remove unused asyncio import from default_file_uploader_test.py --- admin-api-lib/tests/default_file_uploader_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/admin-api-lib/tests/default_file_uploader_test.py b/admin-api-lib/tests/default_file_uploader_test.py index e76b9b5..079a935 100644 --- 
a/admin-api-lib/tests/default_file_uploader_test.py +++ b/admin-api-lib/tests/default_file_uploader_test.py @@ -1,4 +1,3 @@ -import asyncio import pytest from unittest.mock import AsyncMock, MagicMock from fastapi import HTTPException From 5af5c76717d4b9bbdad9939349798b0b47f22b2e Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Mon, 2 Jun 2025 07:44:34 +0200 Subject: [PATCH 32/43] refactor: remove unused thread management documentation and example files --- admin-api-lib/docs/thread_management.md | 0 admin-api-lib/examples/thread_management_example.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 admin-api-lib/docs/thread_management.md delete mode 100644 admin-api-lib/examples/thread_management_example.py diff --git a/admin-api-lib/docs/thread_management.md b/admin-api-lib/docs/thread_management.md deleted file mode 100644 index e69de29..0000000 diff --git a/admin-api-lib/examples/thread_management_example.py b/admin-api-lib/examples/thread_management_example.py deleted file mode 100644 index e69de29..0000000 From fa2f9282e3403e1de5a5729056765a0c598a71a5 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Mon, 2 Jun 2025 08:12:17 +0200 Subject: [PATCH 33/43] chore: update poetry.lock and pyproject.toml for dependency version changes and configuration adjustments --- admin-api-lib/tests/test_default_source_uploader.py | 0 rag-core-api/poetry.lock | 13 +++++++------ rag-core-api/pyproject.toml | 6 +++--- 3 files changed, 10 insertions(+), 9 deletions(-) delete mode 100644 admin-api-lib/tests/test_default_source_uploader.py diff --git a/admin-api-lib/tests/test_default_source_uploader.py b/admin-api-lib/tests/test_default_source_uploader.py deleted file mode 100644 index e69de29..0000000 diff --git a/rag-core-api/poetry.lock b/rag-core-api/poetry.lock index e5ea53f..9812609 100644 --- a/rag-core-api/poetry.lock +++ b/rag-core-api/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.3 and should not be changed by 
hand. +# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand. [[package]] name = "aiohappyeyeballs" @@ -1959,21 +1959,21 @@ tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<10" [[package]] name = "langchain-core" -version = "0.3.58" +version = "0.3.63" description = "Building applications with LLMs through composability" optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "langchain_core-0.3.58-py3-none-any.whl", hash = "sha256:266f90d2a079fe9510190ad3be88bd993baad43e6cee0f822a883767a4bfdd5b"}, - {file = "langchain_core-0.3.58.tar.gz", hash = "sha256:6ee2282b02fa65bf4ee1afa869d431505536757ff2f1f9f0b432d8ca755d66c6"}, + {file = "langchain_core-0.3.63-py3-none-any.whl", hash = "sha256:f91db8221b1bc6808f70b2e72fded1a94d50ee3f1dff1636fb5a5a514c64b7f5"}, + {file = "langchain_core-0.3.63.tar.gz", hash = "sha256:e2e30cfbb7684a5a0319f6cbf065fc3c438bfd1060302f085a122527890fb01e"}, ] [package.dependencies] jsonpatch = ">=1.33,<2.0" -langsmith = ">=0.1.125,<0.4" +langsmith = ">=0.1.126,<0.4" packaging = ">=23.2,<25" -pydantic = {version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""} +pydantic = ">=2.7.4" PyYAML = ">=5.3" tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<10.0.0" typing-extensions = ">=4.7" @@ -3843,6 +3843,7 @@ deprecated = "^1.2.18" flashrank = "^0.2.10" langchain = "^0.3.25" langchain-community = "0.3.23" +langchain-core = "0.3.63" langfuse = "^2.60.4" oauthlib = "^3.2.2" openai = "^1.77.0" diff --git a/rag-core-api/pyproject.toml b/rag-core-api/pyproject.toml index 4fd633c..2194a90 100644 --- a/rag-core-api/pyproject.toml +++ b/rag-core-api/pyproject.toml @@ -118,8 +118,8 @@ known_local_folder = ["rag_core_api", "rag_core_lib"] max-line-length = 120 [tool.pytest.ini_options] -log_cli = 1 +log_cli = true log_cli_level = "DEBUG" -pythonpath = "src" -testpaths = "src/tests" +pythonpath = ["src", "tests"] +testpaths = "tests" From 8dc79900949b7997ecba92442052712a60c3dd4f Mon Sep 17 00:00:00 2001 
From: Andreas Klos Date: Mon, 2 Jun 2025 08:16:19 +0200 Subject: [PATCH 34/43] chore: Update README.md Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7becbcd..3d3edf6 100644 --- a/README.md +++ b/README.md @@ -142,7 +142,7 @@ The extracted information will be summarized using a LLM. The summary, as well a #### `/upload_source` -Loads all the content from an abritrary non-file source using the [document-extractor](#3-extractor-api-lib). +Loads all the content from an arbitrary non-file source using the [document-extractor](#3-extractor-api-lib). The `type`of the source needs to correspond to an extractor in the [document-extractor](#3-extractor-api-lib). The extracted information will be summarized using LLM. The summary, as well as the unrefined extracted document, will be uploaded to the [rag-core-api](#1-rag-core-api). From 57788eb4829997f05e74ce447cf908619aa142a4 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Mon, 2 Jun 2025 08:21:52 +0200 Subject: [PATCH 35/43] fix: correct spelling of 'arbitrary' in README and update query parameter alias in upload_source function --- README.md | 2 +- admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py | 3 --- admin-api-lib/src/admin_api_lib/apis/admin_api.py | 2 +- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 7becbcd..3d3edf6 100644 --- a/README.md +++ b/README.md @@ -142,7 +142,7 @@ The extracted information will be summarized using a LLM. The summary, as well a #### `/upload_source` -Loads all the content from an abritrary non-file source using the [document-extractor](#3-extractor-api-lib). +Loads all the content from an arbitrary non-file source using the [document-extractor](#3-extractor-api-lib). The `type`of the source needs to correspond to an extractor in the [document-extractor](#3-extractor-api-lib). 
The extracted information will be summarized using LLM. The summary, as well as the unrefined extracted document, will be uploaded to the [rag-core-api](#1-rag-core-api). diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py b/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py index f45636e..2260bd4 100644 --- a/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py +++ b/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py @@ -15,7 +15,6 @@ async def upload_file( self, base_url: str, file: UploadFile, - timeout: Optional[float], ) -> None: """ Uploads a source file for content extraction. @@ -26,8 +25,6 @@ async def upload_file( The base url of the service. Is used to determine the download link of the file. file : UploadFile The file to process. - timeout : float, optional - Timeout for the operation. Returns ------- diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api.py b/admin-api-lib/src/admin_api_lib/apis/admin_api.py index a323bd6..d67a246 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api.py @@ -176,7 +176,7 @@ async def upload_file( response_model_by_alias=True, ) async def upload_source( - source_type: StrictStr = Query(None, description="", alias="type"), + source_type: StrictStr = Query(None, description="", alias="sourceType"), name: StrictStr = Query(None, description="", alias="name"), key_value_pair: List[KeyValuePair] = Body(None, description=""), ) -> None: From d942cf71a641838262630198a8364201574e5fba Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Mon, 2 Jun 2025 08:24:25 +0200 Subject: [PATCH 36/43] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3d3edf6..f246276 100644 --- a/README.md +++ b/README.md @@ -210,7 +210,7 @@ The type of information that is extracted will vary depending on the source, the - `TEXT`: plain text - `TABLE`: data in 
tabular form found in the document -- `IMAGE`: data in tabular form found in the document +- `IMAGE`: image found in the document ### 3.3 Replaceable parts From 21d10d9246a6cde6c40e43ca29b1a391b7c7de51 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Mon, 2 Jun 2025 08:27:45 +0200 Subject: [PATCH 37/43] refactor: remove unused import and enhance query parameter descriptions in upload_source function --- .../src/admin_api_lib/api_endpoints/file_uploader.py | 1 - admin-api-lib/src/admin_api_lib/apis/admin_api.py | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py b/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py index 2260bd4..3ab7464 100644 --- a/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py +++ b/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py @@ -1,7 +1,6 @@ """Module for the upload file endpoint.""" from abc import abstractmethod -from typing import Optional from fastapi import UploadFile diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api.py b/admin-api-lib/src/admin_api_lib/apis/admin_api.py index d67a246..a8979af 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api.py @@ -176,9 +176,9 @@ async def upload_file( response_model_by_alias=True, ) async def upload_source( - source_type: StrictStr = Query(None, description="", alias="sourceType"), - name: StrictStr = Query(None, description="", alias="name"), - key_value_pair: List[KeyValuePair] = Body(None, description=""), + source_type: StrictStr = Query(None, description="The type of the source", alias="sourceType"), + name: StrictStr = Query(None, description="The name of the source", alias="name"), + key_value_pair: List[KeyValuePair] = Body(None, description="The key-value pairs for the source"), ) -> None: """ Uploads user selected sources. 
From bc503f24917f0c95fe36ac3f428488544b3f0269 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Mon, 2 Jun 2025 08:31:29 +0200 Subject: [PATCH 38/43] refactor: remove timeout parameter from DefaultFileUploader and delete unused managed_page_summary_enhancer module --- .../admin_api_lib/impl/api_endpoints/default_file_uploader.py | 1 - .../impl/information_enhancer/managed_page_summary_enhancer.py | 0 2 files changed, 1 deletion(-) delete mode 100644 admin-api-lib/src/admin_api_lib/impl/information_enhancer/managed_page_summary_enhancer.py diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py index 2c6f868..5217501 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py @@ -78,7 +78,6 @@ async def upload_file( self, base_url: str, file: UploadFile, - timeout: Optional[float] = 3600.0, ) -> None: """ Uploads a source file for content extraction. 
diff --git a/admin-api-lib/src/admin_api_lib/impl/information_enhancer/managed_page_summary_enhancer.py b/admin-api-lib/src/admin_api_lib/impl/information_enhancer/managed_page_summary_enhancer.py deleted file mode 100644 index e69de29..0000000 From a5523fbb0ecfadc9baa362ff80007a6c57c005b0 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Mon, 2 Jun 2025 08:33:07 +0200 Subject: [PATCH 39/43] refactor: remove unused thread_diagnostics.py file --- admin-api-lib/src/admin_api_lib/utils/thread_diagnostics.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 admin-api-lib/src/admin_api_lib/utils/thread_diagnostics.py diff --git a/admin-api-lib/src/admin_api_lib/utils/thread_diagnostics.py b/admin-api-lib/src/admin_api_lib/utils/thread_diagnostics.py deleted file mode 100644 index e69de29..0000000 From 5dafd3e20bb25464b203d967cbbd39da1a650e28 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Mon, 2 Jun 2025 10:03:37 +0200 Subject: [PATCH 40/43] feat: add SourceUploaderSettings for configurable timeout and refactor DefaultSourceUploader to use it refactor: update JSON serialization in ExtractionParameters, ExtractionRequest, InformationPiece, and KeyValuePair models refactor: remove unused test files for confluence and thread management integration --- .../api_endpoints/default_source_uploader.py | 6 ++-- .../impl/settings/source_uploader_settings.py | 23 ++++++++++++++ .../tests/test_confluence_integration.py | 0 admin-api-lib/tests/test_thread_management.py | 0 .../impl/extractor_api_impl.py | 30 +++++++++++++++++++ .../models/extraction_parameters.py | 3 +- .../models/extraction_request.py | 3 +- .../models/information_piece.py | 3 +- .../models/key_value_pair.py | 3 +- 9 files changed, 61 insertions(+), 10 deletions(-) create mode 100644 admin-api-lib/src/admin_api_lib/impl/settings/source_uploader_settings.py delete mode 100644 admin-api-lib/tests/test_confluence_integration.py delete mode 100644 admin-api-lib/tests/test_thread_management.py diff 
--git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py index 71b09de..bc891b7 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py @@ -9,6 +9,7 @@ from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi from admin_api_lib.extractor_api_client.openapi_client.models.extraction_parameters import ExtractionParameters +from admin_api_lib.impl.settings.source_uploader_settings import SourceUploaderSettings from admin_api_lib.models.key_value_pair import KeyValuePair from admin_api_lib.rag_backend_client.openapi_client.api.rag_api import RagApi from admin_api_lib.impl.mapper.informationpiece2document import InformationPiece2Document @@ -37,6 +38,7 @@ def __init__( document_deleter: DocumentDeleter, rag_api: RagApi, information_mapper: InformationPiece2Document, + settings: SourceUploaderSettings, ): """ Initialize the DefaultSourceUploader. @@ -67,13 +69,13 @@ def __init__( self._chunker = chunker self._document_deleter = document_deleter self._background_threads = [] + self._settings = settings async def upload_source( self, source_type: StrictStr, name: StrictStr, kwargs: list[KeyValuePair], - timeout: float = 3600.0, ) -> None: """ Uploads the parameters for source content extraction. 
@@ -101,7 +103,7 @@ async def upload_source( self._check_if_already_in_processing(source_name) self._key_value_store.upsert(source_name, Status.PROCESSING) - thread = Thread(target=self._thread_worker, args=(source_name, source_type, kwargs, timeout)) + thread = Thread(target=self._thread_worker, args=(source_name, source_type, kwargs, self._settings.timeout)) thread.start() self._background_threads.append(thread) except ValueError as e: diff --git a/admin-api-lib/src/admin_api_lib/impl/settings/source_uploader_settings.py b/admin-api-lib/src/admin_api_lib/impl/settings/source_uploader_settings.py new file mode 100644 index 0000000..70f18bd --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/impl/settings/source_uploader_settings.py @@ -0,0 +1,23 @@ +"""Contains settings regarding the SourceUploader.""" + +from pydantic import Field +from pydantic_settings import BaseSettings + + +class SourceUploaderSettings(BaseSettings): + """ + Contains settings regarding the SourceUploader. + + Attributes + ---------- + timeout : float + The timeout for the SourceUploader. 
+ """ + + class Config: + """Config class for reading Fields from env.""" + + env_prefix = "SOURCE_UPLOADER_" + case_sensitive = False + + timeout: float = Field(default=3600.0, description="Timeout for the SourceUploader in seconds.") diff --git a/admin-api-lib/tests/test_confluence_integration.py b/admin-api-lib/tests/test_confluence_integration.py deleted file mode 100644 index e69de29..0000000 diff --git a/admin-api-lib/tests/test_thread_management.py b/admin-api-lib/tests/test_thread_management.py deleted file mode 100644 index e69de29..0000000 diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py b/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py index 276f720..b1aa8c1 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py @@ -21,6 +21,21 @@ async def extract_from_file_post( extraction_request: ExtractionRequest, extractor: FileExtractor = Depends(Provide[DependencyContainer.general_file_extractor]), ) -> list[InformationPiece]: + """ + Extract information from a file based on the provided extraction request. + + Parameters + ---------- + extraction_request : ExtractionRequest + The request containing details about the extraction process. + extractor : FileExtractor, optional + The file extractor dependency. + + Returns + ------- + list[InformationPiece] + A list of extracted information pieces. + """ return await extractor.aextract_information(extraction_request) async def extract_from_source( @@ -28,4 +43,19 @@ async def extract_from_source( extraction_parameters: ExtractionParameters, extractor: SourceExtractor = Depends(Provide[DependencyContainer.source_extractor]), ) -> list[InformationPiece]: + """ + Extract information from a source (e.g. confluence) asynchronously. + + Parameters + ---------- + extraction_parameters : ExtractionParameters + Parameters required to extract information from source. 
+ extractor : SourceExtractor, optional + The source extractor instance. + + Returns + ------- + list[InformationPiece] + A list of extracted information pieces. + """ return await extractor.aextract_information(extraction_parameters) diff --git a/extractor-api-lib/src/extractor_api_lib/models/extraction_parameters.py b/extractor-api-lib/src/extractor_api_lib/models/extraction_parameters.py index e18a452..e903b4e 100644 --- a/extractor-api-lib/src/extractor_api_lib/models/extraction_parameters.py +++ b/extractor-api-lib/src/extractor_api_lib/models/extraction_parameters.py @@ -50,8 +50,7 @@ def to_str(self) -> str: def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead - return json.dumps(self.to_dict()) + return self.model_dump_json(by_alias=True, exclude_unset=True) @classmethod def from_json(cls, json_str: str) -> Self: diff --git a/extractor-api-lib/src/extractor_api_lib/models/extraction_request.py b/extractor-api-lib/src/extractor_api_lib/models/extraction_request.py index 769b658..3befa42 100644 --- a/extractor-api-lib/src/extractor_api_lib/models/extraction_request.py +++ b/extractor-api-lib/src/extractor_api_lib/models/extraction_request.py @@ -46,8 +46,7 @@ def to_str(self) -> str: def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead - return json.dumps(self.to_dict()) + return self.model_dump_json(by_alias=True, exclude_unset=True) @classmethod def from_json(cls, json_str: str) -> Self: diff --git a/extractor-api-lib/src/extractor_api_lib/models/information_piece.py b/extractor-api-lib/src/extractor_api_lib/models/information_piece.py index 8890a13..3ffb308 100644 --- a/extractor-api-lib/src/extractor_api_lib/models/information_piece.py +++ b/extractor-api-lib/src/extractor_api_lib/models/information_piece.py @@ 
-51,8 +51,7 @@ def to_str(self) -> str: def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead - return json.dumps(self.to_dict()) + return self.model_dump_json(by_alias=True, exclude_unset=True) @classmethod def from_json(cls, json_str: str) -> Self: diff --git a/extractor-api-lib/src/extractor_api_lib/models/key_value_pair.py b/extractor-api-lib/src/extractor_api_lib/models/key_value_pair.py index f751313..3cba505 100644 --- a/extractor-api-lib/src/extractor_api_lib/models/key_value_pair.py +++ b/extractor-api-lib/src/extractor_api_lib/models/key_value_pair.py @@ -46,8 +46,7 @@ def to_str(self) -> str: def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead - return json.dumps(self.to_dict()) + return self.model_dump_json(by_alias=True, exclude_unset=True) @classmethod def from_json(cls, json_str: str) -> Self: From 4ab029b00dc180a43b6cb4f27a6ac4ea447b07b8 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Mon, 2 Jun 2025 10:43:15 +0200 Subject: [PATCH 41/43] refactor: remove unused import of Optional in default_file_uploader.py --- .../admin_api_lib/impl/api_endpoints/default_file_uploader.py | 1 - 1 file changed, 1 deletion(-) diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py index 5217501..fa4a27a 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py @@ -2,7 +2,6 @@ from pathlib import Path import traceback from threading import Thread -from typing import Optional import urllib import tempfile from contextlib import suppress From 7f53875890aa06353b99933aca317bb553dd821b Mon Sep 17 00:00:00 2001 
From: Andreas Klos Date: Mon, 2 Jun 2025 11:04:46 +0200 Subject: [PATCH 42/43] feat: add SourceUploaderSettings to DependencyContainer and update upload_source function --- admin-api-lib/src/admin_api_lib/apis/admin_api.py | 2 +- admin-api-lib/src/admin_api_lib/dependency_container.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api.py b/admin-api-lib/src/admin_api_lib/apis/admin_api.py index a8979af..c348b5d 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api.py @@ -176,7 +176,7 @@ async def upload_file( response_model_by_alias=True, ) async def upload_source( - source_type: StrictStr = Query(None, description="The type of the source", alias="sourceType"), + source_type: StrictStr = Query(None, description="The type of the source"), name: StrictStr = Query(None, description="The name of the source", alias="name"), key_value_pair: List[KeyValuePair] = Body(None, description="The key-value pairs for the source"), ) -> None: diff --git a/admin-api-lib/src/admin_api_lib/dependency_container.py b/admin-api-lib/src/admin_api_lib/dependency_container.py index 640ea72..fd5e0a1 100644 --- a/admin-api-lib/src/admin_api_lib/dependency_container.py +++ b/admin-api-lib/src/admin_api_lib/dependency_container.py @@ -49,6 +49,7 @@ from admin_api_lib.impl.settings.key_value_settings import KeyValueSettings from admin_api_lib.impl.settings.rag_api_settings import RAGAPISettings from admin_api_lib.impl.settings.s3_settings import S3Settings +from admin_api_lib.impl.settings.source_uploader_settings import SourceUploaderSettings from admin_api_lib.impl.settings.summarizer_settings import SummarizerSettings from admin_api_lib.impl.summarizer.langchain_summarizer import LangchainSummarizer from admin_api_lib.prompt_templates.summarize_prompt import SUMMARIZE_PROMPT @@ -85,6 +86,7 @@ class DependencyContainer(DeclarativeContainer): rag_api_settings = 
RAGAPISettings() key_value_store_settings = KeyValueSettings() summarizer_settings = SummarizerSettings() + source_uploader_settings = SourceUploaderSettings() key_value_store = Singleton(FileStatusKeyValueStore, key_value_store_settings) file_service = Singleton(S3Service, s3_settings=s3_settings) @@ -167,6 +169,7 @@ class DependencyContainer(DeclarativeContainer): chunker=chunker, key_value_store=key_value_store, document_deleter=document_deleter, + settings=source_uploader_settings, ) file_uploader = Singleton( From 97bdb25ee46f86682c9536e721d7265a63d8e9f0 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Mon, 2 Jun 2025 13:46:11 +0200 Subject: [PATCH 43/43] docs: update README to clarify upload behavior and default timeout configuration --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f246276..38a9349 100644 --- a/README.md +++ b/README.md @@ -144,7 +144,7 @@ The extracted information will be summarized using a LLM. The summary, as well a Loads all the content from an arbitrary non-file source using the [document-extractor](#3-extractor-api-lib). The `type`of the source needs to correspond to an extractor in the [document-extractor](#3-extractor-api-lib). -The extracted information will be summarized using LLM. The summary, as well as the unrefined extracted document, will be uploaded to the [rag-core-api](#1-rag-core-api). +The extracted information will be summarized using an LLM. The summary, as well as the unrefined extracted document, will be uploaded to the [rag-core-api](#1-rag-core-api). A timeout for the extraction is configured. Defaults to 3600 seconds (1 hour). Can be adjusted by values in the helm chart. ### 2.3 Replaceable parts