diff --git a/README.md b/README.md
index 13d9231..38a9349 100644
--- a/README.md
+++ b/README.md
@@ -105,8 +105,8 @@ The following endpoints are provided by the *admin-api-lib*:
 - `/delete_document/{identification}`: Deletes the file from storage (if applicable) and vector database. The `identification` can be retrieved from the `/all_documents_status` endpoint.
 - `/document_reference/{identification}`: Returns the document.
 - `/all_documents_status`: Return the `identification` and status of all available sources.
-- `/upload_documents`: Endpoint to upload files.
-- `/load_confluence`: Endpoint to load a confluence space
+- `/upload_file`: Endpoint to upload files.
+- `/upload_source`: Endpoint to upload non-file sources.

 ### 2.1 Requirements

@@ -135,15 +135,16 @@ Will return the source document stored in the connected storage system.

 Will return a list of all sources for the chat and their current status.

-#### `/upload_documents`
+#### `/upload_file`

 Files can be uploaded here. This endpoint will process the document in a background and will extract information using the [document-extractor](#3-extractor-api-lib).
 The extracted information will be summarized using a LLM. The summary, as well as the unrefined extracted document, will be uploaded to the [rag-core-api](#1-rag-core-api).

-#### `/load_confluence`
+#### `/upload_source`

-Loads all the content of a confluence space using the [document-extractor](#3-extractor-api-lib).
-The extracted information will be summarized using LLM. The summary, as well as the unrefined extracted document, will be uploaded to the [rag-core-api](#1-rag-core-api).
+Loads all the content from an arbitrary non-file source using the [document-extractor](#3-extractor-api-lib).
+The `type` of the source needs to correspond to an extractor in the [document-extractor](#3-extractor-api-lib).
+The extracted information will be summarized using an LLM. The summary, as well as the unrefined extracted document, will be uploaded to the [rag-core-api](#1-rag-core-api). A timeout for the extraction is configured; it defaults to 3600 seconds (1 hour) and can be adjusted via values in the Helm chart.

 ### 2.3 Replaceable parts

@@ -162,9 +163,9 @@ The extracted information will be summarized using LLM. The summary, as well as
 | information_enhancer | [`rag_core_lib.chains.async_chain.AsyncChain[Any, Any]`](./rag-core-lib/src/rag_core_lib/chains/async_chain.py)| [`rag_core_lib.impl.tracers.langfuse_traced_chain.LangfuseTracedGraph`](./rag-core-lib/src/rag_core_lib/impl/tracers/langfuse_traced_chain.py) |Wraps around the *untraced_information_enhancer* and adds langfuse tracing. |
 | document_deleter |[`admin_api_lib.api_endpoints.document_deleter.DocumentDeleter`](./admin-api-lib/src/admin_api_lib/api_endpoints/document_deleter.py) | [`admin_api_lib.impl.api_endpoints.default_document_deleter.DefaultDocumentDeleter`](./admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_document_deleter.py) | Handles deletion of sources. |
 | documents_status_retriever | [`admin_api_lib.api_endpoints.documents_status_retriever.DocumentsStatusRetriever`](./admin-api-lib/src/admin_api_lib/api_endpoints/documents_status_retriever.py) | [`admin_api_lib.impl.api_endpoints.default_documents_status_retriever.DefaultDocumentsStatusRetriever`](./admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_documents_status_retriever.py) |Handles return of source status. |
-| confluence_loader | [`admin_api_lib.api_endpoints.confluence_loader.ConfluenceLoader`](./admin-api-lib/src/admin_api_lib/api_endpoints/confluence_loader.py) | [`admin_api_lib.impl.api_endpoints.default_confluence_loader.DefaultConfluenceLoader`](./admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_confluence_loader.py)| Handles data loading and extraction from confluence. |
+| source_uploader | [`admin_api_lib.api_endpoints.source_uploader.SourceUploader`](./admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py) | [`admin_api_lib.impl.api_endpoints.default_source_uploader.DefaultSourceUploader`](./admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py)| Handles data loading and extraction from various non-file sources. |
 | document_reference_retriever | [`admin_api_lib.api_endpoints.document_reference_retriever.DocumentReferenceRetriever`](./admin-api-lib/src/admin_api_lib/api_endpoints/document_reference_retriever.py) | [`admin_api_lib.impl.api_endpoints.default_document_reference_retriever.DefaultDocumentReferenceRetriever`](./admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_document_reference_retriever.py) | Handles return of files from connected storage. |
-| document_uploader | [`admin_api_lib.api_endpoints.document_uploader.DocumentUploader`](./admin-api-lib/src/admin_api_lib/api_endpoints/document_uploader.py) | [`admin_api_lib.impl.api_endpoints.default_document_uploader.DefaultDocumentUploader`](./admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_document_uploader.py) | Handles upload and extraction of files. |
+| file_uploader | [`admin_api_lib.api_endpoints.file_uploader.FileUploader`](./admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py) | [`admin_api_lib.impl.api_endpoints.default_file_uploader.DefaultFileUploader`](./admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py) | Handles upload and extraction of files. |

 ## 3. Extractor API Lib

@@ -175,7 +176,7 @@ This API should not be exposed by ingress and only used for internally.
 The following endpoints are provided by the *extractor-api-lib*:

 - `/extract_from_file`: This endpoint extracts the information from files.
-- `/extract_from_confluence`: This endpoint extracts the information from a confluence space.
+- `/extract_from_source`: This endpoint extracts the information from a non-file source.

 ### 3.1 Requirements

@@ -202,12 +203,14 @@ The following types of information will be extracted:
 - `TEXT`: plain text
 - `TABLE`: data in tabular form found in the document

-#### `/extract_from_confluence`
+#### `/extract_from_source`

-The extract from confluence endpoint will extract the information from a confluence space.
-The following types of information will be extracted:
+This endpoint will extract data from a non-file source.
+The type of information that is extracted varies depending on the source; the following types of information can be extracted:

 - `TEXT`: plain text
+- `TABLE`: data in tabular form found in the document
+- `IMAGE`: image found in the document

 ### 3.3 Replaceable parts

@@ -221,7 +224,8 @@ The following types of information will be extracted:
 | all_extractors | `dependency_injector.providers.List[extractor_api_lib.document_parser.information_extractor.InformationExtractor]` | `dependency_injector.providers.List(pdf_extractor, ms_docs_extractor, xml_extractor)` | List of all available extractors. If you add a new type of extractor you would have to add it to this list. |
 | general_extractor | [`extractor_api_lib.document_parser.information_extractor.InformationExtractor`](./extractor-api-lib/src/extractor_api_lib/document_parser/information_extractor.py) |[`extractor_api_lib.document_parser.general_extractor.GeneralExtractor`](./extractor-api-lib/src/extractor_api_lib/document_parser/general_extractor.py) | Combines multiple extractors and decides which one to use for the given file format. |
 | file_extractor | [`extractor_api_lib.api_endpoints.file_extractor.FileExtractor`](./extractor-api-lib/src/extractor_api_lib/api_endpoints/file_extractor.py) | [`extractor_api_lib.impl.api_endpoints.default_file_extractor.DefaultFileExtractor`](./extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/default_file_extractor.py) | Implementation of the `/extract_from_file` endpoint. Uses *general_extractor*. |
-| confluence_extractor | [`extractor_api_lib.api_endpoints.confluence_extractor.ConfluenceExtractor`](./extractor-api-lib/src/extractor_api_lib/api_endpoints/confluence_extractor.py) | [`extractor_api_lib.impl.api_endpoints.default_confluence_extractor.DefaultConfluenceExtractor`](./extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/default_confluence_extractor.py) | Implementation of the `/extract_from_confluence` endpoint. |
+| general_source_extractor | [`extractor_api_lib.api_endpoints.source_extractor.SourceExtractor`](./extractor-api-lib/src/extractor_api_lib/api_endpoints/source_extractor.py) | [`extractor_api_lib.impl.api_endpoints.general_source_extractor.GeneralSourceExtractor`](./extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py) | Implementation of the `/extract_from_source` endpoint. Will decide the correct extractor for the source. |
+| confluence_extractor | [`extractor_api_lib.extractors.information_extractor.InformationExtractor`](./extractor-api-lib/src/extractor_api_lib/extractors/information_extractor.py) | [`extractor_api_lib.impl.extractors.confluence_extractor.ConfluenceExtractor`](./extractor-api-lib/src/extractor_api_lib/extractors/confluence_extractor.py) | Implementation of an extractor for the source `confluence`. |

 ## 4. RAG Core Lib

diff --git a/admin-api-lib/openapi.yaml b/admin-api-lib/openapi.yaml
index c1b8afe..986f445 100644
--- a/admin-api-lib/openapi.yaml
+++ b/admin-api-lib/openapi.yaml
@@ -1,150 +1,378 @@
-openapi: 3.0.2
+openapi: 3.1.0
 info:
-  description: The API is used for the communication between the admin frontend and
-    the admin backend in the rag project.
-  title: admin-api-lib
-  version: 1.0.0
+  title: admin-api-lib
+  version: 1.0.0
+  description: >-
+    The API is used for the communication between the admin frontend and the admin backend in the
+    rag project.
 servers:
-- url: /
+  -
+    url: /api
 paths:
-  /document_reference/{identification}:
-    get:
-      operationId: document_reference_id_get
-      parameters:
-      - description: Identifier of the pdf document.
-        explode: false
-        in: path
-        name: identification
-        required: true
-        schema:
-          type: string
-        style: simple
-      responses:
-        "200":
-          content:
-            application/pdf:
-              schema:
-                format: binary
-                type: string
-          description: Returns the pdf in binary form.
-        "400":
-          content:
-            application/json:
-              schema:
-                type: string
-          description: Bad request
-        "404":
-          content:
-            application/json:
-              schema:
-                type: string
-          description: Document not found.
- "500": - content: - application/json: - schema: - type: string - description: Internal server error - tags: - - admin - /upload_documents: - post: - description: Uploads user selected pdf documents. - operationId: upload_documents_post - requestBody: - content: - application/pdf: - schema: - format: binary - type: string - description: The PDF document to upload. - required: true - responses: - "200": - description: ok - "400": - description: Bad request - "422": - description: If no text has been extracted from the file. - "500": - description: Internal server error - tags: - - admin - /delete_document/{identification}: - delete: - operationId: delete_document - parameters: - - explode: false - in: path - name: identification - required: true - schema: - type: string - style: simple - responses: - "200": - description: Deleted - "500": - description: Internal server error - tags: - - admin - /all_documents_status: - get: - operationId: get_all_documents_status - responses: - "200": - content: - application/json: - schema: - items: - $ref: '#/components/schemas/document_status' - type: array - description: List of document links - "500": - description: Internal server error - tags: - - admin - /load_confluence: - post: - responses: - "200": - description: Loading from confluence is successful - "423": - description: "if the confluence loader is already processing a request,\ - \ no further requests are possible. The user needs to wait, till the preliminary\ - \ request finished processing." - "500": - description: Internal Server Error - "501": - description: The confluence loader is not set up - tags: - - admin - summary: Loading confluence to the vector db + '/delete_document/{identification}': + delete: + tags: + - admin + parameters: + - + style: simple + explode: false + name: identification + schema: + title: Identification + description: '' + type: string + in: path + required: true + responses: + '200': + content: + application/json: + schema: {} + description: Deleted + '422': + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + description: Validation Error + '500': + description: Internal server error + operationId: delete_document + summary: Delete Document + description: |- + Asynchronously deletes a document based on the provided identification. + + Parameters + ---------- + identification : str + The unique identifier of the document to be deleted. + + Returns + ------- + None + '/document_reference/{identification}': + get: + tags: + - admin + parameters: + - + style: simple + explode: false + name: identification + description: Identifier of the document. + schema: + title: Identification + description: Identifier of the document. + type: string + in: path + required: true + responses: + '200': + content: + application/json: + schema: + format: binary + title: Response 200 Document Reference Document Reference Identification Get + type: string + description: Returns the pdf in binary form. + '400': + content: + application/json: + schema: + title: Response 400 Document Reference Document Reference Identification Get + type: string + description: Bad request + '404': + content: + application/json: + schema: + title: Response 404 Document Reference Document Reference Identification Get + type: string + description: Document not found. 
+ '422': + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + description: Validation Error + '500': + content: + application/json: + schema: + title: Response 500 Document Reference Document Reference Identification Get + type: string + description: Internal server error + operationId: document_reference + summary: Document Reference Id Get + description: |- + Asynchronously retrieve a document reference by its identification. + + Parameters + ---------- + identification : str + The unique identifier for the document reference. + + Returns + ------- + Response + The response object containing the document reference details. + /all_documents_status: + get: + tags: + - admin + responses: + '200': + content: + application/json: + schema: + type: array + items: + $ref: '#/components/schemas/DocumentStatus' + description: List of document links + '500': + description: Internal server error + operationId: get_all_documents_status + summary: Get All Documents Status + description: |- + Asynchronously retrieves the status of all documents. + + Returns + ------- + list[DocumentStatus] + A list containing the status of all documents. + /upload_file: + post: + requestBody: + content: + multipart/form-data: + schema: + $ref: '#/components/schemas/Body_upload_file_upload_file_post' + required: true + tags: + - admin + responses: + '200': + content: + application/json: + schema: {} + description: ok + '400': + description: Bad request + '422': + description: Unprocessable Content + '500': + description: Internal server error + operationId: upload_file + summary: Upload File + description: Uploads user selected sources. + /upload_source: + post: + requestBody: + content: + application/json: + schema: + description: '' + type: array + items: + $ref: '#/components/schemas/KeyValuePair' + tags: + - admin + parameters: + - + style: form + explode: true + name: source_type + schema: + title: Type + description: '' + type: string + in: query + required: false + - + style: form + explode: true + name: name + schema: + title: Name + description: '' + type: string + in: query + required: false + responses: + '200': + content: + application/json: + schema: {} + description: ok + '400': + description: Bad request + '422': + description: Unprocessable Content + '500': + description: Internal server error + operationId: upload_source + summary: Upload Source + description: Uploads user selected sources. 
components: - schemas: - status: - description: "" - enum: - - UPLOADING - - PROCESSING - - READY - - ERROR - title: status - type: string - document_status: - description: "" - example: - name: name - status: UPLOADING - properties: - name: - description: "" - title: name - type: string - status: - $ref: '#/components/schemas/status' - required: - - name - - status - title: document_status - type: object + schemas: + Body_upload_file_upload_file_post: + title: Body_upload_file_upload_file_post + required: + - file + properties: + file: + format: binary + title: File + type: string + DocumentStatus: + title: DocumentStatus + description: DocumentStatus + required: + - name + - status + properties: + name: + title: Name + type: string + status: + $ref: '#/components/schemas/Status' + example: + name: name + status: UPLOADING + HTTPValidationError: + title: HTTPValidationError + description: HTTPValidationError + properties: + detail: + nullable: true + title: detail + type: array + items: + $ref: '#/components/schemas/ValidationError' + example: + detail: + - + msg: msg + loc: + - + anyof_schema_1_validator: anyof_schema_1_validator + actual_instance: '' + any_of_schemas: + - any_of_schemas + - any_of_schemas + anyof_schema_2_validator: 0 + - + anyof_schema_1_validator: anyof_schema_1_validator + actual_instance: '' + any_of_schemas: + - any_of_schemas + - any_of_schemas + anyof_schema_2_validator: 0 + type: type + - + msg: msg + loc: + - + anyof_schema_1_validator: anyof_schema_1_validator + actual_instance: '' + any_of_schemas: + - any_of_schemas + - any_of_schemas + anyof_schema_2_validator: 0 + - + anyof_schema_1_validator: anyof_schema_1_validator + actual_instance: '' + any_of_schemas: + - any_of_schemas + - any_of_schemas + anyof_schema_2_validator: 0 + type: type + KeyValuePair: + title: KeyValuePair + description: KeyValuePair + required: + - key + - value + properties: + key: + title: Key + type: string + value: + title: Value + type: string + example: + value: value + key: key + Status: + title: Status + description: allowed enum values + enum: + - UPLOADING + - PROCESSING + - READY + - ERROR + type: string + ValidationError: + title: ValidationError + description: ValidationError + required: + - loc + - msg + - type + properties: + loc: + title: loc + type: array + items: + $ref: '#/components/schemas/ValidationErrorLocInner' + msg: + title: Msg + type: string + type: + title: Type + type: string + example: + msg: msg + loc: + - + anyof_schema_1_validator: anyof_schema_1_validator + actual_instance: '' + any_of_schemas: + - any_of_schemas + - any_of_schemas + anyof_schema_2_validator: 0 + - + anyof_schema_1_validator: anyof_schema_1_validator + actual_instance: '' + any_of_schemas: + - any_of_schemas + - any_of_schemas + anyof_schema_2_validator: 0 + type: type + ValidationErrorLocInner: + title: ValidationErrorLocInner + description: ValidationErrorLocInner + properties: + anyof_schema_1_validator: + nullable: true + title: anyof_schema_1_validator + type: string + anyof_schema_2_validator: + nullable: true + title: anyof_schema_2_validator + type: integer + actual_instance: + title: actual_instance + any_of_schemas: + title: any_of_schemas + type: array + items: + type: string + example: + anyof_schema_1_validator: anyof_schema_1_validator + actual_instance: '' + any_of_schemas: + - any_of_schemas + - any_of_schemas + anyof_schema_2_validator: 0 diff --git a/admin-api-lib/poetry.lock b/admin-api-lib/poetry.lock index 671adcc..223c2a5 100644 --- a/admin-api-lib/poetry.lock +++ 
b/admin-api-lib/poetry.lock @@ -3693,4 +3693,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.1" python-versions = "^3.13" -content-hash = "99eff6a6ab91512602e8e3094b71bdba096ccf58746d47afd92dff99b24da487" +content-hash = "99eff6a6ab91512602e8e3094b71bdba096ccf58746d47afd92dff99b24da487" \ No newline at end of file diff --git a/admin-api-lib/pyproject.toml b/admin-api-lib/pyproject.toml index ec0de57..2668032 100644 --- a/admin-api-lib/pyproject.toml +++ b/admin-api-lib/pyproject.toml @@ -29,7 +29,7 @@ per-file-ignores = """ ./src/admin_api_lib/impl/admin_api.py: B008, ./src/admin_api_lib/dependency_container.py: CCE002,CCE001, ./src/admin_api_lib/apis/admin_api_base.py: WOT001, - ./tests/*: S101,S106,D100,D103,PT011 + ./tests/*: S101,S106,D100,D103,PT011,N802 ./src/admin_api_lib/impl/settings/confluence_settings.py: C901,N805, ./src/admin_api_lib/impl/utils/comma_separated_bool_list.py: R505, ./src/admin_api_lib/impl/utils/comma_separated_str_list.py: R505, diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/confluence_loader.py b/admin-api-lib/src/admin_api_lib/api_endpoints/confluence_loader.py deleted file mode 100644 index 06d79be..0000000 --- a/admin-api-lib/src/admin_api_lib/api_endpoints/confluence_loader.py +++ /dev/null @@ -1,19 +0,0 @@ -"""Module for ConfluenceLoader abstract base class.""" - -from abc import ABC, abstractmethod - - -class ConfluenceLoader(ABC): - """Abstract base class for the confluence loader endpoint.""" - - @abstractmethod - async def aload_from_confluence(self) -> None: - """ - Load data from Confluence asynchronously. - - This method should be implemented to load data asynchronously from Confluence. - - Returns - ------- - None - """ diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/document_deleter.py b/admin-api-lib/src/admin_api_lib/api_endpoints/document_deleter.py index 155baf0..3f222bc 100644 --- a/admin-api-lib/src/admin_api_lib/api_endpoints/document_deleter.py +++ b/admin-api-lib/src/admin_api_lib/api_endpoints/document_deleter.py @@ -7,7 +7,7 @@ class DocumentDeleter(ABC): """Abstract base class for document deletion endpoint.""" @abstractmethod - async def adelete_document(self, identification: str) -> None: + async def adelete_document(self, identification: str, remove_from_key_value_store: bool = True) -> None: """ Delete a document by its identification asynchronously. @@ -15,6 +15,8 @@ async def adelete_document(self, identification: str) -> None: ---------- identification : str The unique identifier of the document to be deleted. + remove_from_key_value_store : bool, optional + If True, the document will also be removed from the key-value store (default is True). Returns ------- diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/document_uploader.py b/admin-api-lib/src/admin_api_lib/api_endpoints/document_uploader.py deleted file mode 100644 index 9a3e70b..0000000 --- a/admin-api-lib/src/admin_api_lib/api_endpoints/document_uploader.py +++ /dev/null @@ -1,26 +0,0 @@ -"""Module for the DocumentUploader abstract base class.""" - -from abc import ABC, abstractmethod - -from fastapi import Request, UploadFile - - -class DocumentUploader(ABC): - """Abstract base class for document upload endpoint.""" - - @abstractmethod - async def aupload_documents_post(self, body: UploadFile, request: Request) -> None: - """ - Upload documents asynchronously, currently supported formats are: PDF, DOCX, XML, PPTX. - - Parameters - ---------- - body : UploadFile - The uploaded file. - request : Request - The request object. 
- - Returns - ------- - None - """ diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py b/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py new file mode 100644 index 0000000..3ab7464 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py @@ -0,0 +1,31 @@ +"""Module for the upload file endpoint.""" + +from abc import abstractmethod + +from fastapi import UploadFile + +from admin_api_lib.api_endpoints.uploader_base import UploaderBase + + +class FileUploader(UploaderBase): + + @abstractmethod + async def upload_file( + self, + base_url: str, + file: UploadFile, + ) -> None: + """ + Uploads a source file for content extraction. + + Parameters + ---------- + base_url : str + The base url of the service. Is used to determine the download link of the file. + file : UploadFile + The file to process. + + Returns + ------- + None + """ diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py b/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py new file mode 100644 index 0000000..5a1c50a --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py @@ -0,0 +1,40 @@ +"""Module for the upload source endpoint.""" + +from abc import abstractmethod +from typing import Optional + +from pydantic import StrictStr + +from admin_api_lib.api_endpoints.uploader_base import UploaderBase +from admin_api_lib.models.key_value_pair import KeyValuePair + + +class SourceUploader(UploaderBase): + """Abstract base class for source uploader API endpoints.""" + + @abstractmethod + async def upload_source( + self, + source_type: StrictStr, + name: StrictStr, + kwargs: list[KeyValuePair], + timeout: Optional[float], + ) -> None: + """ + Uploads the parameters for source content extraction. + + Parameters + ---------- + source_type : str + The type of the source. Is used by the extractor service to determine the correct extraction method. + name : str + Display name of the source. + kwargs : list[KeyValuePair] + List of KeyValuePair with parameters used for the extraction. + timeout : float, optional + Timeout for the operation. + + Returns + ------- + None + """ diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/uploader_base.py b/admin-api-lib/src/admin_api_lib/api_endpoints/uploader_base.py new file mode 100644 index 0000000..a344dcc --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/api_endpoints/uploader_base.py @@ -0,0 +1,30 @@ +"""Module for the base class of uploader API endpoints.""" + +from threading import Thread + + +class UploaderBase: + """Base class for uploader API endpoints.""" + + def __init__(self): + """ + Initialize the UploaderBase. + """ + self._background_threads = [] + + def _prune_background_threads(self) -> list[Thread]: + """ + Prune background threads that are no longer running. + + Returns + ------- + list[Thread] + A list of background threads that are still alive. 
+ """ + tmp_background_threads = [] + for thread in self._background_threads: + if not thread.is_alive(): + thread.join() + else: + tmp_background_threads.append(thread) + self._background_threads = tmp_background_threads diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api.py b/admin-api-lib/src/admin_api_lib/apis/admin_api.py index 16efc4b..c348b5d 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api.py @@ -2,15 +2,37 @@ # coding: utf-8 +from typing import Dict, List # noqa: F401 import importlib import pkgutil - -from fastapi import APIRouter, Path, Request, Response, UploadFile # noqa: F401 +from typing_extensions import Annotated import admin_api_lib.impl + +from fastapi import ( # noqa: F401 + APIRouter, + Body, + Cookie, + Depends, + Form, + UploadFile, + Request, + Header, + HTTPException, + Path, + Query, + Response, + Security, + status, +) +from pydantic import Field, StrictStr + + from admin_api_lib.apis.admin_api_base import BaseAdminApi from admin_api_lib.models.document_status import DocumentStatus - +from admin_api_lib.models.http_validation_error import HTTPValidationError +from admin_api_lib.models.key_value_pair import KeyValuePair +from admin_api_lib.models.extra_models import TokenModel # noqa: F401 router = APIRouter() @@ -24,12 +46,14 @@ responses={ 200: {"description": "Deleted"}, 500: {"description": "Internal server error"}, + 422: {"model": HTTPValidationError, "description": "Validation Error"}, }, tags=["admin"], + summary="Delete Document", response_model_by_alias=True, ) async def delete_document( - identification: str = Path(..., description=""), + identification: StrictStr = Path(..., description=""), ) -> None: """ Asynchronously deletes a document based on the provided identification. @@ -43,6 +67,8 @@ async def delete_document( ------- None """ + if not BaseAdminApi.subclasses: + raise HTTPException(status_code=500, detail="Not implemented") return await BaseAdminApi.subclasses[0]().delete_document(identification) @@ -53,12 +79,16 @@ async def delete_document( 400: {"model": str, "description": "Bad request"}, 404: {"model": str, "description": "Document not found."}, 500: {"model": str, "description": "Internal server error"}, + 422: {"model": HTTPValidationError, "description": "Validation Error"}, }, tags=["admin"], + summary="Document Reference Id Get", response_model_by_alias=True, ) -async def document_reference_id_get( - identification: str = Path(..., description="Identifier of the pdf document."), +async def document_reference( + identification: Annotated[StrictStr, Field(description="Identifier of the document.")] = Path( + ..., description="Identifier of the document." + ), ) -> Response: """ Asynchronously retrieve a document reference by its identification. @@ -73,19 +103,22 @@ async def document_reference_id_get( Response The response object containing the document reference details. 
""" - return await BaseAdminApi.subclasses[0]().document_reference_id_get(identification) + if not BaseAdminApi.subclasses: + raise HTTPException(status_code=500, detail="Not implemented") + return await BaseAdminApi.subclasses[0]().document_reference(identification) @router.get( "/all_documents_status", responses={ - 200: {"model": list[DocumentStatus], "description": "list of document links"}, + 200: {"model": List[DocumentStatus], "description": "List of document links"}, 500: {"description": "Internal server error"}, }, tags=["admin"], + summary="Get All Documents Status", response_model_by_alias=True, ) -async def get_all_documents_status() -> list[DocumentStatus]: +async def get_all_documents_status() -> List[DocumentStatus]: """ Asynchronously retrieves the status of all documents. @@ -94,64 +127,71 @@ async def get_all_documents_status() -> list[DocumentStatus]: list[DocumentStatus] A list containing the status of all documents. """ + if not BaseAdminApi.subclasses: + raise HTTPException(status_code=500, detail="Not implemented") return await BaseAdminApi.subclasses[0]().get_all_documents_status() @router.post( - "/load_confluence", + "/upload_file", responses={ - 200: {"description": "Loading from confluence is successful"}, - 423: { - "description": ( - "if the confluence loader is already processing a request," - "no further requests are possible. The user needs to wait," - "till the preliminary request finished processing." - ) - }, - 500: {"description": "Internal Server Error"}, - 501: {"description": "The confluence loader is not set up"}, + 200: {"description": "ok"}, + 400: {"description": "Bad request"}, + 422: {"description": "Unprocessable Content"}, + 500: {"description": "Internal server error"}, }, tags=["admin"], + summary="Upload File", response_model_by_alias=True, ) -async def load_confluence_post() -> None: +async def upload_file( + file: UploadFile, + request: Request, +) -> None: """ - Asynchronously loads a Confluence space. + Uploads user selected sources. - Returns - ------- - None + Parameters + ---------- + file : UploadFile + The file to be uploaded. + request : Request + The HTTP request object containing metadata about the upload request. """ - return await BaseAdminApi.subclasses[0]().load_confluence_post() + if not BaseAdminApi.subclasses: + raise HTTPException(status_code=500, detail="Not implemented") + return await BaseAdminApi.subclasses[0]().upload_file(file, request) @router.post( - "/upload_documents", + "/upload_source", responses={ 200: {"description": "ok"}, 400: {"description": "Bad request"}, - 422: {"description": "If no text has been extracted from the file."}, + 422: {"description": "Unprocessable Content"}, 500: {"description": "Internal server error"}, }, tags=["admin"], + summary="Upload Source", response_model_by_alias=True, ) -async def upload_documents_post( - body: UploadFile, - request: Request, +async def upload_source( + source_type: StrictStr = Query(None, description="The type of the source"), + name: StrictStr = Query(None, description="The name of the source", alias="name"), + key_value_pair: List[KeyValuePair] = Body(None, description="The key-value pairs for the source"), ) -> None: """ - Asynchronously uploads user-selected source documents. + Uploads user selected sources. Parameters ---------- - body : UploadFile - The file object containing the source documents to be uploaded. - request : Request - The request object containing metadata about the upload request. 
- - Returns - ------- - None + source_type : str + The type of the source. Is used by the extractor service to determine the correct extractor to use. + name : str + Display name of the source. + key_value_pair : List[KeyValuePair] + List of KeyValuePair with parameters used for the extraction. """ - return await BaseAdminApi.subclasses[0]().upload_documents_post(body, request) + if not BaseAdminApi.subclasses: + raise HTTPException(status_code=500, detail="Not implemented") + return await BaseAdminApi.subclasses[0]().upload_source(source_type, name, key_value_pair) diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py index 6d12beb..e3841b9 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py @@ -3,11 +3,14 @@ # coding: utf-8 # flake8: noqa: D105 -from typing import ClassVar, Tuple # noqa: F401 +from typing import ClassVar, Dict, List, Tuple # noqa: F401 +from typing_extensions import Annotated +from pydantic import Field, StrictStr from fastapi import Request, Response, UploadFile from admin_api_lib.models.document_status import DocumentStatus +from admin_api_lib.models.key_value_pair import KeyValuePair class BaseAdminApi: @@ -28,7 +31,7 @@ def __init_subclass__(cls, **kwargs): async def delete_document( self, - identification: str, + identification: StrictStr, ) -> None: """ Asynchronously deletes a document based on the provided identification. @@ -43,9 +46,9 @@ async def delete_document( None """ - async def document_reference_id_get( + async def document_reference( self, - identification: str, + identification: Annotated[StrictStr, Field(description="Identifier of the document.")], ) -> Response: """ Asynchronously retrieve a document reference by its identification. @@ -73,28 +76,40 @@ async def get_all_documents_status( A list containing the status of all documents. """ - async def load_confluence_post( + async def upload_source( self, + source_type: StrictStr, + name: StrictStr, + key_value_pair: List[KeyValuePair], ) -> None: """ - Asynchronously loads a Confluence space. + Asynchronously uploads user selected source. + + Parameters + ---------- + source_type : str + The type of the source. Is used by the extractor service to determine the correct extractor to use. + name : str + Display name of the source. + key_value_pair : list[KeyValuePair] + List of KeyValuePair with parameters used for the extraction. Returns ------- None """ - async def upload_documents_post( + async def upload_file( self, - body: UploadFile, + file: UploadFile, request: Request, ) -> None: """ - Asynchronously uploads user-selected source documents. + Asynchronously uploads user-selected documents. Parameters ---------- - body : UploadFile + file : UploadFile The file object containing the source documents to be uploaded. request : Request The request object containing metadata about the upload request. 
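To make the dispatch mechanism above concrete: the generated router forwards every request to the first registered subclass of `BaseAdminApi` (registration happens in `__init_subclass__`), which is why each handler now guards with a 500 when no subclass exists. A minimal sketch of such a subclass — the class name and body are illustrative, not part of this diff:

```python
from fastapi import Request, UploadFile

from admin_api_lib.apis.admin_api_base import BaseAdminApi


class AdminApiImpl(BaseAdminApi):
    """Hypothetical implementation; registered automatically via __init_subclass__."""

    async def upload_file(self, file: UploadFile, request: Request) -> None:
        # A real implementation would resolve the FileUploader endpoint object
        # from the dependency container and delegate the upload to it.
        ...
```

Once a module defining such a class has been imported, `BaseAdminApi.subclasses[0]` resolves to it and the routes become functional.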
diff --git a/admin-api-lib/src/admin_api_lib/dependency_container.py b/admin-api-lib/src/admin_api_lib/dependency_container.py index 9079a47..fd5e0a1 100644 --- a/admin-api-lib/src/admin_api_lib/dependency_container.py +++ b/admin-api-lib/src/admin_api_lib/dependency_container.py @@ -1,5 +1,6 @@ """Module for the DependencyContainer class.""" +from admin_api_lib.impl.api_endpoints.default_file_uploader import DefaultFileUploader from dependency_injector.containers import DeclarativeContainer from dependency_injector.providers import ( # noqa: WOT001 Configuration, @@ -18,18 +19,14 @@ from admin_api_lib.extractor_api_client.openapi_client.configuration import ( Configuration as ExtractorConfiguration, ) -from admin_api_lib.impl.api_endpoints.default_confluence_loader import ( - DefaultConfluenceLoader, -) +from admin_api_lib.impl.api_endpoints.default_source_uploader import DefaultSourceUploader from admin_api_lib.impl.api_endpoints.default_document_deleter import ( DefaultDocumentDeleter, ) from admin_api_lib.impl.api_endpoints.default_document_reference_retriever import ( DefaultDocumentReferenceRetriever, ) -from admin_api_lib.impl.api_endpoints.default_document_uploader import ( - DefaultDocumentUploader, -) + from admin_api_lib.impl.api_endpoints.default_documents_status_retriever import ( DefaultDocumentsStatusRetriever, ) @@ -42,20 +39,17 @@ from admin_api_lib.impl.key_db.file_status_key_value_store import ( FileStatusKeyValueStore, ) -from admin_api_lib.impl.mapper.confluence_settings_mapper import ( - ConfluenceSettingsMapper, -) from admin_api_lib.impl.mapper.informationpiece2document import ( InformationPiece2Document, ) from admin_api_lib.impl.settings.chunker_settings import ChunkerSettings -from admin_api_lib.impl.settings.confluence_settings import ConfluenceSettings from admin_api_lib.impl.settings.document_extractor_settings import ( DocumentExtractorSettings, ) from admin_api_lib.impl.settings.key_value_settings import KeyValueSettings from admin_api_lib.impl.settings.rag_api_settings import RAGAPISettings from admin_api_lib.impl.settings.s3_settings import S3Settings +from admin_api_lib.impl.settings.source_uploader_settings import SourceUploaderSettings from admin_api_lib.impl.settings.summarizer_settings import SummarizerSettings from admin_api_lib.impl.summarizer.langchain_summarizer import LangchainSummarizer from admin_api_lib.prompt_templates.summarize_prompt import SUMMARIZE_PROMPT @@ -92,7 +86,7 @@ class DependencyContainer(DeclarativeContainer): rag_api_settings = RAGAPISettings() key_value_store_settings = KeyValueSettings() summarizer_settings = SummarizerSettings() - confluence_settings = ConfluenceSettings() + source_uploader_settings = SourceUploaderSettings() key_value_store = Singleton(FileStatusKeyValueStore, key_value_store_settings) file_service = Singleton(S3Service, s3_settings=s3_settings) @@ -111,7 +105,6 @@ class DependencyContainer(DeclarativeContainer): rag_api = Singleton(RagApi, rag_api_client) information_mapper = Singleton(InformationPiece2Document) - confluence_settings_mapper = Singleton(ConfluenceSettingsMapper) large_language_model = Selector( class_selector_config.llm_type, @@ -164,27 +157,29 @@ class DependencyContainer(DeclarativeContainer): DefaultDocumentDeleter, rag_api=rag_api, file_service=file_service, key_value_store=key_value_store ) documents_status_retriever = Singleton(DefaultDocumentsStatusRetriever, key_value_store=key_value_store) - confluence_loader = Singleton( - DefaultConfluenceLoader, + + document_reference_retriever 
= Singleton(DefaultDocumentReferenceRetriever, file_service=file_service) + + source_uploader = Singleton( + DefaultSourceUploader, extractor_api=document_extractor, rag_api=rag_api, - key_value_store=key_value_store, - settings=confluence_settings, information_enhancer=information_enhancer, information_mapper=information_mapper, chunker=chunker, + key_value_store=key_value_store, document_deleter=document_deleter, - settings_mapper=confluence_settings_mapper, + settings=source_uploader_settings, ) - document_reference_retriever = Singleton(DefaultDocumentReferenceRetriever, file_service=file_service) - document_uploader = Singleton( - DefaultDocumentUploader, - document_extractor=document_extractor, - file_service=file_service, + + file_uploader = Singleton( + DefaultFileUploader, + extractor_api=document_extractor, rag_api=rag_api, information_enhancer=information_enhancer, information_mapper=information_mapper, chunker=chunker, key_value_store=key_value_store, document_deleter=document_deleter, + file_service=file_service, ) diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/__init__.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/__init__.py index 79a89e3..edf9fd4 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/__init__.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/__init__.py @@ -17,38 +17,22 @@ __version__ = "1.0.0" # import apis into sdk package -from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ( - ExtractorApi, -) -from admin_api_lib.extractor_api_client.openapi_client.api_client import ApiClient +from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi # import ApiClient from admin_api_lib.extractor_api_client.openapi_client.api_response import ApiResponse -from admin_api_lib.extractor_api_client.openapi_client.configuration import ( - Configuration, -) -from admin_api_lib.extractor_api_client.openapi_client.exceptions import ( - ApiAttributeError, - ApiException, - ApiKeyError, - ApiTypeError, - ApiValueError, - OpenApiException, -) +from admin_api_lib.extractor_api_client.openapi_client.api_client import ApiClient +from admin_api_lib.extractor_api_client.openapi_client.configuration import Configuration +from admin_api_lib.extractor_api_client.openapi_client.exceptions import OpenApiException +from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiTypeError +from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiValueError +from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiKeyError +from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiAttributeError +from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiException # import models into sdk package -from admin_api_lib.extractor_api_client.openapi_client.models.confluence_parameters import ( - ConfluenceParameters, -) -from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ( - ContentType, -) -from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ( - ExtractionRequest, -) -from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import ( - InformationPiece, -) -from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import ( - KeyValuePair, -) +from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ContentType 
+from admin_api_lib.extractor_api_client.openapi_client.models.extraction_parameters import ExtractionParameters +from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ExtractionRequest +from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import InformationPiece +from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/__init__.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/__init__.py index 13a312f..c95ce65 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/__init__.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/__init__.py @@ -1,6 +1,4 @@ # flake8: noqa # import apis into api package -from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ( - ExtractorApi, -) +from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py index f1fddba..1aaddf7 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py @@ -12,25 +12,17 @@ """ # noqa: E501 import warnings +from pydantic import validate_call, Field, StrictFloat, StrictStr, StrictInt from typing import Any, Dict, List, Optional, Tuple, Union - -from pydantic import Field, StrictFloat, StrictInt, StrictStr, validate_call from typing_extensions import Annotated -from admin_api_lib.extractor_api_client.openapi_client.api_client import ( - ApiClient, - RequestSerialized, -) +from typing import List +from admin_api_lib.extractor_api_client.openapi_client.models.extraction_parameters import ExtractionParameters +from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ExtractionRequest +from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import InformationPiece + +from admin_api_lib.extractor_api_client.openapi_client.api_client import ApiClient, RequestSerialized from admin_api_lib.extractor_api_client.openapi_client.api_response import ApiResponse -from admin_api_lib.extractor_api_client.openapi_client.models.confluence_parameters import ( - ConfluenceParameters, -) -from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ( - ExtractionRequest, -) -from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import ( - InformationPiece, -) from admin_api_lib.extractor_api_client.openapi_client.rest import RESTResponseType @@ -47,9 +39,9 @@ def __init__(self, api_client=None) -> None: self.api_client = api_client @validate_call - def extract_from_confluence_post( + def extract_from_file_post( self, - confluence_parameters: ConfluenceParameters, + extraction_request: ExtractionRequest, _request_timeout: Union[ None, Annotated[StrictFloat, Field(gt=0)], @@ -60,11 +52,11 @@ def extract_from_confluence_post( _headers: Optional[Dict[StrictStr, Any]] = None, _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0, ) -> List[InformationPiece]: - """extract_from_confluence_post + """extract_from_file_post - :param confluence_parameters: (required) - :type confluence_parameters: 
ConfluenceParameters + :param extraction_request: (required) + :type extraction_request: ExtractionRequest :param _request_timeout: timeout setting for this request. If one number provided, it will be total request timeout. It can also be a pair (tuple) of @@ -87,8 +79,8 @@ def extract_from_confluence_post( :return: Returns the result object. """ # noqa: E501 - _param = self._extract_from_confluence_post_serialize( - confluence_parameters=confluence_parameters, + _param = self._extract_from_file_post_serialize( + extraction_request=extraction_request, _request_auth=_request_auth, _content_type=_content_type, _headers=_headers, @@ -97,7 +89,6 @@ def extract_from_confluence_post( _response_types_map: Dict[str, Optional[str]] = { "200": "List[InformationPiece]", - "404": None, "422": None, "500": None, } @@ -109,9 +100,9 @@ def extract_from_confluence_post( ).data @validate_call - def extract_from_confluence_post_with_http_info( + def extract_from_file_post_with_http_info( self, - confluence_parameters: ConfluenceParameters, + extraction_request: ExtractionRequest, _request_timeout: Union[ None, Annotated[StrictFloat, Field(gt=0)], @@ -122,11 +113,11 @@ def extract_from_confluence_post_with_http_info( _headers: Optional[Dict[StrictStr, Any]] = None, _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0, ) -> ApiResponse[List[InformationPiece]]: - """extract_from_confluence_post + """extract_from_file_post - :param confluence_parameters: (required) - :type confluence_parameters: ConfluenceParameters + :param extraction_request: (required) + :type extraction_request: ExtractionRequest :param _request_timeout: timeout setting for this request. If one number provided, it will be total request timeout. It can also be a pair (tuple) of @@ -149,8 +140,8 @@ def extract_from_confluence_post_with_http_info( :return: Returns the result object. """ # noqa: E501 - _param = self._extract_from_confluence_post_serialize( - confluence_parameters=confluence_parameters, + _param = self._extract_from_file_post_serialize( + extraction_request=extraction_request, _request_auth=_request_auth, _content_type=_content_type, _headers=_headers, @@ -159,7 +150,6 @@ def extract_from_confluence_post_with_http_info( _response_types_map: Dict[str, Optional[str]] = { "200": "List[InformationPiece]", - "404": None, "422": None, "500": None, } @@ -171,9 +161,9 @@ def extract_from_confluence_post_with_http_info( ) @validate_call - def extract_from_confluence_post_without_preload_content( + def extract_from_file_post_without_preload_content( self, - confluence_parameters: ConfluenceParameters, + extraction_request: ExtractionRequest, _request_timeout: Union[ None, Annotated[StrictFloat, Field(gt=0)], @@ -184,11 +174,11 @@ def extract_from_confluence_post_without_preload_content( _headers: Optional[Dict[StrictStr, Any]] = None, _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0, ) -> RESTResponseType: - """extract_from_confluence_post + """extract_from_file_post - :param confluence_parameters: (required) - :type confluence_parameters: ConfluenceParameters + :param extraction_request: (required) + :type extraction_request: ExtractionRequest :param _request_timeout: timeout setting for this request. If one number provided, it will be total request timeout. It can also be a pair (tuple) of @@ -211,8 +201,8 @@ def extract_from_confluence_post_without_preload_content( :return: Returns the result object. 
""" # noqa: E501 - _param = self._extract_from_confluence_post_serialize( - confluence_parameters=confluence_parameters, + _param = self._extract_from_file_post_serialize( + extraction_request=extraction_request, _request_auth=_request_auth, _content_type=_content_type, _headers=_headers, @@ -221,21 +211,21 @@ def extract_from_confluence_post_without_preload_content( _response_types_map: Dict[str, Optional[str]] = { "200": "List[InformationPiece]", - "404": None, "422": None, "500": None, } response_data = self.api_client.call_api(*_param, _request_timeout=_request_timeout) return response_data.response - def _extract_from_confluence_post_serialize( + def _extract_from_file_post_serialize( self, - confluence_parameters, + extraction_request, _request_auth, _content_type, _headers, _host_index, ) -> RequestSerialized: + _host = None _collection_formats: Dict[str, str] = {} @@ -252,8 +242,8 @@ def _extract_from_confluence_post_serialize( # process the header parameters # process the form parameters # process the body parameter - if confluence_parameters is not None: - _body_params = confluence_parameters + if extraction_request is not None: + _body_params = extraction_request # set the HTTP header `Accept` if "Accept" not in _header_params: @@ -272,7 +262,7 @@ def _extract_from_confluence_post_serialize( return self.api_client.param_serialize( method="POST", - resource_path="/extract_from_confluence", + resource_path="/extract_from_file", path_params=_path_params, query_params=_query_params, header_params=_header_params, @@ -286,9 +276,9 @@ def _extract_from_confluence_post_serialize( ) @validate_call - def extract_from_file_post( + def extract_from_source( self, - extraction_request: ExtractionRequest, + extraction_parameters: ExtractionParameters, _request_timeout: Union[ None, Annotated[StrictFloat, Field(gt=0)], @@ -299,11 +289,11 @@ def extract_from_file_post( _headers: Optional[Dict[StrictStr, Any]] = None, _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0, ) -> List[InformationPiece]: - """extract_from_file_post + """extract_from_source - :param extraction_request: (required) - :type extraction_request: ExtractionRequest + :param extraction_parameters: (required) + :type extraction_parameters: ExtractionParameters :param _request_timeout: timeout setting for this request. If one number provided, it will be total request timeout. It can also be a pair (tuple) of @@ -326,8 +316,8 @@ def extract_from_file_post( :return: Returns the result object. 
""" # noqa: E501 - _param = self._extract_from_file_post_serialize( - extraction_request=extraction_request, + _param = self._extract_from_source_serialize( + extraction_parameters=extraction_parameters, _request_auth=_request_auth, _content_type=_content_type, _headers=_headers, @@ -336,6 +326,7 @@ def extract_from_file_post( _response_types_map: Dict[str, Optional[str]] = { "200": "List[InformationPiece]", + "404": None, "422": None, "500": None, } @@ -347,9 +338,9 @@ def extract_from_file_post( ).data @validate_call - def extract_from_file_post_with_http_info( + def extract_from_source_with_http_info( self, - extraction_request: ExtractionRequest, + extraction_parameters: ExtractionParameters, _request_timeout: Union[ None, Annotated[StrictFloat, Field(gt=0)], @@ -360,11 +351,11 @@ def extract_from_file_post_with_http_info( _headers: Optional[Dict[StrictStr, Any]] = None, _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0, ) -> ApiResponse[List[InformationPiece]]: - """extract_from_file_post + """extract_from_source - :param extraction_request: (required) - :type extraction_request: ExtractionRequest + :param extraction_parameters: (required) + :type extraction_parameters: ExtractionParameters :param _request_timeout: timeout setting for this request. If one number provided, it will be total request timeout. It can also be a pair (tuple) of @@ -387,8 +378,8 @@ def extract_from_file_post_with_http_info( :return: Returns the result object. """ # noqa: E501 - _param = self._extract_from_file_post_serialize( - extraction_request=extraction_request, + _param = self._extract_from_source_serialize( + extraction_parameters=extraction_parameters, _request_auth=_request_auth, _content_type=_content_type, _headers=_headers, @@ -397,6 +388,7 @@ def extract_from_file_post_with_http_info( _response_types_map: Dict[str, Optional[str]] = { "200": "List[InformationPiece]", + "404": None, "422": None, "500": None, } @@ -408,9 +400,9 @@ def extract_from_file_post_with_http_info( ) @validate_call - def extract_from_file_post_without_preload_content( + def extract_from_source_without_preload_content( self, - extraction_request: ExtractionRequest, + extraction_parameters: ExtractionParameters, _request_timeout: Union[ None, Annotated[StrictFloat, Field(gt=0)], @@ -421,11 +413,11 @@ def extract_from_file_post_without_preload_content( _headers: Optional[Dict[StrictStr, Any]] = None, _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0, ) -> RESTResponseType: - """extract_from_file_post + """extract_from_source - :param extraction_request: (required) - :type extraction_request: ExtractionRequest + :param extraction_parameters: (required) + :type extraction_parameters: ExtractionParameters :param _request_timeout: timeout setting for this request. If one number provided, it will be total request timeout. It can also be a pair (tuple) of @@ -448,8 +440,8 @@ def extract_from_file_post_without_preload_content( :return: Returns the result object. 
""" # noqa: E501 - _param = self._extract_from_file_post_serialize( - extraction_request=extraction_request, + _param = self._extract_from_source_serialize( + extraction_parameters=extraction_parameters, _request_auth=_request_auth, _content_type=_content_type, _headers=_headers, @@ -458,20 +450,22 @@ def extract_from_file_post_without_preload_content( _response_types_map: Dict[str, Optional[str]] = { "200": "List[InformationPiece]", + "404": None, "422": None, "500": None, } response_data = self.api_client.call_api(*_param, _request_timeout=_request_timeout) return response_data.response - def _extract_from_file_post_serialize( + def _extract_from_source_serialize( self, - extraction_request, + extraction_parameters, _request_auth, _content_type, _headers, _host_index, ) -> RequestSerialized: + _host = None _collection_formats: Dict[str, str] = {} @@ -488,8 +482,8 @@ def _extract_from_file_post_serialize( # process the header parameters # process the form parameters # process the body parameter - if extraction_request is not None: - _body_params = extraction_request + if extraction_parameters is not None: + _body_params = extraction_parameters # set the HTTP header `Accept` if "Accept" not in _header_params: @@ -508,7 +502,7 @@ def _extract_from_file_post_serialize( return self.api_client.param_serialize( method="POST", - resource_path="/extract_from_file", + resource_path="/extract_from_source", path_params=_path_params, query_params=_query_params, header_params=_header_params, diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_client.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_client.py index 911fd0d..ba8f5d2 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_client.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_client.py @@ -13,36 +13,31 @@ import datetime +from dateutil.parser import parse +from enum import Enum import decimal import json import mimetypes import os import re import tempfile -from enum import Enum -from typing import Dict, List, Optional, Tuple, Union -from urllib.parse import quote -from dateutil.parser import parse +from urllib.parse import quote +from typing import Tuple, Optional, List, Dict, Union from pydantic import SecretStr +from admin_api_lib.extractor_api_client.openapi_client.configuration import Configuration +from admin_api_lib.extractor_api_client.openapi_client.api_response import ApiResponse, T as ApiResponseT import admin_api_lib.extractor_api_client.openapi_client.models from admin_api_lib.extractor_api_client.openapi_client import rest -from admin_api_lib.extractor_api_client.openapi_client.api_response import ApiResponse -from admin_api_lib.extractor_api_client.openapi_client.api_response import ( - T as ApiResponseT, -) -from admin_api_lib.extractor_api_client.openapi_client.configuration import ( - Configuration, -) from admin_api_lib.extractor_api_client.openapi_client.exceptions import ( - ApiException, ApiValueError, + ApiException, BadRequestException, + UnauthorizedException, ForbiddenException, NotFoundException, ServiceException, - UnauthorizedException, ) RequestSerialized = Tuple[str, str, Dict[str, str], Optional[str], List[str]] diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_response.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_response.py index ca801da..1ce1372 100644 --- 
a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_response.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_response.py @@ -1,10 +1,8 @@ """API response object.""" from __future__ import annotations - -from typing import Generic, Mapping, Optional, TypeVar - -from pydantic import BaseModel, Field, StrictBytes, StrictInt +from typing import Optional, Generic, Mapping, TypeVar +from pydantic import Field, StrictInt, StrictBytes, BaseModel T = TypeVar("T") diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/configuration.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/configuration.py index de102b2..2e80369 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/configuration.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/configuration.py @@ -13,15 +13,15 @@ import copy -import http.client as httplib import logging +from logging import FileHandler import multiprocessing import sys -from logging import FileHandler from typing import Optional - import urllib3 +import http.client as httplib + JSON_SCHEMA_VALIDATION_KEYWORDS = { "multipleOf", "maximum", diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/exceptions.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/exceptions.py index 877d8be..5dbd4b0 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/exceptions.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/exceptions.py @@ -12,7 +12,6 @@ """ # noqa: E501 from typing import Any, Optional - from typing_extensions import Self @@ -103,6 +102,7 @@ def __init__(self, msg, path_to_item=None) -> None: class ApiException(OpenApiException): + def __init__( self, status=None, diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/__init__.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/__init__.py index 4301aed..ad02f00 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/__init__.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/__init__.py @@ -14,18 +14,8 @@ # import models into model package -from admin_api_lib.extractor_api_client.openapi_client.models.confluence_parameters import ( - ConfluenceParameters, -) -from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ( - ContentType, -) -from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ( - ExtractionRequest, -) -from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import ( - InformationPiece, -) -from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import ( - KeyValuePair, -) +from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ContentType +from admin_api_lib.extractor_api_client.openapi_client.models.extraction_parameters import ExtractionParameters +from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ExtractionRequest +from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import InformationPiece +from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/confluence_parameters.py 
b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/confluence_parameters.py deleted file mode 100644 index e24f0ad..0000000 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/confluence_parameters.py +++ /dev/null @@ -1,137 +0,0 @@ -# coding: utf-8 - -""" -extractor-api-lib - -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) - -Do not edit the class manually. -""" # noqa: E501 - - -from __future__ import annotations - -import json -import pprint -import re # noqa: F401 -from typing import Any, ClassVar, Dict, List, Optional, Set - -from pydantic import BaseModel, ConfigDict, Field, StrictBool, StrictStr -from typing import Any, ClassVar, Dict, List, Optional -from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair -from typing import Optional, Set -from typing_extensions import Self - - -class ConfluenceParameters(BaseModel): - """ """ # noqa: E501 - - url: StrictStr = Field(description="url of the confluence space.") - token: StrictStr = Field(description="api key to access confluence.") - space_key: StrictStr = Field(description="the space key of the confluence pages.") - include_attachments: Optional[StrictBool] = Field( - default=False, - description="whether to include file attachments (e.g., images, documents) in the parsed content. Default is `false`.", - ) - keep_markdown_format: Optional[StrictBool] = Field( - default=True, description="whether to preserve markdown formatting in the output. Default is `true`." - ) - keep_newlines: Optional[StrictBool] = Field( - default=True, - description="whether to retain newline characters in the output for better readability. Default is `true`.", - ) - document_name: StrictStr = Field( - description="The name that will be used to store the confluence db in the key value db and the vectordatabase (metadata.document)." - ) - confluence_kwargs: Optional[List[KeyValuePair]] = Field( - default=None, description="Additional kwargs like verify_ssl" - ) - __properties: ClassVar[List[str]] = [ - "url", - "token", - "space_key", - "include_attachments", - "keep_markdown_format", - "keep_newlines", - "document_name", - "confluence_kwargs", - ] - - model_config = ConfigDict( - populate_by_name=True, - validate_assignment=True, - protected_namespaces=(), - ) - - def to_str(self) -> str: - """Returns the string representation of the model using alias""" - return pprint.pformat(self.model_dump(by_alias=True)) - - def to_json(self) -> str: - """Returns the JSON representation of the model using alias""" - return self.model_dump_json(by_alias=True, exclude_unset=True) - - @classmethod - def from_json(cls, json_str: str) -> Optional[Self]: - """Create an instance of ConfluenceParameters from a JSON string""" - return cls.from_dict(json.loads(json_str)) - - def to_dict(self) -> Dict[str, Any]: - """Return the dictionary representation of the model using alias. - - This has the following differences from calling pydantic's - `self.model_dump(by_alias=True)`: - - * `None` is only added to the output dict for nullable fields that - were set at model initialization. Other fields with value `None` - are ignored. 
- """ - excluded_fields: Set[str] = set([]) - - _dict = self.model_dump( - by_alias=True, - exclude=excluded_fields, - exclude_none=True, - ) - # override the default output from pydantic by calling `to_dict()` of each item in confluence_kwargs (list) - _items = [] - if self.confluence_kwargs: - for _item_confluence_kwargs in self.confluence_kwargs: - if _item_confluence_kwargs: - _items.append(_item_confluence_kwargs.to_dict()) - _dict["confluence_kwargs"] = _items - return _dict - - @classmethod - def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]: - """Create an instance of ConfluenceParameters from a dict""" - if obj is None: - return None - - if not isinstance(obj, dict): - return cls.model_validate(obj) - - _obj = cls.model_validate( - { - "url": obj.get("url"), - "token": obj.get("token"), - "space_key": obj.get("space_key"), - "include_attachments": ( - obj.get("include_attachments") if obj.get("include_attachments") is not None else False - ), - "keep_markdown_format": ( - obj.get("keep_markdown_format") if obj.get("keep_markdown_format") is not None else True - ), - "keep_newlines": obj.get("keep_newlines") if obj.get("keep_newlines") is not None else True, - "document_name": obj.get("document_name"), - "confluence_kwargs": ( - [KeyValuePair.from_dict(_item) for _item in obj["confluence_kwargs"]] - if obj.get("confluence_kwargs") is not None - else None - ), - } - ) - return _obj diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/content_type.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/content_type.py index c659e69..cd0f9c7 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/content_type.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/content_type.py @@ -13,10 +13,8 @@ from __future__ import annotations - import json from enum import Enum - from typing_extensions import Self diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_parameters.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_parameters.py new file mode 100644 index 0000000..13ba2ea --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_parameters.py @@ -0,0 +1,103 @@ +# coding: utf-8 + +""" +extractor-api-lib + +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. +""" # noqa: E501 + + +from __future__ import annotations +import pprint +import re # noqa: F401 +import json + +from pydantic import BaseModel, ConfigDict, Field, StrictStr +from typing import Any, ClassVar, Dict, List, Optional +from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair +from typing import Optional, Set +from typing_extensions import Self + + +class ExtractionParameters(BaseModel): + """ """ # noqa: E501 + + document_name: StrictStr = Field( + description="The name that will be used to store the confluence db in the key value db and the vectordatabase (metadata.document)." 
+ ) + kwargs: Optional[List[KeyValuePair]] = Field(default=None, description="Kwargs for the extractor") + source_type: StrictStr = Field(description="Type of the source; determines which extractor is used.") + __properties: ClassVar[List[str]] = ["document_name", "kwargs", "source_type"] + + model_config = ConfigDict( + populate_by_name=True, + validate_assignment=True, + protected_namespaces=(), + ) + + def to_str(self) -> str: + """Returns the string representation of the model using alias""" + return pprint.pformat(self.model_dump(by_alias=True)) + + def to_json(self) -> str: + """Returns the JSON representation of the model using alias""" + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) + + @classmethod + def from_json(cls, json_str: str) -> Optional[Self]: + """Create an instance of ExtractionParameters from a JSON string""" + return cls.from_dict(json.loads(json_str)) + + def to_dict(self) -> Dict[str, Any]: + """Return the dictionary representation of the model using alias. + + This has the following differences from calling pydantic's + `self.model_dump(by_alias=True)`: + + * `None` is only added to the output dict for nullable fields that + were set at model initialization. Other fields with value `None` + are ignored. + """ + excluded_fields: Set[str] = set([]) + + _dict = self.model_dump( + by_alias=True, + exclude=excluded_fields, + exclude_none=True, + ) + # override the default output from pydantic by calling `to_dict()` of each item in kwargs (list) + _items = [] + if self.kwargs: + for _item_kwargs in self.kwargs: + if _item_kwargs: + _items.append(_item_kwargs.to_dict()) + _dict["kwargs"] = _items + return _dict + + @classmethod + def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]: + """Create an instance of ExtractionParameters from a dict""" + if obj is None: + return None + + if not isinstance(obj, dict): + return cls.model_validate(obj) + + _obj = cls.model_validate( + { + "document_name": obj.get("document_name"), + "kwargs": ( + [KeyValuePair.from_dict(_item) for _item in obj["kwargs"]] + if obj.get("kwargs") is not None + else None + ), + "source_type": obj.get("source_type"), + } + ) + return _obj diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py index 393ba17..8bcfb3c 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py @@ -13,13 +13,13 @@ from __future__ import annotations - -import json import pprint import re # noqa: F401 -from typing import Any, ClassVar, Dict, List, Optional, Set +import json from pydantic import BaseModel, ConfigDict, StrictStr +from typing import Any, ClassVar, Dict, List +from typing import Optional, Set from typing_extensions import Self @@ -27,7 +27,8 @@ class ExtractionRequest(BaseModel): """ """ # noqa: E501 path_on_s3: StrictStr - __properties: ClassVar[List[str]] = ["path_on_s3"] + document_name: StrictStr + __properties: ClassVar[List[str]] = ["path_on_s3", "document_name"] model_config = ConfigDict( populate_by_name=True, @@ -41,7 +42,8 @@ def to_str(self) -> str: def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - return self.model_dump_json(by_alias=True, exclude_unset=True) + # TODO: pydantic v2: use
.model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) @classmethod def from_json(cls, json_str: str) -> Optional[Self]: @@ -76,5 +78,5 @@ def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({"path_on_s3": obj.get("path_on_s3")}) + _obj = cls.model_validate({"path_on_s3": obj.get("path_on_s3"), "document_name": obj.get("document_name")}) return _obj diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/information_piece.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/information_piece.py index a6d6c08..a428183 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/information_piece.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/information_piece.py @@ -13,22 +13,17 @@ from __future__ import annotations - -import json import pprint import re # noqa: F401 -from typing import Any, ClassVar, Dict, List, Optional, Set +import json from pydantic import BaseModel, ConfigDict, StrictStr +from typing import Any, ClassVar, Dict, List +from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ContentType +from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair +from typing import Optional, Set from typing_extensions import Self -from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ( - ContentType, -) -from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import ( - KeyValuePair, -) - class InformationPiece(BaseModel): """ @@ -52,7 +47,8 @@ def to_str(self) -> str: def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - return self.model_dump_json(by_alias=True, exclude_unset=True) + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) @classmethod def from_json(cls, json_str: str) -> Optional[Self]: diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/key_value_pair.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/key_value_pair.py index 80629a9..2a77b65 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/key_value_pair.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/key_value_pair.py @@ -13,13 +13,13 @@ from __future__ import annotations - -import json import pprint import re # noqa: F401 -from typing import Any, ClassVar, Dict, List, Optional, Set +import json from pydantic import BaseModel, ConfigDict +from typing import Any, ClassVar, Dict, List, Optional +from typing import Optional, Set from typing_extensions import Self @@ -42,7 +42,8 @@ def to_str(self) -> str: def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - return self.model_dump_json(by_alias=True, exclude_unset=True) + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) @classmethod def from_json(cls, json_str: str) -> Optional[Self]: diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/rest.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/rest.py index 09f1e39..60fc660 100644 --- 
a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/rest.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/rest.py @@ -19,10 +19,7 @@ import urllib3 -from admin_api_lib.extractor_api_client.openapi_client.exceptions import ( - ApiException, - ApiValueError, -) +from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiException, ApiValueError SUPPORTED_SOCKS_PROXIES = {"socks5", "socks5h", "socks4", "socks4a"} RESTResponseType = urllib3.HTTPResponse @@ -39,6 +36,7 @@ def is_socks_proxy_url(url): class RESTResponse(io.IOBase): + def __init__(self, resp) -> None: self.response = resp self.status = resp.status @@ -60,6 +58,7 @@ def getheader(self, name, default=None): class RESTClientObject: + def __init__(self, configuration) -> None: # urllib3.PoolManager will pass all kw parameters to connectionpool # https://github.com/shazow/urllib3/blob/f9409436f83aeb79fbaf090181cd81b784f1b8ce/urllib3/poolmanager.py#L75 # noqa: E501 @@ -144,6 +143,7 @@ def request(self, method, url, headers=None, body=None, post_params=None, _reque try: # For `POST`, `PUT`, `PATCH`, `OPTIONS`, `DELETE` if method in ["POST", "PUT", "PATCH", "OPTIONS", "DELETE"]: + # no content type provided or payload is json content_type = headers.get("Content-Type") if not content_type or re.search("json", content_type, re.IGNORECASE): diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_content_type.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_content_type.py new file mode 100644 index 0000000..5a78d9b --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_content_type.py @@ -0,0 +1,35 @@ +# coding: utf-8 + +""" +extractor-api-lib + +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. +""" # noqa: E501 + + +import unittest + +from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ContentType + + +class TestContentType(unittest.TestCase): + """ContentType unit test stubs""" + + def setUp(self): + pass + + def tearDown(self): + pass + + def testContentType(self): + """Test ContentType""" + # inst = ContentType() + + +if __name__ == "__main__": + unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_parameters.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_parameters.py new file mode 100644 index 0000000..9504ab4 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_parameters.py @@ -0,0 +1,59 @@ +# coding: utf-8 + +""" +extractor-api-lib + +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. 
+""" # noqa: E501 + + +import unittest + +from admin_api_lib.extractor_api_client.openapi_client.models.extraction_parameters import ExtractionParameters + + +class TestExtractionParameters(unittest.TestCase): + """ExtractionParameters unit test stubs""" + + def setUp(self): + pass + + def tearDown(self): + pass + + def make_instance(self, include_optional) -> ExtractionParameters: + """Test ExtractionParameters + include_optional is a boolean, when False only required + params are included, when True both required and + optional params are included""" + # uncomment below to create an instance of `ExtractionParameters` + """ + model = ExtractionParameters() + if include_optional: + return ExtractionParameters( + document_name = '', + confluence_kwargs = [ + {"value":"value","key":"key"} + ], + type = '' + ) + else: + return ExtractionParameters( + document_name = '', + type = '', + ) + """ + + def testExtractionParameters(self): + """Test ExtractionParameters""" + # inst_req_only = self.make_instance(include_optional=False) + # inst_req_and_optional = self.make_instance(include_optional=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_request.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_request.py new file mode 100644 index 0000000..1401561 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_request.py @@ -0,0 +1,56 @@ +# coding: utf-8 + +""" +extractor-api-lib + +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. 
+""" # noqa: E501 + + +import unittest + +from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ExtractionRequest + + +class TestExtractionRequest(unittest.TestCase): + """ExtractionRequest unit test stubs""" + + def setUp(self): + pass + + def tearDown(self): + pass + + def make_instance(self, include_optional) -> ExtractionRequest: + """Test ExtractionRequest + include_optional is a boolean, when False only required + params are included, when True both required and + optional params are included""" + # uncomment below to create an instance of `ExtractionRequest` + """ + model = ExtractionRequest() + if include_optional: + return ExtractionRequest( + path_on_s3 = '', + document_name = '' + ) + else: + return ExtractionRequest( + path_on_s3 = '', + document_name = '', + ) + """ + + def testExtractionRequest(self): + """Test ExtractionRequest""" + # inst_req_only = self.make_instance(include_optional=False) + # inst_req_and_optional = self.make_instance(include_optional=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extractor_api.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extractor_api.py new file mode 100644 index 0000000..975a7bf --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extractor_api.py @@ -0,0 +1,39 @@ +# coding: utf-8 + +""" +extractor-api-lib + +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. +""" # noqa: E501 + + +import unittest + +from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi + + +class TestExtractorApi(unittest.TestCase): + """ExtractorApi unit test stubs""" + + def setUp(self) -> None: + self.api = ExtractorApi() + + def tearDown(self) -> None: + pass + + def test_extract_from_file_post(self) -> None: + """Test case for extract_from_file_post""" + pass + + def test_extract_from_source(self) -> None: + """Test case for extract_from_source""" + pass + + +if __name__ == "__main__": + unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_information_piece.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_information_piece.py new file mode 100644 index 0000000..479c858 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_information_piece.py @@ -0,0 +1,62 @@ +# coding: utf-8 + +""" +extractor-api-lib + +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. 
+""" # noqa: E501 + + +import unittest + +from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import InformationPiece + + +class TestInformationPiece(unittest.TestCase): + """InformationPiece unit test stubs""" + + def setUp(self): + pass + + def tearDown(self): + pass + + def make_instance(self, include_optional) -> InformationPiece: + """Test InformationPiece + include_optional is a boolean, when False only required + params are included, when True both required and + optional params are included""" + # uncomment below to create an instance of `InformationPiece` + """ + model = InformationPiece() + if include_optional: + return InformationPiece( + metadata = [ + {"value":"value","key":"key"} + ], + page_content = '', + type = 'IMAGE' + ) + else: + return InformationPiece( + metadata = [ + {"value":"value","key":"key"} + ], + page_content = '', + type = 'IMAGE', + ) + """ + + def testInformationPiece(self): + """Test InformationPiece""" + # inst_req_only = self.make_instance(include_optional=False) + # inst_req_and_optional = self.make_instance(include_optional=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_key_value_pair.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_key_value_pair.py new file mode 100644 index 0000000..0ddc864 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_key_value_pair.py @@ -0,0 +1,54 @@ +# coding: utf-8 + +""" +extractor-api-lib + +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. 
+""" # noqa: E501 + + +import unittest + +from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair + + +class TestKeyValuePair(unittest.TestCase): + """KeyValuePair unit test stubs""" + + def setUp(self): + pass + + def tearDown(self): + pass + + def make_instance(self, include_optional) -> KeyValuePair: + """Test KeyValuePair + include_optional is a boolean, when False only required + params are included, when True both required and + optional params are included""" + # uncomment below to create an instance of `KeyValuePair` + """ + model = KeyValuePair() + if include_optional: + return KeyValuePair( + key = None, + value = None + ) + else: + return KeyValuePair( + ) + """ + + def testKeyValuePair(self): + """Test KeyValuePair""" + # inst_req_only = self.make_instance(include_optional=False) + # inst_req_and_optional = self.make_instance(include_optional=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/impl/admin_api.py b/admin-api-lib/src/admin_api_lib/impl/admin_api.py index 9c24eba..4ecdd4c 100644 --- a/admin-api-lib/src/admin_api_lib/impl/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/impl/admin_api.py @@ -2,15 +2,18 @@ import logging + +from pydantic import StrictStr from dependency_injector.wiring import Provide, inject from fastapi import Depends, Request, Response, UploadFile -from admin_api_lib.api_endpoints.confluence_loader import ConfluenceLoader +from admin_api_lib.api_endpoints.file_uploader import FileUploader +from admin_api_lib.api_endpoints.source_uploader import SourceUploader +from admin_api_lib.models.key_value_pair import KeyValuePair from admin_api_lib.api_endpoints.document_deleter import DocumentDeleter from admin_api_lib.api_endpoints.document_reference_retriever import ( DocumentReferenceRetriever, ) -from admin_api_lib.api_endpoints.document_uploader import DocumentUploader from admin_api_lib.api_endpoints.documents_status_retriever import ( DocumentsStatusRetriever, ) @@ -85,24 +88,57 @@ async def get_all_documents_status( return await document_status_retriever.aget_all_documents_status() @inject - async def load_confluence_post( + async def upload_source( + self, + source_type: StrictStr, + name: StrictStr, + kwargs: list[KeyValuePair], + source_uploader: SourceUploader = Depends(Provide[DependencyContainer.source_uploader]), + ) -> None: + """ + Asynchronously uploads user-selected source documents. + + Parameters + ---------- + source_type : StrictStr + The type of the source document to be uploaded. + name : StrictStr + The name of the source document to be uploaded. + kwargs : list[KeyValuePair] + Additional parameters required for the extractor. + source_uploader : SourceUploader + An instance of SourceUploader to handle the upload process. + + Returns + ------- + None + """ + await source_uploader.upload_source(source_type, name, kwargs) + + @inject + async def upload_file( self, - confluence_loader: ConfluenceLoader = Depends(Provide[DependencyContainer.confluence_loader]), + file: UploadFile, + request: Request, + file_uploader: FileUploader = Depends(Provide[DependencyContainer.file_uploader]), ) -> None: """ - Asynchronously loads a Confluence space using the provided ConfluenceLoader. + Asynchronously uploads a file to the server. Parameters ---------- - confluence_loader : ConfluenceLoader - The ConfluenceLoader instance to use for loading the post. 
This is injected by dependency injection - (default is Depends(Provide[DependencyContainer.confluence_loader])). + file : UploadFile + The file object to be uploaded. + request : Request + The HTTP request object containing metadata about the upload request. + file_uploader : FileUploader, optional + An instance of FileUploader to handle the upload process. Returns ------- None """ - await confluence_loader.aload_from_confluence() + await file_uploader.upload_file(str(request.base_url), file) @inject async def document_reference_id_get( @@ -129,28 +165,3 @@ async def document_reference_id_get( The document in binary form. """ return await document_reference_retriever.adocument_reference_id_get(identification) - - @inject - async def upload_documents_post( - self, - body: UploadFile, - request: Request, - document_uploader: DocumentUploader = Depends(Provide[DependencyContainer.document_uploader]), - ) -> None: - """ - Handle the POST request to upload documents. - - Parameters - ---------- - body : UploadFile - The file to be uploaded. - request : Request - The request object containing metadata about the request. - document_uploader : DocumentUploader, optional - The document uploader dependency, by default provided by DependencyContainer. - - Returns - ------- - None - """ - await document_uploader.aupload_documents_post(body, request) diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_confluence_loader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_confluence_loader.py deleted file mode 100644 index 54fcfda..0000000 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_confluence_loader.py +++ /dev/null @@ -1,195 +0,0 @@ -"""Module for the DefaultConfluenceLoader class.""" - -import logging -from asyncio import run -from threading import Thread -import threading - -from fastapi import HTTPException, status -from langchain_core.documents import Document - -from admin_api_lib.api_endpoints.confluence_loader import ConfluenceLoader -from admin_api_lib.api_endpoints.document_deleter import DocumentDeleter -from admin_api_lib.chunker.chunker import Chunker -from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ( - ExtractorApi, -) -from admin_api_lib.impl.key_db.file_status_key_value_store import ( - FileStatusKeyValueStore, -) -from admin_api_lib.impl.mapper.confluence_settings_mapper import ( - ConfluenceSettingsMapper, -) -from admin_api_lib.impl.mapper.informationpiece2document import ( - InformationPiece2Document, -) -from admin_api_lib.impl.settings.confluence_settings import ConfluenceSettings -from admin_api_lib.information_enhancer.information_enhancer import InformationEnhancer -from admin_api_lib.models.status import Status -from admin_api_lib.rag_backend_client.openapi_client.api.rag_api import RagApi -from admin_api_lib.utils.utils import sanitize_document_name - -logger = logging.getLogger(__name__) - - -class DefaultConfluenceLoader(ConfluenceLoader): - """ - DefaultConfluenceLoader is responsible for loading content from Confluence asynchronously. - - Attributes - ---------- - CONFLUENCE_SPACE : str - The Confluence space key. 
- """ - - CONFLUENCE_SPACE = "confluence_space" - - def __init__( - self, - extractor_api: ExtractorApi, - settings: ConfluenceSettings, - information_mapper: InformationPiece2Document, - rag_api: RagApi, - key_value_store: FileStatusKeyValueStore, - information_enhancer: InformationEnhancer, - chunker: Chunker, - document_deleter: DocumentDeleter, - settings_mapper: ConfluenceSettingsMapper, - ): - """ - Initialize the DefaultConfluenceLoader with the provided dependencies. - - Parameters - ---------- - extractor_api : ExtractorApi - The API for extracting information. - settings : ConfluenceSettings - The settings for Confluence. - information_mapper : InformationPiece2Document - The mapper for information pieces to langchain documents. - rag_api : RagApi - The API client for interacting with the RAG backend system. - key_value_store : FileStatusKeyValueStore - The key-value store to store file names and the corresponding file statuses. - information_enhancer : InformationEnhancer - The enhancer for information pieces. - chunker : Chunker - The chunker for breaking down documents into chunks. - document_deleter : DocumentDeleter - The deleter for documents from S3 Storage and Vector Database. - settings_mapper : ConfluenceSettingsMapper - The mapper to map the Confluence settings to confluence parameters. - """ - self._extractor_api = extractor_api - self._rag_api = rag_api - self._settings = settings - self._key_value_store = key_value_store - self._information_mapper = information_mapper - self._information_enhancer = information_enhancer - self._chunker = chunker - self._document_deleter = document_deleter - self._settings_mapper = settings_mapper - self._background_thread = None - self._document_key = None - - async def aload_from_confluence(self) -> None: - """ - Asynchronously loads content from Confluence using the configured settings. - - Raises - ------ - HTTPException - If the Confluence loader is not configured or if a load is already in progress. - """ - for index in range(len(self._settings.url)): - if not ( - self._settings.url[index].strip() - and self._settings.space_key[index].strip() - and self._settings.token[index].strip() - ): - raise HTTPException( - status.HTTP_501_NOT_IMPLEMENTED, - "The confluence loader is not configured! Required fields are missing.", - ) - - if self._background_thread is not None and self._background_thread.is_alive(): - raise HTTPException( - status.HTTP_423_LOCKED, "Confluence loader is locked... Please wait for the current load to finish." 
- ) - self._background_thread = Thread(target=lambda: run(self._aload_from_confluence())) - self._background_thread.start() - - async def _aload_from_confluence(self) -> None: - async def process_confluence(index): - logger.info("Loading from Confluence %s", self._settings.url[index]) - self._sanitize_document_name(index=index) - - params = self._settings_mapper.map_settings_to_params(self._settings, index) - try: - self._key_value_store.upsert(self._settings.document_name[index], Status.PROCESSING) - information_pieces = self._extractor_api.extract_from_confluence_post(params) - documents = [ - self._information_mapper.extractor_information_piece2document(x) for x in information_pieces - ] - documents = await self._aenhance_langchain_documents(documents) - chunked_documents = self._chunker.chunk(documents) - rag_information_pieces = [ - self._information_mapper.document2rag_information_piece(doc) for doc in chunked_documents - ] - except Exception as e: - self._key_value_store.upsert(self._settings.document_name[index], Status.ERROR) - - logger.error("Error while loading from Confluence: %s", str(e)) - raise HTTPException( - status.HTTP_500_INTERNAL_SERVER_ERROR, f"Error loading from Confluence: {str(e)}" - ) from e - - await self._delete_previous_information_pieces(index=index) - self._key_value_store.upsert(self._settings.document_name[index], Status.UPLOADING) - self._upload_information_pieces(rag_information_pieces, index=index) - - threads = [] - for idx in range(len(self._settings.url)): - t = threading.Thread(target=lambda idx=idx: run(process_confluence(idx))) - threads.append(t) - t.start() - for t in threads: - t.join() - - async def _aenhance_langchain_documents(self, documents: list[Document]): - try: - return await self._information_enhancer.ainvoke(documents) - except Exception as e: - logger.error("Exception occured while enhancing confluence langchain document %s" % e) - raise e - - async def _delete_previous_information_pieces(self, index=0): - try: - await self._document_deleter.adelete_document(self._settings.document_name[index]) - except HTTPException as e: - logger.error( - ( - "Error while trying to delete documents with id: %s before uploading %s." - "NOTE: Still continuing with upload." 
- ), - self._settings.document_name[index], - e, - ) - - def _upload_information_pieces(self, rag_api_documents, index=0): - try: - self._rag_api.upload_information_piece(rag_api_documents) - self._key_value_store.upsert(self._settings.document_name[index], Status.READY) - logger.info("Confluence loaded successfully") - except Exception as e: - self._key_value_store.upsert(self._settings.document_name[index], Status.ERROR) - logger.error("Error while uploading Confluence to the database: %s", str(e)) - raise HTTPException(500, f"Error loading from Confluence: {str(e)}") from e - - def _sanitize_document_name(self, index) -> None: - document_name = ( - self._settings.document_name[index] if self._settings.document_name[index] else self._settings.url[index] - ) - document_name = document_name.replace("http://", "").replace("https://", "") - - self._settings.document_name[index] = sanitize_document_name(document_name) diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_document_deleter.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_document_deleter.py index 9f3c414..3cf671f 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_document_deleter.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_document_deleter.py @@ -41,7 +41,7 @@ def __init__(self, file_service: FileService, rag_api: RagApi, key_value_store: self._rag_api = rag_api self._key_value_store = key_value_store - async def adelete_document(self, identification: str) -> None: + async def adelete_document(self, identification: str, remove_from_key_value_store: bool = True) -> None: """ Asynchronously delete a document identified by the given identification string. @@ -55,6 +55,8 @@ async def adelete_document(self, identification: str) -> None: ---------- identification : str The unique identifier of the document to be deleted. + remove_from_key_value_store : bool, optional + If True, the document will also be removed from the key-value store (default is True). 
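The new `remove_from_key_value_store` flag exists so that a re-ingested source can replace its stored content while the status entry keeps reporting progress. A hedged sketch of the calling pattern the uploaders in this diff use (the injected objects are assumed to be wired up elsewhere):

```python
# Hedged sketch: replace a source's content without dropping its status entry.
from contextlib import suppress


async def replace_existing_source(document_deleter, rag_api, key_value_store, source_name, pieces):
    # Deleting the previous version is allowed to fail (e.g. on first upload).
    with suppress(Exception):
        await document_deleter.adelete_document(source_name, remove_from_key_value_store=False)
    rag_api.upload_information_piece(pieces)
    key_value_store.upsert(source_name, "READY")  # Status.READY in the real code
```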
Raises ------ @@ -66,7 +68,8 @@ async def adelete_document(self, identification: str) -> None: # Delete the document from file service and vector database logger.debug("Deleting existing document: %s", identification) try: - self._key_value_store.remove(identification) + if remove_from_key_value_store: + self._key_value_store.remove(identification) self._file_service.delete_file(identification) except Exception as e: error_messages += f"Error while deleting {identification} from file storage\n {str(e)}\n" diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_document_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_document_uploader.py deleted file mode 100644 index 549be19..0000000 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_document_uploader.py +++ /dev/null @@ -1,192 +0,0 @@ -"""Module for the DefaultDocumentUploader class.""" - -import logging -import tempfile -import traceback -import urllib -from asyncio import run -from pathlib import Path -from threading import Thread - -from fastapi import HTTPException, Request, UploadFile, status - -from admin_api_lib.api_endpoints.document_deleter import DocumentDeleter -from admin_api_lib.api_endpoints.document_uploader import DocumentUploader -from admin_api_lib.chunker.chunker import Chunker -from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ( - ExtractorApi, -) -from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ( - ExtractionRequest, -) -from admin_api_lib.file_services.file_service import FileService -from admin_api_lib.impl.key_db.file_status_key_value_store import ( - FileStatusKeyValueStore, -) -from admin_api_lib.impl.mapper.informationpiece2document import ( - InformationPiece2Document, -) -from admin_api_lib.information_enhancer.information_enhancer import InformationEnhancer -from admin_api_lib.models.status import Status -from admin_api_lib.rag_backend_client.openapi_client.api.rag_api import RagApi -from admin_api_lib.utils.utils import sanitize_document_name - -logger = logging.getLogger(__name__) - - -class DefaultDocumentUploader(DocumentUploader): - """DefaultDocumentUploader is responsible for handling the upload, processing, and storage of documents.""" - - def __init__( - self, - document_extractor: ExtractorApi, - file_service: FileService, - rag_api: RagApi, - information_enhancer: InformationEnhancer, - information_mapper: InformationPiece2Document, - chunker: Chunker, - key_value_store: FileStatusKeyValueStore, - document_deleter: DocumentDeleter, - ): - """ - Initialize the DefaultDocumentUploader. - - Parameters - ---------- - document_extractor : ExtractorApi - The API for extracting documents. - file_service : FileService - The service for handling file operations on the S3 storage - rag_api : RagApi - The API for RAG backend. - information_enhancer : InformationEnhancer - The service for enhancing information. - information_mapper : InformationPiece2Document - The mapper for converting information pieces to langchain documents. - chunker : Chunker - The service for chunking documents into chunks. - key_value_store : FileStatusKeyValueStore - The key-value store for storing filename and the corresponding status. - document_deleter : DocumentDeleter - The service for deleting documents. 
- """ - self._document_extractor = document_extractor - self._file_service = file_service - self._rag_api = rag_api - self._information_enhancer = information_enhancer - self._information_mapper = information_mapper - self._chunker = chunker - self._key_value_store = key_value_store - self._document_deleter = document_deleter - self._background_threads = [] - - async def aupload_documents_post( - self, - body: UploadFile, - request: Request, - ) -> None: - """ - Handle the uploading of documents via a POST request. - - This asynchronous method reads the content of the uploaded file and starts a background - thread to save the document in S3 storage and the vector database. It updates the status - of the document in the key-value store and handles any exceptions that may occur during - the process. - - Parameters - ---------- - body : UploadFile - The uploaded file. - request : Request - The request object. - - Raises - ------ - HTTPException - If there is a ValueError, raises a 400 Bad Request error. - HTTPException - If there is any other exception, raises a 500 Internal Server Error. - """ - self._background_threads = [t for t in self._background_threads if t.is_alive()] - content = await body.read() - body.filename = sanitize_document_name(body.filename) - try: - self._key_value_store.upsert(body.filename, Status.UPLOADING) - thread = Thread(target=lambda: run(self._asave_new_document(content, body.filename, request))) - thread.start() - self._background_threads.append(thread) - except ValueError as e: - self._key_value_store.upsert(body.filename, Status.ERROR) - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(e)) - except Exception as e: - self._key_value_store.upsert(body.filename, Status.ERROR) - raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e)) - - async def _asave_new_document( - self, - file_content: bytes, - filename: str, - request: Request, - ): - try: - await self._document_deleter.adelete_document(filename) - except HTTPException as e: - logger.error( - "Error while trying to delete file %s before uploading %s. 
Still continuing with upload.", filename, e - ) - self._key_value_store.upsert(filename, Status.ERROR) - - try: - with tempfile.TemporaryDirectory() as temp_dir: - temp_file_path = Path(temp_dir) / filename - with open(temp_file_path, "wb") as temp_file: - logger.debug("Temporary file created at %s.", temp_file_path) - temp_file.write(file_content) - logger.debug("Temp file created and content written.") - - await self._aparse_document(Path(temp_file_path), request) - except Exception as e: - logger.error("Error during document parsing: %s %s", e, traceback.format_exc()) - self._key_value_store.upsert(filename, Status.ERROR) - - async def _aparse_document( - self, - s3_file_path: Path, - request: Request, - ): - logger.debug("START parsing of the document %s", s3_file_path) - filename = s3_file_path.name - - self._file_service.upload_file(s3_file_path, filename) - self._key_value_store.upsert(filename, Status.PROCESSING) - - information_pieces = self._document_extractor.extract_from_file_post(ExtractionRequest(path_on_s3=filename)) - if not information_pieces: - self._key_value_store.upsert(filename, Status.ERROR) - logger.error("No information pieces found in the document: %s", filename) - raise HTTPException(status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, detail="No information pieces found") - documents = [self._information_mapper.extractor_information_piece2document(x) for x in information_pieces] - host_base_url = str(request.base_url) - document_url = f"{host_base_url.rstrip('/')}/document_reference/{urllib.parse.quote_plus(filename)}" - - chunked_documents = self._chunker.chunk(documents) - - for idx, chunk in enumerate(chunked_documents): - if chunk.metadata["id"] in chunk.metadata["related"]: - chunk.metadata["related"].remove(chunk.metadata["id"]) - chunk.metadata.update( - { - "chunk": idx, - "chunk_length": len(chunk.page_content), - "document_url": document_url, - } - ) - - enhanced_documents = await self._information_enhancer.ainvoke(chunked_documents) - rag_information_pieces = [ - self._information_mapper.document2rag_information_piece(doc) for doc in enhanced_documents - ] - - self._rag_api.upload_information_piece(rag_information_pieces) - self._key_value_store.upsert(filename, Status.READY) - logger.info("File uploaded successfully: %s", filename) diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py new file mode 100644 index 0000000..fa4a27a --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py @@ -0,0 +1,210 @@ +import logging +from pathlib import Path +import traceback +from threading import Thread +import urllib +import tempfile +from contextlib import suppress + +from fastapi import UploadFile, status, HTTPException +from langchain_core.documents import Document +from asyncio import run + +from admin_api_lib.file_services.file_service import FileService +from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ExtractionRequest +from admin_api_lib.api_endpoints.file_uploader import FileUploader +from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi +from admin_api_lib.rag_backend_client.openapi_client.api.rag_api import RagApi +from admin_api_lib.impl.mapper.informationpiece2document import InformationPiece2Document +from admin_api_lib.api_endpoints.document_deleter import DocumentDeleter +from admin_api_lib.chunker.chunker import Chunker 
+from admin_api_lib.models.status import Status +from admin_api_lib.impl.key_db.file_status_key_value_store import FileStatusKeyValueStore +from admin_api_lib.information_enhancer.information_enhancer import InformationEnhancer +from admin_api_lib.utils.utils import sanitize_document_name + +logger = logging.getLogger(__name__) + + +class DefaultFileUploader(FileUploader): + """The DefaultFileUploader is responsible for adding a new source file document to the available content.""" + + def __init__( + self, + extractor_api: ExtractorApi, + key_value_store: FileStatusKeyValueStore, + information_enhancer: InformationEnhancer, + chunker: Chunker, + document_deleter: DocumentDeleter, + rag_api: RagApi, + information_mapper: InformationPiece2Document, + file_service: FileService, + ): + """ + Initialize the DefaultFileUploader. + + Parameters + ---------- + extractor_api : ExtractorApi + Client for the Extraction service. + key_value_store : FileStatusKeyValueStore + The key-value store for storing filename and the corresponding status. + information_enhancer : InformationEnhancer + The service for enhancing information. + chunker : Chunker + The service for chunking documents into chunks. + document_deleter : DocumentDeleter + The service for deleting documents. + rag_api : RagApi + The API for RAG backend. + information_mapper : InformationPiece2Document + The mapper for converting information pieces to langchain documents. + file_service : FileService + The service for handling file operations on the S3 storage. + """ + super().__init__() + self._extractor_api = extractor_api + self._rag_api = rag_api + self._key_value_store = key_value_store + self._information_mapper = information_mapper + self._information_enhancer = information_enhancer + self._chunker = chunker + self._document_deleter = document_deleter + self._background_threads = [] + self._file_service = file_service + + async def upload_file( + self, + base_url: str, + file: UploadFile, + ) -> None: + """ + Uploads a source file for content extraction. + + Parameters + ---------- + base_url : str + The base URL of the service, used to determine the download link of the file. + file : UploadFile + The file to process. + + Returns + ------- + None + """ + self._prune_background_threads() + + file.filename = sanitize_document_name(file.filename) + source_name = f"file:{file.filename}" + try: + self._check_if_already_in_processing(source_name) + self._key_value_store.upsert(source_name, Status.PROCESSING) + content = await file.read() + s3_path = await self._asave_new_document(content, file.filename, source_name) + thread = Thread( + target=lambda: run(self._handle_source_upload(s3_path, source_name, file.filename, base_url)) + ) # TODO: add a timeout; reusing the timeout logic from default_source_uploader led to strange behavior here + thread.start() + self._background_threads.append(thread) + except ValueError as e: + self._key_value_store.upsert(source_name, Status.ERROR) + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(e)) + except Exception as e: + self._key_value_store.upsert(source_name, Status.ERROR) + logger.error("Error while uploading %s = %s", source_name, str(e)) + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e)) + + def _check_if_already_in_processing(self, source_name: str) -> None: + """ + Checks if the source is already in processing state. + + Parameters + ---------- + source_name : str + The name of the source.
+ + Returns + ------- + None + + Raises + ------ + ValueError + If the source is already in processing state. + """ + existing = [s for name, s in self._key_value_store.get_all() if name == source_name] + if any(s == Status.PROCESSING for s in existing): + raise ValueError(f"Document {source_name} is already in processing state") + + async def _handle_source_upload( + self, + s3_path: Path, + source_name: str, + file_name: str, + base_url: str, + ): + try: + information_pieces = self._extractor_api.extract_from_file_post( + ExtractionRequest(path_on_s3=str(s3_path), document_name=source_name) + ) + + if not information_pieces: + self._key_value_store.upsert(source_name, Status.ERROR) + logger.error("No information pieces found in the document: %s", source_name) + raise Exception("No information pieces found") + documents: list[Document] = [] + for piece in information_pieces: + documents.append(self._information_mapper.extractor_information_piece2document(piece)) + + chunked_documents = self._chunker.chunk(documents) + + enhanced_documents = await self._information_enhancer.ainvoke(chunked_documents) + self._add_file_url(file_name, base_url, enhanced_documents) + + rag_information_pieces = [ + self._information_mapper.document2rag_information_piece(doc) for doc in enhanced_documents + ] + # Replace old document + # deletion is allowed to fail + with suppress(Exception): + await self._document_deleter.adelete_document(source_name, remove_from_key_value_store=False) + + self._rag_api.upload_information_piece(rag_information_pieces) + self._key_value_store.upsert(source_name, Status.READY) + logger.info("Source uploaded successfully: %s", source_name) + except Exception as e: + self._key_value_store.upsert(source_name, Status.ERROR) + logger.error("Error while uploading %s = %s", source_name, str(e)) + + def _add_file_url(self, file_name: str, base_url: str, chunked_documents: list[Document]): + document_url = f"{base_url.rstrip('/')}/document_reference/{urllib.parse.quote_plus(file_name)}" + for idx, chunk in enumerate(chunked_documents): + if chunk.metadata["id"] in chunk.metadata["related"]: + chunk.metadata["related"].remove(chunk.metadata["id"]) + chunk.metadata.update( + { + "chunk": idx, + "chunk_length": len(chunk.page_content), + "document_url": document_url, + } + ) + + async def _asave_new_document( + self, + file_content: bytes, + filename: str, + source_name: str, + ) -> Path: + try: + with tempfile.TemporaryDirectory() as temp_dir: + temp_file_path = Path(temp_dir) / filename + with open(temp_file_path, "wb") as temp_file: + logger.debug("Temporary file created at %s.", temp_file_path) + temp_file.write(file_content) + logger.debug("Temp file created and content written.") + + self._file_service.upload_file(Path(temp_file_path), filename) + return filename + except Exception as e: + logger.error("Error during document saving: %s %s", e, traceback.format_exc()) + self._key_value_store.upsert(source_name, Status.ERROR) diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py new file mode 100644 index 0000000..bc891b7 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py @@ -0,0 +1,198 @@ +import logging +import asyncio +from threading import Thread +from contextlib import suppress + +from pydantic import StrictStr +from fastapi import status, HTTPException +from langchain_core.documents import Document + +from 
admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi +from admin_api_lib.extractor_api_client.openapi_client.models.extraction_parameters import ExtractionParameters +from admin_api_lib.impl.settings.source_uploader_settings import SourceUploaderSettings +from admin_api_lib.models.key_value_pair import KeyValuePair +from admin_api_lib.rag_backend_client.openapi_client.api.rag_api import RagApi +from admin_api_lib.impl.mapper.informationpiece2document import InformationPiece2Document +from admin_api_lib.api_endpoints.document_deleter import DocumentDeleter +from admin_api_lib.api_endpoints.source_uploader import SourceUploader +from admin_api_lib.chunker.chunker import Chunker +from admin_api_lib.models.status import Status +from admin_api_lib.impl.key_db.file_status_key_value_store import FileStatusKeyValueStore +from admin_api_lib.information_enhancer.information_enhancer import InformationEnhancer +from admin_api_lib.utils.utils import sanitize_document_name +from admin_api_lib.rag_backend_client.openapi_client.models.information_piece import ( + InformationPiece as RagInformationPiece, +) + +logger = logging.getLogger(__name__) + + +class DefaultSourceUploader(SourceUploader): + """Adds non-file sources to the available content by delegating extraction to the extractor service.""" + + def __init__( + self, + extractor_api: ExtractorApi, + key_value_store: FileStatusKeyValueStore, + information_enhancer: InformationEnhancer, + chunker: Chunker, + document_deleter: DocumentDeleter, + rag_api: RagApi, + information_mapper: InformationPiece2Document, + settings: SourceUploaderSettings, + ): + """ + Initialize the DefaultSourceUploader. + + Parameters + ---------- + extractor_api : ExtractorApi + Client for the Extraction service. + key_value_store : FileStatusKeyValueStore + The key-value store for storing filename and the corresponding status. + information_enhancer : InformationEnhancer + The service for enhancing information. + chunker : Chunker + The service for chunking documents into chunks. + document_deleter : DocumentDeleter + The service for deleting documents. + rag_api : RagApi + The API for RAG backend. + information_mapper : InformationPiece2Document + The mapper for converting information pieces to langchain documents. + settings : SourceUploaderSettings + Settings providing the timeout for source uploads. + """ + super().__init__() + self._extractor_api = extractor_api + self._rag_api = rag_api + self._key_value_store = key_value_store + self._information_mapper = information_mapper + self._information_enhancer = information_enhancer + self._chunker = chunker + self._document_deleter = document_deleter + self._background_threads = [] + self._settings = settings + + async def upload_source( + self, + source_type: StrictStr, + name: StrictStr, + kwargs: list[KeyValuePair], + ) -> None: + """ + Uploads the parameters for source content extraction. + + The extraction runs in a background thread that is aborted after the timeout configured in SourceUploaderSettings (default 3600.0 seconds, i.e. 1 hour). + + Parameters + ---------- + source_type : str + The type of the source, used by the extractor service to determine the correct extraction method. + name : str + Display name of the source. + kwargs : list[KeyValuePair] + List of KeyValuePair with parameters used for the extraction.
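The `_thread_worker` defined below runs each upload on a private event loop guarded by `asyncio.wait_for`. A self-contained sketch of that timeout pattern, with illustrative names only:

```python
# Standalone sketch of the per-thread timeout guard; names are illustrative.
import asyncio
from threading import Thread


def run_with_timeout(make_coro, timeout: float) -> Thread:
    def worker():
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            loop.run_until_complete(asyncio.wait_for(make_coro(), timeout=timeout))
        except asyncio.TimeoutError:
            print("upload timed out")  # the real worker marks the source as Status.ERROR
        finally:
            loop.close()

    thread = Thread(target=worker)
    thread.start()
    return thread


async def slow_upload():
    await asyncio.sleep(2)  # stands in for _handle_source_upload


run_with_timeout(slow_upload, timeout=0.5).join()  # prints "upload timed out"
```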
+    def _thread_worker(self, source_name, source_type, kwargs, timeout):
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+        try:
+            loop.run_until_complete(
+                asyncio.wait_for(
+                    self._handle_source_upload(source_name=source_name, source_type=source_type, kwargs=kwargs),
+                    timeout=timeout,
+                )
+            )
+        except asyncio.TimeoutError:
+            logger.error("Upload of %s timed out after %s seconds", source_name, timeout)
+            self._key_value_store.upsert(source_name, Status.ERROR)
+        except Exception:
+            logger.error("Error while uploading %s", source_name, exc_info=True)
+            self._key_value_store.upsert(source_name, Status.ERROR)
+        finally:
+            loop.close()
+
+    async def _handle_source_upload(
+        self,
+        source_name: str,
+        source_type: StrictStr,
+        kwargs: list[KeyValuePair],
+    ):
+        try:
+            information_pieces = self._extractor_api.extract_from_source(
+                ExtractionParameters(
+                    source_type=source_type, document_name=source_name, kwargs=[x.to_dict() for x in kwargs]
+                )
+            )
+
+            if not information_pieces:
+                self._key_value_store.upsert(source_name, Status.ERROR)
+                logger.error("No information pieces found in the document: %s", source_name)
+                raise Exception("No information pieces found")
+            documents: list[Document] = []
+            for piece in information_pieces:
+                documents.append(self._information_mapper.extractor_information_piece2document(piece))
+
+            chunked_documents = self._chunker.chunk(documents)
+
+            # limit concurrency to avoid spawning multiple threads per call
+            enhanced_documents = await self._information_enhancer.ainvoke(
+                chunked_documents, config={"max_concurrency": 1}
+            )
+
+            rag_information_pieces: list[RagInformationPiece] = []
+            for doc in enhanced_documents:
+                rag_information_pieces.append(self._information_mapper.document2rag_information_piece(doc))
+
+            # Replace the old version of the document; deletion is allowed to fail
+            with suppress(Exception):
+                await self._document_deleter.adelete_document(source_name, remove_from_key_value_store=False)
+
+            self._rag_api.upload_information_piece(rag_information_pieces)
+            self._key_value_store.upsert(source_name, Status.READY)
+            logger.info("Source uploaded successfully: %s", source_name)
+        except Exception as e:
+            self._key_value_store.upsert(source_name, Status.ERROR)
+            
logger.error("Error while uploading %s = %s", source_name, str(e)) diff --git a/admin-api-lib/src/admin_api_lib/impl/mapper/confluence_settings_mapper.py b/admin-api-lib/src/admin_api_lib/impl/mapper/confluence_settings_mapper.py deleted file mode 100644 index 552535f..0000000 --- a/admin-api-lib/src/admin_api_lib/impl/mapper/confluence_settings_mapper.py +++ /dev/null @@ -1,36 +0,0 @@ -"""Module for the ConfluenceSettingsMapper class.""" - -from admin_api_lib.extractor_api_client.openapi_client.models.confluence_parameters import ( - ConfluenceParameters, -) -from admin_api_lib.impl.settings.confluence_settings import ConfluenceSettings - - -class ConfluenceSettingsMapper: - """Mapper class for converting ConfluenceSettings to ConfluenceParameters.""" - - @staticmethod - def map_settings_to_params(settings: ConfluenceSettings, index) -> ConfluenceParameters: - """ - Map ConfluenceSettings to ConfluenceParameters. - - Parameters - ---------- - settings : ConfluenceSettings - The settings object containing Confluence configuration. - - Returns - ------- - ConfluenceParameters - The parameters object for API consumption. - """ - return ConfluenceParameters( - url=settings.url[index], - token=settings.token[index], - space_key=settings.space_key[index], - include_attachments=settings.include_attachments[index], - keep_markdown_format=settings.keep_markdown_format[index], - keep_newlines=settings.keep_newlines[index], - document_name=settings.document_name[index], - confluence_kwargs=[{"key": "verify_ssl", "value": settings.verify_ssl[index]}], - ) diff --git a/admin-api-lib/src/admin_api_lib/impl/settings/confluence_settings.py b/admin-api-lib/src/admin_api_lib/impl/settings/confluence_settings.py deleted file mode 100644 index acf77fc..0000000 --- a/admin-api-lib/src/admin_api_lib/impl/settings/confluence_settings.py +++ /dev/null @@ -1,170 +0,0 @@ -"""Contains settings regarding the confluence.""" - -from typing import Optional -from admin_api_lib.impl.utils.comma_separated_bool_list import CommaSeparatedBoolList -from admin_api_lib.impl.utils.comma_separated_str_list import CommaSeparatedStrList -from pydantic import Field, model_validator -from pydantic_settings import BaseSettings -import logging - -logger = logging.getLogger(__name__) - - -class ConfluenceSettings(BaseSettings): - """ - Contains configuration settings for the Confluence integration. - - Parameters - ---------- - url : CommaSeparatedStrList, optional - List of Confluence URLs. - token : CommaSeparatedStrList, optional - List of authentication tokens. - space_key : CommaSeparatedStrList, optional - List of Confluence space keys. - document_name : CommaSeparatedStrList, optional - List of document names. - verify_ssl : CommaSeparatedBoolList, optional - List of booleans indicating whether SSL verification is enabled. - include_attachments : CommaSeparatedBoolList, optional - Indicates whether to include attachments in the integration. - keep_markdown_format : CommaSeparatedBoolList, optional - Determines if markdown formatting is maintained. - keep_newlines : CommaSeparatedBoolList, optional - Indicates whether newlines are preserved. 
- """ - - class Config: - """Config class for reading Fields from env.""" - - env_prefix = "CONFLUENCE_" - case_sensitive = False - - url: Optional[CommaSeparatedStrList] = Field(default_factory=CommaSeparatedStrList) - token: Optional[CommaSeparatedStrList] = Field(default_factory=CommaSeparatedStrList) - space_key: Optional[CommaSeparatedStrList] = Field(default_factory=CommaSeparatedStrList) - document_name: Optional[CommaSeparatedStrList] = Field(default_factory=CommaSeparatedStrList) - verify_ssl: Optional[CommaSeparatedBoolList] = Field(default_factory=CommaSeparatedBoolList) - include_attachments: Optional[CommaSeparatedBoolList] = Field(default_factory=CommaSeparatedBoolList) - keep_markdown_format: Optional[CommaSeparatedBoolList] = Field(default_factory=CommaSeparatedBoolList) - keep_newlines: Optional[CommaSeparatedBoolList] = Field(default_factory=CommaSeparatedBoolList) - - @model_validator(mode="after") - def check_lists_length_consistency(cls, values): - """ - Validate that all list-valued settings have the same length. - - If not, the list is adjusted accordingly. - - Parameters - ---------- - values : dict - Dictionary of configuration settings. - - Returns - ------- - dict - The validated values dictionary with consistent list lengths. - - Raises - ------ - ValueError - If any non-optional list has a different length compared to others. - """ - # Define the keys to check - keys = [ - "url", - "token", - "space_key", - "document_name", - "verify_ssl", - "include_attachments", - "keep_markdown_format", - "keep_newlines", - ] - - lengths = {} - for key in keys: - value = getattr(values, key, None) - if value is not None: - lengths[key] = len(value) - # If there is more than one list with values, ensure they have the same length - optional_keys = ["document_name", "verify_ssl", "include_attachments", "keep_markdown_format", "keep_newlines"] - if lengths: - # Use the first encountered length as reference - ref_length = next(iter(lengths.values())) - for key, length in lengths.items(): - if length != ref_length and key not in optional_keys: - raise ValueError( - f"Confluence Settings length mismatch: Expected all lists to have {ref_length} elements, " - f"but '{key}' has {length} elements. 
{lengths}" - ) - - urls = getattr(values, "url", None) - if urls and len(urls) > 0: - n = len(urls) - try: - document_name = getattr(values, "document_name", None) - if not document_name or len(document_name) == 0: - values.document_name = CommaSeparatedStrList([""] * n) - elif len(document_name) != n: - raise ValueError("document_name list length mismatch") - except ValueError as e: - logger.error(f"Error setting document_name: {e}") - logger.warning("Setting document_name to default values") - document_name = getattr(values, "document_name", []) - values.document_name = CommaSeparatedStrList(document_name + [""] * (n - len(document_name))) - - try: - verify_ssl = getattr(values, "verify_ssl", None) - if not verify_ssl or len(verify_ssl) == 0: - values.verify_ssl = CommaSeparatedBoolList([True] * n) - elif len(verify_ssl) != n: - raise ValueError("verify_ssl list length mismatch") - except ValueError as e: - logger.error(f"Error setting verify_ssl: {e}") - logger.warning("Setting verify_ssl to default values") - verify_ssl = getattr(values, "verify_ssl", []) - values.verify_ssl = CommaSeparatedBoolList(verify_ssl + [True] * (n - len(verify_ssl))) - - try: - include_attachments = getattr(values, "include_attachments", None) - if not include_attachments or len(include_attachments) == 0: - values.include_attachments = CommaSeparatedBoolList([False] * n) - elif len(include_attachments) != n: - raise ValueError("include_attachments list length mismatch") - except ValueError as e: - logger.error(f"Error setting include_attachments: {e}") - logger.warning("Setting include_attachments to default values") - include_attachments = getattr(values, "include_attachments", []) - values.include_attachments = CommaSeparatedBoolList( - include_attachments + [False] * (n - len(include_attachments)) - ) - - try: - keep_markdown_format = getattr(values, "keep_markdown_format", None) - if not keep_markdown_format or len(keep_markdown_format) == 0: - values.keep_markdown_format = CommaSeparatedBoolList([True] * n) - elif len(keep_markdown_format) != n: - raise ValueError("keep_markdown_format list length mismatch") - except ValueError as e: - logger.error(f"Error setting keep_markdown_format: {e}") - logger.warning("Setting keep_markdown_format to default values") - keep_markdown_format = getattr(values, "keep_markdown_format", []) - values.keep_markdown_format = CommaSeparatedBoolList( - keep_markdown_format + [True] * (n - len(keep_markdown_format)) - ) - - try: - keep_newlines = getattr(values, "keep_newlines", None) - if not keep_newlines or len(keep_newlines) == 0: - values.keep_newlines = CommaSeparatedBoolList([True] * n) - elif len(keep_newlines) != n: - raise ValueError("keep_newlines list length mismatch") - except ValueError as e: - logger.error(f"Error setting keep_newlines: {e}") - logger.warning("Setting keep_newlines to default values") - keep_newlines = getattr(values, "keep_newlines", []) - values.keep_newlines = CommaSeparatedBoolList(keep_newlines + [True] * (n - len(keep_newlines))) - - return values diff --git a/admin-api-lib/src/admin_api_lib/impl/settings/source_uploader_settings.py b/admin-api-lib/src/admin_api_lib/impl/settings/source_uploader_settings.py new file mode 100644 index 0000000..70f18bd --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/impl/settings/source_uploader_settings.py @@ -0,0 +1,23 @@ +"""Contains settings regarding the SourceUploader.""" + +from pydantic import Field +from pydantic_settings import BaseSettings + + +class SourceUploaderSettings(BaseSettings): + 
""" + Contains settings regarding the SourceUploader. + + Attributes + ---------- + timeout : float + The timeout for the SourceUploader. + """ + + class Config: + """Config class for reading Fields from env.""" + + env_prefix = "SOURCE_UPLOADER_" + case_sensitive = False + + timeout: float = Field(default=3600.0, description="Timeout for the SourceUploader in seconds.") diff --git a/admin-api-lib/src/admin_api_lib/impl/utils/comma_separated_bool_list.py b/admin-api-lib/src/admin_api_lib/impl/utils/comma_separated_bool_list.py deleted file mode 100644 index df23553..0000000 --- a/admin-api-lib/src/admin_api_lib/impl/utils/comma_separated_bool_list.py +++ /dev/null @@ -1,65 +0,0 @@ -"""Utility module to handle comma separated string input that represents boolean values.""" - -from typing import Any - - -class CommaSeparatedBoolList(list): - """ - A subclass of list that converts comma-separated strings or lists into a list of booleans. - - Notes - ----- - - For string inputs, splits the string by commas and converts recognized true values ("true", "1", "yes") to True. - - An empty or whitespace-only string returns an empty list. - - For list inputs, each element is converted to a boolean. - """ - - @classmethod - def validate(cls, v: Any, info) -> list[bool]: - """ - Validate and convert the input into a list of booleans. - - Parameters - ---------- - v : Any - Input value, either a comma separated string or a list. - info : Any - Additional context information (unused). - - Returns - ------- - list of bool - List of booleans parsed from the input. An empty string returns an empty list. - - Raises - ------ - ValueError - If v is not a string or list. - """ - - def str_to_bool(s: str) -> bool: - return s.lower() in ("true", "1", "yes") - - if isinstance(v, str): - if v.strip() == "": - return [] - return [str_to_bool(item.strip()) for item in v.split(",") if item.strip()] - elif isinstance(v, list): - return [bool(item) for item in v] - raise ValueError("Not a valid comma separated boolean list") - - @classmethod - def __get_validators__(cls): - """ - Get validator functions for Pydantic to use with this data type. - - This method is called by Pydantic during model initialization to collect - validator functions for fields using this custom data type. - - Returns - ------- - generator - A generator yielding validator functions, specifically `cls.validate`, - which will be applied to validate and convert input values. - """ - yield cls.validate diff --git a/admin-api-lib/src/admin_api_lib/impl/utils/comma_separated_str_list.py b/admin-api-lib/src/admin_api_lib/impl/utils/comma_separated_str_list.py deleted file mode 100644 index 7b3a2a9..0000000 --- a/admin-api-lib/src/admin_api_lib/impl/utils/comma_separated_str_list.py +++ /dev/null @@ -1,74 +0,0 @@ -""" -Comma Separated String List Utility Module. - -This module provides a custom list type to validate and convert inputs into -a list of strings. It splits comma separated strings and converts list elements -to strings. - -Raises ------- -ValueError - If the provided input is neither a string nor a list. -""" - -from typing import Any - - -class CommaSeparatedStrList(list): - """ - Custom list type that validates comma separated strings. - - - If input is a string: splits by commas and strips whitespace. - - If input is a list: converts all elements to strings. - - Raises - ------ - ValueError - For invalid input type. - """ - - @classmethod - def validate(cls, v: Any, info) -> list[str]: - """ - Convert input to a validated list of strings. 
- - Parameters - ---------- - v : Any - A comma-separated string or a list containing items to be converted. - info : Any - Additional contextual information (not used in current implementation). - - Returns - ------- - list of str - A list of trimmed strings. Returns an empty list for an empty or whitespace-only string. - - Raises - ------ - ValueError - If the input v is neither a string nor a list. - """ - if isinstance(v, str): - if v.strip() == "": - return [] - return [item.strip() for item in v.split(",") if item.strip()] - elif isinstance(v, list): - return [str(item) for item in v] - raise ValueError("Not a valid comma separated string list") - - @classmethod - def __get_validators__(cls): - """ - Get validator functions for Pydantic to use with this data type. - - This method is called by Pydantic during model initialization to collect - validator functions for fields using this custom data type. - - Returns - ------- - generator - A generator yielding validator functions, specifically `cls.validate`, - which will be applied to validate and convert input values. - """ - yield cls.validate diff --git a/admin-api-lib/src/admin_api_lib/models/document_status.py b/admin-api-lib/src/admin_api_lib/models/document_status.py index e379f85..89b09d8 100644 --- a/admin-api-lib/src/admin_api_lib/models/document_status.py +++ b/admin-api-lib/src/admin_api_lib/models/document_status.py @@ -13,14 +13,13 @@ from __future__ import annotations - -import json import pprint import re # noqa: F401 -from typing import Any, ClassVar, Dict, List +import json -from pydantic import BaseModel, ConfigDict, StrictStr +from pydantic import BaseModel, ConfigDict, StrictStr +from typing import Any, ClassVar, Dict, List from admin_api_lib.models.status import Status try: @@ -30,7 +29,9 @@ class DocumentStatus(BaseModel): - """ """ # noqa: E501 + """ + DocumentStatus + """ # noqa: E501 name: StrictStr status: Status diff --git a/admin-api-lib/src/admin_api_lib/models/http_validation_error.py b/admin-api-lib/src/admin_api_lib/models/http_validation_error.py new file mode 100644 index 0000000..7d5feeb --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/models/http_validation_error.py @@ -0,0 +1,105 @@ +# coding: utf-8 + +""" +admin-api-lib + +The API is used for the communication between the admin frontend and the admin backend in the rag project. + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. 
+""" # noqa: E501 + + +from __future__ import annotations +import pprint +import re # noqa: F401 +import json + + +from pydantic import BaseModel, ConfigDict +from typing import Any, ClassVar, Dict, List, Optional +from admin_api_lib.models.validation_error import ValidationError + +try: + from typing import Self +except ImportError: + from typing_extensions import Self + + +class HTTPValidationError(BaseModel): + """ + HTTPValidationError + """ # noqa: E501 + + detail: Optional[List[ValidationError]] = None + __properties: ClassVar[List[str]] = ["detail"] + + model_config = { + "populate_by_name": True, + "validate_assignment": True, + "protected_namespaces": (), + } + + def to_str(self) -> str: + """Returns the string representation of the model using alias""" + return pprint.pformat(self.model_dump(by_alias=True)) + + def to_json(self) -> str: + """Returns the JSON representation of the model using alias""" + return self.model_dump_json(by_alias=True, exclude_unset=True) + + @classmethod + def from_json(cls, json_str: str) -> Self: + """Create an instance of HTTPValidationError from a JSON string""" + return cls.from_dict(json.loads(json_str)) + + def to_dict(self) -> Dict[str, Any]: + """Return the dictionary representation of the model using alias. + + This has the following differences from calling pydantic's + `self.model_dump(by_alias=True)`: + + * `None` is only added to the output dict for nullable fields that + were set at model initialization. Other fields with value `None` + are ignored. + """ + _dict = self.model_dump( + by_alias=True, + exclude={}, + exclude_none=True, + ) + # override the default output from pydantic by calling `to_dict()` of each item in detail (list) + _items = [] + if self.detail: + for _item in self.detail: + if _item: + _items.append(_item.to_dict()) + _dict["detail"] = _items + # set to None if detail (nullable) is None + # and model_fields_set contains the field + if self.detail is None and "detail" in self.model_fields_set: + _dict["detail"] = None + + return _dict + + @classmethod + def from_dict(cls, obj: Dict) -> Self: + """Create an instance of HTTPValidationError from a dict""" + if obj is None: + return None + + if not isinstance(obj, dict): + return cls.model_validate(obj) + + _obj = cls.model_validate( + { + "detail": ( + [ValidationError.from_dict(_item) for _item in obj.get("detail")] + if obj.get("detail") is not None + else None + ) + } + ) + return _obj diff --git a/admin-api-lib/src/admin_api_lib/models/key_value_pair.py b/admin-api-lib/src/admin_api_lib/models/key_value_pair.py new file mode 100644 index 0000000..3d46e01 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/models/key_value_pair.py @@ -0,0 +1,85 @@ +# coding: utf-8 + +""" +admin-api-lib + +The API is used for the communication between the admin frontend and the admin backend in the rag project. + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. 
+""" # noqa: E501 + + +from __future__ import annotations +import pprint +import re # noqa: F401 +import json + + +from pydantic import BaseModel, ConfigDict, StrictStr +from typing import Any, ClassVar, Dict, List + +try: + from typing import Self +except ImportError: + from typing_extensions import Self + + +class KeyValuePair(BaseModel): + """ + KeyValuePair + """ # noqa: E501 + + key: StrictStr + value: StrictStr + __properties: ClassVar[List[str]] = ["key", "value"] + + model_config = { + "populate_by_name": True, + "validate_assignment": True, + "protected_namespaces": (), + } + + def to_str(self) -> str: + """Returns the string representation of the model using alias""" + return pprint.pformat(self.model_dump(by_alias=True)) + + def to_json(self) -> str: + """Returns the JSON representation of the model using alias""" + return self.model_dump_json(by_alias=True, exclude_unset=True) + + @classmethod + def from_json(cls, json_str: str) -> Self: + """Create an instance of KeyValuePair from a JSON string""" + return cls.from_dict(json.loads(json_str)) + + def to_dict(self) -> Dict[str, Any]: + """Return the dictionary representation of the model using alias. + + This has the following differences from calling pydantic's + `self.model_dump(by_alias=True)`: + + * `None` is only added to the output dict for nullable fields that + were set at model initialization. Other fields with value `None` + are ignored. + """ + _dict = self.model_dump( + by_alias=True, + exclude={}, + exclude_none=True, + ) + return _dict + + @classmethod + def from_dict(cls, obj: Dict) -> Self: + """Create an instance of KeyValuePair from a dict""" + if obj is None: + return None + + if not isinstance(obj, dict): + return cls.model_validate(obj) + + _obj = cls.model_validate({"key": obj.get("key"), "value": obj.get("value")}) + return _obj diff --git a/admin-api-lib/src/admin_api_lib/models/status.py b/admin-api-lib/src/admin_api_lib/models/status.py index 33f8f58..3b24b73 100644 --- a/admin-api-lib/src/admin_api_lib/models/status.py +++ b/admin-api-lib/src/admin_api_lib/models/status.py @@ -13,12 +13,12 @@ from __future__ import annotations - import json import pprint import re # noqa: F401 from enum import Enum + try: from typing import Self except ImportError: @@ -26,7 +26,9 @@ class Status(str, Enum): - """ """ + """ + allowed enum values + """ """ allowed enum values diff --git a/admin-api-lib/src/admin_api_lib/models/validation_error.py b/admin-api-lib/src/admin_api_lib/models/validation_error.py new file mode 100644 index 0000000..ac389ab --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/models/validation_error.py @@ -0,0 +1,104 @@ +# coding: utf-8 + +""" +admin-api-lib + +The API is used for the communication between the admin frontend and the admin backend in the rag project. + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. 
+""" # noqa: E501 + + +from __future__ import annotations +import pprint +import re # noqa: F401 +import json + + +from pydantic import BaseModel, ConfigDict, StrictStr +from typing import Any, ClassVar, Dict, List +from admin_api_lib.models.validation_error_loc_inner import ValidationErrorLocInner + +try: + from typing import Self +except ImportError: + from typing_extensions import Self + + +class ValidationError(BaseModel): + """ + ValidationError + """ # noqa: E501 + + loc: List[ValidationErrorLocInner] + msg: StrictStr + type: StrictStr + __properties: ClassVar[List[str]] = ["loc", "msg", "type"] + + model_config = { + "populate_by_name": True, + "validate_assignment": True, + "protected_namespaces": (), + } + + def to_str(self) -> str: + """Returns the string representation of the model using alias""" + return pprint.pformat(self.model_dump(by_alias=True)) + + def to_json(self) -> str: + """Returns the JSON representation of the model using alias""" + return self.model_dump_json(by_alias=True, exclude_unset=True) + + @classmethod + def from_json(cls, json_str: str) -> Self: + """Create an instance of ValidationError from a JSON string""" + return cls.from_dict(json.loads(json_str)) + + def to_dict(self) -> Dict[str, Any]: + """Return the dictionary representation of the model using alias. + + This has the following differences from calling pydantic's + `self.model_dump(by_alias=True)`: + + * `None` is only added to the output dict for nullable fields that + were set at model initialization. Other fields with value `None` + are ignored. + """ + _dict = self.model_dump( + by_alias=True, + exclude={}, + exclude_none=True, + ) + # override the default output from pydantic by calling `to_dict()` of each item in loc (list) + _items = [] + if self.loc: + for _item in self.loc: + if _item: + _items.append(_item.to_dict()) + _dict["loc"] = _items + return _dict + + @classmethod + def from_dict(cls, obj: Dict) -> Self: + """Create an instance of ValidationError from a dict""" + if obj is None: + return None + + if not isinstance(obj, dict): + return cls.model_validate(obj) + + _obj = cls.model_validate( + { + "loc": ( + [ValidationErrorLocInner.from_dict(_item) for _item in obj.get("loc")] + if obj.get("loc") is not None + else None + ), + "msg": obj.get("msg"), + "type": obj.get("type"), + } + ) + return _obj diff --git a/admin-api-lib/src/admin_api_lib/models/validation_error_loc_inner.py b/admin-api-lib/src/admin_api_lib/models/validation_error_loc_inner.py new file mode 100644 index 0000000..e487669 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/models/validation_error_loc_inner.py @@ -0,0 +1,114 @@ +# coding: utf-8 + +""" +admin-api-lib + +The API is used for the communication between the admin frontend and the admin backend in the rag project. + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. 
+""" # noqa: E501 + + +from __future__ import annotations +import pprint +import re # noqa: F401 +import json + + +from pydantic import BaseModel, ConfigDict, StrictInt, StrictStr +from typing import Any, ClassVar, Dict, List, Optional + +try: + from typing import Self +except ImportError: + from typing_extensions import Self + + +class ValidationErrorLocInner(BaseModel): + """ + ValidationErrorLocInner + """ # noqa: E501 + + anyof_schema_1_validator: Optional[StrictStr] = None + anyof_schema_2_validator: Optional[StrictInt] = None + actual_instance: Optional[Any] = None + any_of_schemas: Optional[List[StrictStr]] = None + __properties: ClassVar[List[str]] = [ + "anyof_schema_1_validator", + "anyof_schema_2_validator", + "actual_instance", + "any_of_schemas", + ] + + model_config = { + "populate_by_name": True, + "validate_assignment": True, + "protected_namespaces": (), + } + + def to_str(self) -> str: + """Returns the string representation of the model using alias""" + return pprint.pformat(self.model_dump(by_alias=True)) + + def to_json(self) -> str: + """Returns the JSON representation of the model using alias""" + return self.model_dump_json(by_alias=True, exclude_unset=True) + + @classmethod + def from_json(cls, json_str: str) -> Self: + """Create an instance of ValidationErrorLocInner from a JSON string""" + return cls.from_dict(json.loads(json_str)) + + def to_dict(self) -> Dict[str, Any]: + """Return the dictionary representation of the model using alias. + + This has the following differences from calling pydantic's + `self.model_dump(by_alias=True)`: + + * `None` is only added to the output dict for nullable fields that + were set at model initialization. Other fields with value `None` + are ignored. + """ + _dict = self.model_dump( + by_alias=True, + exclude={}, + exclude_none=True, + ) + # set to None if anyof_schema_1_validator (nullable) is None + # and model_fields_set contains the field + if self.anyof_schema_1_validator is None and "anyof_schema_1_validator" in self.model_fields_set: + _dict["anyof_schema_1_validator"] = None + + # set to None if anyof_schema_2_validator (nullable) is None + # and model_fields_set contains the field + if self.anyof_schema_2_validator is None and "anyof_schema_2_validator" in self.model_fields_set: + _dict["anyof_schema_2_validator"] = None + + # set to None if actual_instance (nullable) is None + # and model_fields_set contains the field + if self.actual_instance is None and "actual_instance" in self.model_fields_set: + _dict["actual_instance"] = None + + return _dict + + @classmethod + def from_dict(cls, obj: Dict) -> Self: + """Create an instance of ValidationErrorLocInner from a dict""" + if obj is None: + return None + + if not isinstance(obj, dict): + return cls.model_validate(obj) + + _obj = cls.model_validate( + { + "anyof_schema_1_validator": obj.get("anyof_schema_1_validator"), + "anyof_schema_2_validator": obj.get("anyof_schema_2_validator"), + "actual_instance": obj.get("actual_instance"), + "any_of_schemas": obj.get("any_of_schemas"), + } + ) + return _obj diff --git a/admin-api-lib/tests/comma_separated_bool_list_test.py b/admin-api-lib/tests/comma_separated_bool_list_test.py deleted file mode 100644 index d6a72d3..0000000 --- a/admin-api-lib/tests/comma_separated_bool_list_test.py +++ /dev/null @@ -1,55 +0,0 @@ -import pytest -from admin_api_lib.impl.utils.comma_separated_bool_list import CommaSeparatedBoolList - - -def test_validate_empty_string(): - # An empty string should return an empty list. 
- assert CommaSeparatedBoolList.validate("", None) == [] - - -def test_validate_string_input(): - # Test a typical comma separated string. - # "true", "yes", and "1" are considered True, all others are False. - input_str = "true, false, yes, no, 1, 0, ,TRUE, YeS" - expected = [ - True, # "true" - False, # "false" - True, # "yes" - False, # "no" - True, # "1" - False, # "0" - True, # "TRUE" - True, # "YeS" - ] - # Note: extra whitespace items are ignored. - result = CommaSeparatedBoolList.validate(input_str, None) - assert result == expected - - -def test_validate_string_with_extra_commas(): - # Test string with extra commas and spaces. - input_str = "true,, yes, ,false" - expected = [True, True, False] - result = CommaSeparatedBoolList.validate(input_str, None) - assert result == expected - - -def test_validate_list_input(): - # When input is a list, each element is cast to bool. - input_list = [0, 1, True, False, "non-empty", ""] - expected = [ - False, # bool(0) - True, # bool(1) - True, # bool(True) - False, # bool(False) - True, # bool("non-empty") - False, # bool("") - ] - result = CommaSeparatedBoolList.validate(input_list, None) - assert result == expected - - -def test_invalid_input_type(): - # Passing a non-string and non-list should raise a ValueError. - with pytest.raises(ValueError): - CommaSeparatedBoolList.validate(123, None) diff --git a/admin-api-lib/tests/comma_separated_str_list_test.py b/admin-api-lib/tests/comma_separated_str_list_test.py deleted file mode 100644 index a86c048..0000000 --- a/admin-api-lib/tests/comma_separated_str_list_test.py +++ /dev/null @@ -1,49 +0,0 @@ -import pytest -from admin_api_lib.impl.utils.comma_separated_str_list import CommaSeparatedStrList - - -def test_validate_string(): - # simple comma separated string - input_str = "a, b, c" - expected = ["a", "b", "c"] - result = CommaSeparatedStrList.validate(input_str, None) - assert result == expected - - input_str = "a" - expected = ["a"] - result = CommaSeparatedStrList.validate(input_str, None) - assert result == expected - - -def test_validate_string_with_extra_spaces(): - # string with extra spaces and empty items - input_str = " apple , banana , , cherry , " - expected = ["apple", "banana", "cherry"] - result = CommaSeparatedStrList.validate(input_str, None) - assert result == expected - - -def test_validate_empty_string(): - input_str = "" - expected = [] - result = CommaSeparatedStrList.validate(input_str, None) - assert result == expected - - -def test_validate_string_only_spaces(): - input_str = " " - expected = [] - result = CommaSeparatedStrList.validate(input_str, None) - assert result == expected - - -def test_validate_list(): - input_list = [1, "2", 3.0, " test "] - expected = ["1", "2", "3.0", " test "] - result = CommaSeparatedStrList.validate(input_list, None) - assert result == expected - - -def test_invalid_input_type(): - with pytest.raises(ValueError): - CommaSeparatedStrList.validate(12345, None) diff --git a/admin-api-lib/tests/default_file_uploader_test.py b/admin-api-lib/tests/default_file_uploader_test.py new file mode 100644 index 0000000..079a935 --- /dev/null +++ b/admin-api-lib/tests/default_file_uploader_test.py @@ -0,0 +1,136 @@ +import pytest +from unittest.mock import AsyncMock, MagicMock +from fastapi import HTTPException +from fastapi import UploadFile + +from admin_api_lib.impl.api_endpoints.default_file_uploader import DefaultFileUploader +from admin_api_lib.models.status import Status +from admin_api_lib.utils.utils import sanitize_document_name +from 
admin_api_lib.impl.api_endpoints import default_file_uploader + + +@pytest.fixture +def mocks(): + extractor_api = MagicMock() + key_value_store = MagicMock() + key_value_store.get_all.return_value = [] + information_enhancer = MagicMock() + information_enhancer.ainvoke = AsyncMock() + chunker = MagicMock() + document_deleter = MagicMock() + document_deleter.adelete_document = AsyncMock() + rag_api = MagicMock() + information_mapper = MagicMock() + return extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper + + +@pytest.mark.asyncio +async def test_handle_file_upload_success(mocks): + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks + # setup mocks + dummy_piece = MagicMock() + extractor_api.extract_from_file_post.return_value = [dummy_piece] + dummy_doc = MagicMock() + information_mapper.extractor_information_piece2document.return_value = dummy_doc + chunker.chunk.return_value = [dummy_doc] + information_enhancer.ainvoke.return_value = [dummy_doc] + dummy_rag = {"foo": "bar"} + information_mapper.document2rag_information_piece.return_value = dummy_rag + + uploader = DefaultFileUploader( + extractor_api, + key_value_store, + information_enhancer, + chunker, + document_deleter, + rag_api, + information_mapper, + file_service=MagicMock(), + ) + + upload_filename = "file:doc1" + + await uploader._handle_source_upload("s3path", upload_filename, "doc1.txt", "http://base") + + key_value_store.upsert.assert_any_call(upload_filename, Status.READY) + rag_api.upload_information_piece.assert_called_once_with([dummy_rag]) + document_deleter.adelete_document.assert_awaited_once_with(upload_filename, remove_from_key_value_store=False) + + +@pytest.mark.asyncio +async def test_handle_file_upload_no_info_pieces(mocks): + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks + extractor_api.extract_from_file_post.return_value = [] + + uploader = DefaultFileUploader( + extractor_api, + key_value_store, + information_enhancer, + chunker, + document_deleter, + rag_api, + information_mapper, + file_service=MagicMock(), + ) + filename = "file:doc2" + await uploader._handle_source_upload("s3path", filename, "doc2.txt", "http://base") + + key_value_store.upsert.assert_any_call(filename, Status.ERROR) + information_mapper.extractor_information_piece2document.assert_not_called() + rag_api.upload_information_piece.assert_not_called() + + +@pytest.mark.asyncio +async def test_upload_file_already_processing_raises_error(mocks): + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks + base_url = "http://base" + file = MagicMock(spec=UploadFile) + file.filename = "doc3.txt" + file.read = AsyncMock(return_value=b"") + source_name = f"file:{sanitize_document_name(file.filename)}" + key_value_store.get_all.return_value = [(source_name, Status.PROCESSING)] + + uploader = DefaultFileUploader( + extractor_api, + key_value_store, + information_enhancer, + chunker, + document_deleter, + rag_api, + information_mapper, + file_service=MagicMock(), + ) + + with pytest.raises(HTTPException): + await uploader.upload_file(base_url, file) + key_value_store.upsert.assert_any_call(source_name, Status.ERROR) + + +@pytest.mark.asyncio +async def test_upload_file_starts_thread(mocks, monkeypatch): + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, 
information_mapper = mocks
+    base_url = "http://base"
+    file = MagicMock(spec=UploadFile)
+    file.filename = "doc4.txt"
+    file.read = AsyncMock(return_value=b"content")
+    key_value_store.get_all.return_value = []
+    source_name = f"file:{sanitize_document_name(file.filename)}"
+
+    dummy_thread = MagicMock()
+    monkeypatch.setattr(default_file_uploader, "Thread", lambda *args, **kwargs: dummy_thread)
+
+    uploader = DefaultFileUploader(
+        extractor_api,
+        key_value_store,
+        information_enhancer,
+        chunker,
+        document_deleter,
+        rag_api,
+        information_mapper,
+        file_service=MagicMock(),
+    )
+
+    await uploader.upload_file(base_url, file)
+
+    key_value_store.upsert.assert_any_call(source_name, Status.PROCESSING)
+    dummy_thread.start.assert_called_once()
diff --git a/admin-api-lib/tests/default_source_uploader_test.py b/admin-api-lib/tests/default_source_uploader_test.py
new file mode 100644
index 0000000..9c47416
--- /dev/null
+++ b/admin-api-lib/tests/default_source_uploader_test.py
@@ -0,0 +1,152 @@
+import asyncio
+import pytest
+from unittest.mock import AsyncMock, MagicMock
+from fastapi import HTTPException
+
+from admin_api_lib.impl.api_endpoints.default_source_uploader import DefaultSourceUploader
+from admin_api_lib.models.status import Status
+from admin_api_lib.utils.utils import sanitize_document_name
+from admin_api_lib.impl.api_endpoints import default_source_uploader
+
+
+@pytest.fixture
+def mocks():
+    extractor_api = MagicMock()
+    key_value_store = MagicMock()
+    key_value_store.get_all.return_value = []
+    information_enhancer = MagicMock()
+    information_enhancer.ainvoke = AsyncMock()
+    chunker = MagicMock()
+    document_deleter = MagicMock()
+    document_deleter.adelete_document = AsyncMock()
+    rag_api = MagicMock()
+    information_mapper = MagicMock()
+    return extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper
+
+
+@pytest.mark.asyncio
+async def test_handle_source_upload_success(mocks):
+    extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks
+    # Setup mocks
+    dummy_piece = MagicMock()
+    extractor_api.extract_from_source.return_value = [dummy_piece]
+    dummy_doc = MagicMock()
+    information_mapper.extractor_information_piece2document.return_value = dummy_doc
+    chunker.chunk.return_value = [dummy_doc]
+    information_enhancer.ainvoke.return_value = [dummy_doc]
+    dummy_rag_piece = {"p": "v"}
+    information_mapper.document2rag_information_piece.return_value = dummy_rag_piece
+
+    uploader = DefaultSourceUploader(
+        extractor_api,
+        key_value_store,
+        information_enhancer,
+        chunker,
+        document_deleter,
+        rag_api,
+        information_mapper,
+        settings=MagicMock(timeout=3600.0),
+    )
+
+    await uploader._handle_source_upload("source1", "type1", [])
+
+    key_value_store.upsert.assert_any_call("source1", Status.READY)
+    rag_api.upload_information_piece.assert_called_once_with([dummy_rag_piece])
+    document_deleter.adelete_document.assert_awaited_once_with("source1", remove_from_key_value_store=False)
+
+
+@pytest.mark.asyncio
+async def test_handle_source_upload_no_info_pieces(mocks):
+    extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks
+    extractor_api.extract_from_source.return_value = []
+
+    uploader = DefaultSourceUploader(
+        extractor_api,
+        key_value_store,
+        information_enhancer,
+        chunker,
+        document_deleter,
+        rag_api,
+        information_mapper,
+        settings=MagicMock(timeout=3600.0),
+    )
+    await uploader._handle_source_upload("source2", "type2", [])
+
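+    # with no extracted pieces the source must be flagged as ERROR and nothing
+    # may be forwarded to the RAG backend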
+    key_value_store.upsert.assert_any_call("source2", Status.ERROR)
+    information_mapper.extractor_information_piece2document.assert_not_called()
+    rag_api.upload_information_piece.assert_not_called()
+
+
+@pytest.mark.asyncio
+async def test_upload_source_already_processing_raises_error(mocks):
+    extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks
+    source_type = "typeX"
+    name = "Doc Name"
+    source_name = f"{source_type}:{sanitize_document_name(name)}"
+    key_value_store.get_all.return_value = [(source_name, Status.PROCESSING)]
+    uploader = DefaultSourceUploader(
+        extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper,
+        settings=MagicMock(timeout=3600.0),
+    )
+    with pytest.raises(HTTPException):
+        # use default timeout
+        await uploader.upload_source(source_type, name, [])
+    key_value_store.upsert.assert_any_call(source_name, Status.ERROR)
+
+
+@pytest.mark.asyncio
+async def test_upload_source_no_timeout(mocks, monkeypatch):
+    extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks
+    key_value_store.get_all.return_value = []
+    source_type = "typeZ"
+    name = "quick"
+    # patch Thread so no actual background work is done
+    dummy_thread = MagicMock()
+    monkeypatch.setattr(default_source_uploader, "Thread", lambda *args, **kwargs: dummy_thread)
+    uploader = DefaultSourceUploader(
+        extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper,
+        settings=MagicMock(timeout=1.0),
+    )
+    # should not raise
+    await uploader.upload_source(source_type, name, [])
+    # only PROCESSING status upserted, no ERROR
+    assert any(call.args[1] == Status.PROCESSING for call in key_value_store.upsert.call_args_list)
+    assert not any(call.args[1] == Status.ERROR for call in key_value_store.upsert.call_args_list)
+    dummy_thread.start.assert_called_once()
+
+
+@pytest.mark.asyncio
+async def test_upload_source_timeout_error(mocks, monkeypatch):
+    extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks
+    key_value_store.get_all.return_value = []
+    source_type = "typeTimeout"
+    name = "slow"
+    source_name = f"{source_type}:{sanitize_document_name(name)}"
+
+    # monkey-patch the handler to sleep so that the timeout triggers; the parameter
+    # names must match the keyword arguments used by _thread_worker
+    async def fake_handle(self, source_name, source_type, kwargs):
+        await asyncio.sleep(3600)
+
+    # patch handler and Thread to trigger the timeout synchronously
+    monkeypatch.setattr(default_source_uploader.DefaultSourceUploader, "_handle_source_upload", fake_handle)
+
+    def FakeThread(target, args=(), **kwargs):
+        # this ensures serial execution, so that the error status can be checked
+        class T:
+            def start(self):
+                target(*args)
+
+            def is_alive(self):
+                return False
+
+        return T()
+
+    monkeypatch.setattr(default_source_uploader, "Thread", FakeThread)
+    uploader = DefaultSourceUploader(
+        extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper,
+        settings=MagicMock(timeout=1.0),
+    )
+    # no exception should be raised; the timeout path sets the ERROR status
+
+    await uploader.upload_source(source_type, name, [])
+    # first call marks PROCESSING, second marks ERROR
+    calls = [call.args for call in key_value_store.upsert.call_args_list]
+    assert (source_name, Status.PROCESSING) in calls
+    assert (source_name, Status.ERROR) in calls
diff --git a/admin-api-lib/tests/settings/confluence_settings_test.py 
b/admin-api-lib/tests/settings/confluence_settings_test.py deleted file mode 100644 index a98fe7b..0000000 --- a/admin-api-lib/tests/settings/confluence_settings_test.py +++ /dev/null @@ -1,108 +0,0 @@ -import pytest -from admin_api_lib.impl.settings.confluence_settings import ConfluenceSettings -from admin_api_lib.impl.utils.comma_separated_str_list import CommaSeparatedStrList -from admin_api_lib.impl.utils.comma_separated_bool_list import CommaSeparatedBoolList - - -def test_default_values(): - # When no settings are provided, all lists default to empty lists. - settings = ConfluenceSettings() - assert settings.url == CommaSeparatedStrList() - assert settings.token == CommaSeparatedStrList() - assert settings.space_key == CommaSeparatedStrList() - assert settings.document_name == CommaSeparatedStrList() - # Bool lists are empty by default if no url is provided. - assert settings.verify_ssl == CommaSeparatedBoolList() - assert settings.include_attachments == CommaSeparatedBoolList() - assert settings.keep_markdown_format == CommaSeparatedBoolList() - assert settings.keep_newlines == CommaSeparatedBoolList() - - -def test_valid_initialization_matching_lengths(): - # Provide all settings with matching lengths. - urls = "http://confluence1, http://confluence2" - tokens = "token1, token2" - space_keys = "SPACE1, SPACE2" - document_names = "Doc1, Doc2" - verify_ssl = "True, False" - include_attachments = "False, True" - keep_markdown_format = "True, True" - keep_newlines = "False, False" - - settings = ConfluenceSettings( - url=urls, - token=tokens, - space_key=space_keys, - document_name=document_names, - verify_ssl=verify_ssl, - include_attachments=include_attachments, - keep_markdown_format=keep_markdown_format, - keep_newlines=keep_newlines, - ) - - # Verify that the comma separated lists have been properly parsed. - assert settings.url == CommaSeparatedStrList(["http://confluence1", "http://confluence2"]) - assert settings.token == CommaSeparatedStrList(["token1", "token2"]) - assert settings.space_key == CommaSeparatedStrList(["SPACE1", "SPACE2"]) - assert settings.document_name == CommaSeparatedStrList(["Doc1", "Doc2"]) - assert settings.verify_ssl == CommaSeparatedBoolList([True, False]) - assert settings.include_attachments == CommaSeparatedBoolList([False, True]) - assert settings.keep_markdown_format == CommaSeparatedBoolList([True, True]) - assert settings.keep_newlines == CommaSeparatedBoolList([False, False]) - - -def test_mismatched_list_lengths(): - # Provide mismatched lengths for comma separated fields, should raise ValueError. - urls = "http://confluence1, http://confluence2, http://confluence3" - tokens = "token1, token2" # shorter than url list - with pytest.raises(ValueError): - ConfluenceSettings( - url=urls, - token=tokens, - space_key="SPACE1, SPACE2, SPACE3", - document_name="Doc1, Doc2, Doc3", - ) - - -def test_default_bool_values_when_missing(): - # Provide only url and leave bool fields empty to see if they are set to defaults. - urls = "http://confluence1, http://confluence2, http://confluence3" - settings = ConfluenceSettings( - url=urls, - token="token1, token2, token3", - space_key="SPACE1, SPACE2, SPACE3", - document_name="Doc1, Doc2, Doc3", - ) - # Defaults for bool fields: verify_ssl True, include_attachments False, - # keep_markdown_format True, keep_newlines True, for each entry. 
- expected_verify_ssl = CommaSeparatedBoolList([True, True, True]) - expected_include_attachments = CommaSeparatedBoolList([False, False, False]) - expected_keep_markdown_format = CommaSeparatedBoolList([True, True, True]) - expected_keep_newlines = CommaSeparatedBoolList([True, True, True]) - assert settings.verify_ssl == expected_verify_ssl - assert settings.include_attachments == expected_include_attachments - assert settings.keep_markdown_format == expected_keep_markdown_format - assert settings.keep_newlines == expected_keep_newlines - - -def test_bool_fields_not_overwritten_when_provided(): - # Provide bool fields explicitly; they should not be overwritten by defaults. - urls = "http://confluence1, http://confluence2" - settings = ConfluenceSettings( - url=urls, - token="token1, token2", - space_key="SPACE1, SPACE2", - document_name="Doc1, Doc2", - verify_ssl="False, False", - include_attachments="True, True", - keep_markdown_format="False, False", - keep_newlines="False, True", - ) - expected_verify_ssl = CommaSeparatedBoolList([False, False]) - expected_include_attachments = CommaSeparatedBoolList([True, True]) - expected_keep_markdown_format = CommaSeparatedBoolList([False, False]) - expected_keep_newlines = CommaSeparatedBoolList([False, True]) - assert settings.verify_ssl == expected_verify_ssl - assert settings.include_attachments == expected_include_attachments - assert settings.keep_markdown_format == expected_keep_markdown_format - assert settings.keep_newlines == expected_keep_newlines diff --git a/extractor-api-lib/openapi.yaml b/extractor-api-lib/openapi.yaml index a6aea27..205d208 100644 --- a/extractor-api-lib/openapi.yaml +++ b/extractor-api-lib/openapi.yaml @@ -1,173 +1,153 @@ openapi: 3.0.2 info: - title: extractor-api-lib - version: 1.0.0 + title: extractor-api-lib + version: 1.0.0 servers: -- url: / + - + url: / paths: - /extract_from_file: - post: - operationId: extract_from_file_post - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/extraction_request' - required: true - responses: - "200": - content: - application/json: - schema: - items: - $ref: '#/components/schemas/information_piece' - type: array - description: List of extracted information. - "422": - description: Body is not a valid PDF. - "500": - description: Something somewhere went terribly wrong. - tags: - - extractor - /extract_from_confluence: - post: - operationId: extract_from_confluence_post - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/confluence_parameters' - required: true - responses: - "200": - content: - application/json: - schema: - items: - $ref: '#/components/schemas/information_piece' - type: array - description: ok - "404": - description: not found - "422": - description: unprocessable entity - "500": - description: internal server error - tags: - - extractor + /extract_from_file: + post: + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/extraction_request' + required: true + tags: + - extractor + responses: + '200': + content: + application/json: + schema: + type: array + items: + $ref: '#/components/schemas/information_piece' + description: List of extracted information. + '422': + description: Body is not a valid PDF. + '500': + description: Something somewhere went terribly wrong. 
+ operationId: extract_from_file_post + /extract_from_source: + post: + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/extraction_parameters' + required: true + tags: + - extractor + responses: + '200': + content: + application/json: + schema: + type: array + items: + $ref: '#/components/schemas/information_piece' + description: ok + '404': + description: not found + '422': + description: unprocessable entity + '500': + description: internal server error + operationId: extract_from_source components: - schemas: - extraction_request: - description: "" - example: - path_on_s3: path on s3 - properties: - path_on_s3: - description: "" - title: PathOnS3 - type: string - required: - - path_on_s3 - title: ExtractionRequest - type: object - key_value_pair: - description: "" - example: - value: value - key: key - properties: - key: - description: "" - title: Key - value: - description: "" - title: Value - title: MetaInformationPiece - type: object - content_type: - description: "" - enum: - - IMAGE - - TABLE - - TEXT - title: InformationType - type: string - information_piece: - description: A piece of information that has been extracted. - example: - metadata: - - key: key - value: value - - key: key - value: value - page_content: some text - type: TEXT - properties: - metadata: - description: "" - items: - $ref: '#/components/schemas/key_value_pair' - title: MetaInformation - type: array - page_content: - description: "" - type: string - type: - $ref: '#/components/schemas/content_type' - required: - - metadata - - page_content - - type - title: InformationPiece - type: object - confluence_parameters: - description: "" - properties: - url: - description: url of the confluence space. - title: url - type: string - token: - description: api key to access confluence. - title: token - type: string - space_key: - description: the space key of the confluence pages. - title: space_key - type: string - include_attachments: - default: false - description: "whether to include file attachments (e.g., images, documents)\ - \ in the parsed content. Default is `false`." - title: include_attachments - type: boolean - keep_markdown_format: - default: true - description: whether to preserve markdown formatting in the output. Default - is `true`. - title: keep_markdown_format - type: boolean - keep_newlines: - default: true - description: whether to retain newline characters in the output for better - readability. Default is `true`. - title: keep_newlines - type: boolean - document_name: - description: The name that will be used to store the confluence db in the - key value db and the vectordatabase (metadata.document). 
-          title: document_name
-          type: string
-        confluence_kwargs:
-          description: Additional kwargs like verify_ssl
-          items:
-            $ref: '#/components/schemas/key_value_pair'
-          title: confluence_kwargs
-          type: array
-      required:
-      - document_name
-      - space_key
-      - token
-      - url
-      title: confluence_parameters
-      type: object
+  schemas:
+    extraction_request:
+      title: ExtractionRequest
+      description: ''
+      required:
+        - document_name
+        - path_on_s3
+      type: object
+      properties:
+        path_on_s3:
+          title: PathOnS3
+          description: ''
+          type: string
+        document_name:
+          description: ''
+          type: string
+      example:
+        path_on_s3: path on s3
+        document_name: document name
+    key_value_pair:
+      title: MetaInformationPiece
+      description: ''
+      type: object
+      properties:
+        key:
+          title: Key
+          description: ''
+        value:
+          title: Value
+          description: ''
+      example:
+        value: value
+        key: key
+    content_type:
+      title: InformationType
+      description: ''
+      enum:
+        - IMAGE
+        - TABLE
+        - TEXT
+      type: string
+    information_piece:
+      title: InformationPiece
+      description: A piece of information that has been extracted.
+      required:
+        - metadata
+        - page_content
+        - type
+      type: object
+      properties:
+        metadata:
+          title: MetaInformation
+          description: ''
+          type: array
+          items:
+            $ref: '#/components/schemas/key_value_pair'
+        page_content:
+          description: ''
+          type: string
+        type:
+          $ref: '#/components/schemas/content_type'
+      example:
+        metadata:
+          -
+            key: key
+            value: value
+          -
+            key: key
+            value: value
+        page_content: some text
+        type: TEXT
+    extraction_parameters:
+      title: ExtractionParameters
+      description: ''
+      required:
+        - document_name
+        - source_type
+      type: object
+      properties:
+        document_name:
+          title: document_name
+          description: >-
+            The name that will be used to store the source in the key value db and the
+            vector database (metadata.document).
+          type: string
+        kwargs:
+          title: kwargs
+          description: Kwargs for the extractor
+          type: array
+          items:
+            $ref: '#/components/schemas/key_value_pair'
+        source_type:
+          title: source_type
+          description: The type of extractor to use for this source.
+          type: string
diff --git a/extractor-api-lib/poetry.lock b/extractor-api-lib/poetry.lock
index 0da6009..c750e96 100644
--- a/extractor-api-lib/poetry.lock
+++ b/extractor-api-lib/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand.
+# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand.
[[package]] name = "aiofiles" @@ -1933,21 +1933,21 @@ tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<10" [[package]] name = "langchain-core" -version = "0.3.58" +version = "0.3.63" description = "Building applications with LLMs through composability" optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "langchain_core-0.3.58-py3-none-any.whl", hash = "sha256:266f90d2a079fe9510190ad3be88bd993baad43e6cee0f822a883767a4bfdd5b"}, - {file = "langchain_core-0.3.58.tar.gz", hash = "sha256:6ee2282b02fa65bf4ee1afa869d431505536757ff2f1f9f0b432d8ca755d66c6"}, + {file = "langchain_core-0.3.63-py3-none-any.whl", hash = "sha256:f91db8221b1bc6808f70b2e72fded1a94d50ee3f1dff1636fb5a5a514c64b7f5"}, + {file = "langchain_core-0.3.63.tar.gz", hash = "sha256:e2e30cfbb7684a5a0319f6cbf065fc3c438bfd1060302f085a122527890fb01e"}, ] [package.dependencies] jsonpatch = ">=1.33,<2.0" -langsmith = ">=0.1.125,<0.4" +langsmith = ">=0.1.126,<0.4" packaging = ">=23.2,<25" -pydantic = {version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""} +pydantic = ">=2.7.4" PyYAML = ">=5.3" tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<10.0.0" typing-extensions = ">=4.7" @@ -4877,4 +4877,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.1" python-versions = "^3.13" -content-hash = "9dd34ca058d74aea96a5ebfc2d712ec2a36521b310858dcb5e5569bb2dd16333" +content-hash = "a25945d5914b2ad6c32bcd50f8b787c00e41df7e09fdb3c991f48cb9e9c15c72" diff --git a/extractor-api-lib/pyproject.toml b/extractor-api-lib/pyproject.toml index 4d6ac63..a648858 100644 --- a/extractor-api-lib/pyproject.toml +++ b/extractor-api-lib/pyproject.toml @@ -92,7 +92,7 @@ html5lib = "^1.1" langchain-community = "^0.3.23" atlassian-python-api = "^4.0.3" markdownify = "^1.1.0" -langchain-core = "^0.3.58" +langchain-core = "0.3.63" [tool.poetry.group.dev.dependencies] pytest = "^8.3.5" diff --git a/extractor-api-lib/src/extractor_api_lib/api_endpoints/confluence_extractor.py b/extractor-api-lib/src/extractor_api_lib/api_endpoints/confluence_extractor.py deleted file mode 100644 index d1aae80..0000000 --- a/extractor-api-lib/src/extractor_api_lib/api_endpoints/confluence_extractor.py +++ /dev/null @@ -1,26 +0,0 @@ -"""Module for the ConfluenceExtractor abstract base class.""" - -from abc import ABC, abstractmethod - -from extractor_api_lib.models.confluence_parameters import ConfluenceParameters -from extractor_api_lib.models.information_piece import InformationPiece - - -class ConfluenceExtractor(ABC): - """Abstract base class for extract_from_confluence endpoint.""" - - @abstractmethod - async def aextract_from_confluence(self, confluence_parameters: ConfluenceParameters) -> list[InformationPiece]: - """ - Extract information from confluence, using the given confluence parameters. - - Parameters - ---------- - confluence_parameters : ConfluenceParameters - The parameters used to extract information from Confluence. - - Returns - ------- - list[InformationPiece] - A list of extracted information pieces. 
-        """
diff --git a/extractor-api-lib/src/extractor_api_lib/api_endpoints/file_extractor.py b/extractor-api-lib/src/extractor_api_lib/api_endpoints/file_extractor.py
index 523f159..2c9a645 100644
--- a/extractor-api-lib/src/extractor_api_lib/api_endpoints/file_extractor.py
+++ b/extractor-api-lib/src/extractor_api_lib/api_endpoints/file_extractor.py
@@ -1,13 +1,10 @@
-"""Module for the FileExtractor abstract base class."""
-
 from abc import ABC, abstractmethod
-
 from extractor_api_lib.models.extraction_request import ExtractionRequest
 from extractor_api_lib.models.information_piece import InformationPiece
 
 
 class FileExtractor(ABC):
-    """Abstract base class for extract_information endpoint."""
+    """Abstract base class for the extract_from_file endpoint."""
 
     @abstractmethod
     async def aextract_information(self, extraction_request: ExtractionRequest) -> list[InformationPiece]:
diff --git a/extractor-api-lib/src/extractor_api_lib/api_endpoints/source_extractor.py b/extractor-api-lib/src/extractor_api_lib/api_endpoints/source_extractor.py
new file mode 100644
index 0000000..4071322
--- /dev/null
+++ b/extractor-api-lib/src/extractor_api_lib/api_endpoints/source_extractor.py
@@ -0,0 +1,27 @@
+from abc import ABC, abstractmethod
+
+from extractor_api_lib.models.extraction_parameters import ExtractionParameters
+from extractor_api_lib.models.information_piece import InformationPiece
+
+
+class SourceExtractor(ABC):
+    """Abstract base class for the extract_from_source endpoint."""
+
+    @abstractmethod
+    async def aextract_information(
+        self,
+        extraction_parameters: ExtractionParameters,
+    ) -> list[InformationPiece]:
+        """
+        Extract information from a source using the given parameters.
+
+        Parameters
+        ----------
+        extraction_parameters : ExtractionParameters
+            The parameters used to extract information from the source.
+
+        Returns
+        -------
+        list[InformationPiece]
+            A list of extracted information pieces.
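+
+        Examples
+        --------
+        A minimal sketch of how a concrete implementation might be called; the
+        parameter values (and the ``extractor`` and ``KeyValuePair`` objects)
+        are illustrative only:
+
+        >>> parameters = ExtractionParameters(
+        ...     document_name="my-space",
+        ...     source_type="confluence",
+        ...     kwargs=[KeyValuePair(key="url", value="https://example.atlassian.net/wiki")],
+        ... )
+        >>> pieces = await extractor.aextract_information(parameters)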
+ """ diff --git a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py index 418a666..4f9e4e5 100644 --- a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py +++ b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py @@ -1,20 +1,35 @@ """Module for the Extractor API.""" # coding: utf-8 -# noqa: D105 +from typing import Dict, List # noqa: F401 import importlib import pkgutil -from typing import List # noqa: F401 -from fastapi import APIRouter, Body # noqa: F401 - -import extractor_api_lib.impl from extractor_api_lib.apis.extractor_api_base import BaseExtractorApi -from extractor_api_lib.models.confluence_parameters import ConfluenceParameters +import extractor_api_lib.impl + +from fastapi import ( # noqa: F401 + APIRouter, + Body, + Cookie, + Depends, + Form, + Header, + HTTPException, + Path, + Query, + Response, + Security, + status, +) + +from extractor_api_lib.models.extra_models import TokenModel # noqa: F401 +from extractor_api_lib.models.extraction_parameters import ExtractionParameters from extractor_api_lib.models.extraction_request import ExtractionRequest from extractor_api_lib.models.information_piece import InformationPiece + router = APIRouter() ns_pkg = extractor_api_lib.impl @@ -23,59 +38,68 @@ @router.post( - "/extract_from_confluence", + "/extract_from_file", responses={ - 200: {"model": List[InformationPiece], "description": "ok"}, - 404: {"description": "not found"}, - 422: {"description": "unprocessable entity"}, - 500: {"description": "internal server error"}, + 200: {"model": List[InformationPiece], "description": "List of extracted information."}, + 422: {"description": "Body is not a valid PDF."}, + 500: {"description": "Something somewhere went terribly wrong."}, }, tags=["extractor"], response_model_by_alias=True, ) -async def extract_from_confluence_post( - confluence_parameters: ConfluenceParameters = Body(None, description=""), +async def extract_from_file_post( + extraction_request: ExtractionRequest = Body(None, description=""), ) -> List[InformationPiece]: """ - Extract information from a Confluence space. + Extract information from a file based on the provided extraction request. Parameters ---------- - confluence_parameters : ConfluenceParameters - The parameters required to access and extract information from the Confluence space. + extraction_request : ExtractionRequest + The request object containing details about the extraction process. Returns ------- List[InformationPiece] - A list of extracted information pieces from the Confluence space. + A list of extracted information pieces. 
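+
+    Notes
+    -----
+    The request body is a JSON object with the fields ``path_on_s3`` and
+    ``document_name``, e.g. (values are illustrative only):
+    ``{"path_on_s3": "upload/report.pdf", "document_name": "report"}``.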
""" - return await BaseExtractorApi.subclasses[0]().extract_from_confluence_post(confluence_parameters) + if not BaseExtractorApi.subclasses: + raise HTTPException(status_code=500, detail="Not implemented") + return await BaseExtractorApi.subclasses[0]().extract_from_file_post(extraction_request) @router.post( - "/extract_from_file", + "/extract_from_source", responses={ - 200: {"model": List[InformationPiece], "description": "List of extracted information."}, - 422: {"description": "Body is not a valid PDF."}, - 500: {"description": "Something somewhere went terribly wrong."}, + 200: {"model": List[InformationPiece], "description": "ok"}, + 404: {"description": "not found"}, + 422: {"description": "unprocessable entity"}, + 500: {"description": "internal server error"}, }, tags=["extractor"], response_model_by_alias=True, ) -async def extract_from_file_post( - extraction_request: ExtractionRequest = Body(None, description=""), +async def extract_from_source( + extraction_parameters: ExtractionParameters = Body(None, description=""), ) -> List[InformationPiece]: """ - Extract information from a file based on the provided extraction request. + Extract information from a source based on the provided extraction parameters. Parameters ---------- - extraction_request : ExtractionRequest + extraction_parameters : ExtractionParameters, optional The request object containing details about the extraction process. Returns ------- List[InformationPiece] A list of extracted information pieces. + + Raises + ------ + HTTPException + If the extraction process fails or encounters an error. """ - return await BaseExtractorApi.subclasses[0]().extract_from_file_post(extraction_request) + if not BaseExtractorApi.subclasses: + raise HTTPException(status_code=500, detail="Not implemented") + return await BaseExtractorApi.subclasses[0]().extract_from_source(extraction_parameters) diff --git a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py index 8f03f9c..800c214 100644 --- a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py +++ b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py @@ -1,11 +1,10 @@ """Module for the base ExtractorApi interface.""" # coding: utf-8 -# flake8: noqa: D105 -from typing import ClassVar, List, Tuple # noqa: F401 +from typing import ClassVar, Dict, List, Tuple # noqa: F401 -from extractor_api_lib.models.confluence_parameters import ConfluenceParameters +from extractor_api_lib.models.extraction_parameters import ExtractionParameters from extractor_api_lib.models.extraction_request import ExtractionRequest from extractor_api_lib.models.information_piece import InformationPiece @@ -26,35 +25,35 @@ def __init_subclass__(cls, **kwargs): super().__init_subclass__(**kwargs) BaseExtractorApi.subclasses = BaseExtractorApi.subclasses + (cls,) - async def extract_from_confluence_post( + async def extract_from_file_post( self, - confluence_parameters: ConfluenceParameters, + extraction_request: ExtractionRequest, ) -> List[InformationPiece]: """ - Extract information from a Confluence space. + Extract information from a file based on the provided extraction request. Parameters ---------- - confluence_parameters : ConfluenceParameters - The parameters required to access and extract information from the Confluence space. + extraction_request : ExtractionRequest + The request object containing details about the extraction process. 
         Returns
         -------
         List[InformationPiece]
-            A list of extracted information pieces from the Confluence space.
+            A list of extracted information pieces.
         """
 
-    async def extract_from_file_post(
+    async def extract_from_source(
         self,
-        extraction_request: ExtractionRequest,
+        extraction_parameters: ExtractionParameters,
     ) -> List[InformationPiece]:
         """
-        Extract information from a file based on the provided extraction request.
+        Extract information from a source based on the provided extraction parameters.
 
         Parameters
         ----------
-        extraction_request : ExtractionRequest
-            The request object containing details about the extraction process.
+        extraction_parameters : ExtractionParameters
+            The parameters required to access and extract information from the source.
 
         Returns
         -------
diff --git a/extractor-api-lib/src/extractor_api_lib/dependency_container.py b/extractor-api-lib/src/extractor_api_lib/dependency_container.py
index e3bcaf1..ad671d9 100644
--- a/extractor-api-lib/src/extractor_api_lib/dependency_container.py
+++ b/extractor-api-lib/src/extractor_api_lib/dependency_container.py
@@ -3,16 +3,12 @@
 from dependency_injector.containers import DeclarativeContainer
 from dependency_injector.providers import List, Singleton  # noqa: WOT001
 
-from extractor_api_lib.impl.api_endpoints.default_confluence_extractor import (
-    DefaultConfluenceExtractor,
-)
-from extractor_api_lib.impl.api_endpoints.default_file_extractor import (
-    DefaultFileExtractor,
-)
-from extractor_api_lib.impl.document_parser.general_extractor import GeneralExtractor
-from extractor_api_lib.impl.document_parser.ms_docs_extractor import MSDocsExtractor
-from extractor_api_lib.impl.document_parser.pdf_extractor import PDFExtractor
-from extractor_api_lib.impl.document_parser.xml_extractor import XMLExtractor
+from extractor_api_lib.impl.api_endpoints.general_source_extractor import GeneralSourceExtractor
+from extractor_api_lib.impl.extractors.confluence_extractor import ConfluenceExtractor
+from extractor_api_lib.impl.extractors.file_extractors.ms_docs_extractor import MSDocsExtractor
+from extractor_api_lib.impl.extractors.file_extractors.pdf_extractor import PDFExtractor
+from extractor_api_lib.impl.extractors.file_extractors.xml_extractor import XMLExtractor
+from extractor_api_lib.impl.api_endpoints.general_file_extractor import GeneralFileExtractor
 from extractor_api_lib.impl.file_services.s3_service import S3Service
 from extractor_api_lib.impl.mapper.confluence_langchain_document2information_piece import (
     ConfluenceLangchainDocument2InformationPiece,
@@ -40,11 +36,13 @@ class DependencyContainer(DeclarativeContainer):
     intern2external = Singleton(Internal2ExternalInformationPiece)
     langchain_document2information_piece = Singleton(ConfluenceLangchainDocument2InformationPiece)
 
-    all_extractors = List(pdf_extractor, ms_docs_extractor, xml_extractor)
+    file_extractors = List(pdf_extractor, ms_docs_extractor, xml_extractor)
 
-    general_extractor = Singleton(GeneralExtractor, file_service, all_extractors)
+    general_file_extractor = Singleton(GeneralFileExtractor, file_service, file_extractors, intern2external)
+    confluence_extractor = Singleton(ConfluenceExtractor, mapper=langchain_document2information_piece)
 
-    file_extractor = Singleton(
-        DefaultFileExtractor, information_extractor=general_extractor, file_service=file_service, mapper=intern2external
+    source_extractor = Singleton(
+        GeneralSourceExtractor,
+        mapper=intern2external,
+        available_extractors=List(confluence_extractor),
+    )
-    confluence_extractor = Singleton(DefaultConfluenceExtractor, mapper=langchain_document2information_piece)
diff --git a/extractor-api-lib/src/extractor_api_lib/document_parser/__init__.py b/extractor-api-lib/src/extractor_api_lib/document_parser/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/admin-api-lib/src/admin_api_lib/file_services/__init__.py b/extractor-api-lib/src/extractor_api_lib/extractors/__init__.py
similarity index 100%
rename from admin-api-lib/src/admin_api_lib/file_services/__init__.py
rename to extractor-api-lib/src/extractor_api_lib/extractors/__init__.py
diff --git a/extractor-api-lib/src/extractor_api_lib/extractors/information_extractor.py b/extractor-api-lib/src/extractor_api_lib/extractors/information_extractor.py
new file mode 100644
index 0000000..3a6ee68
--- /dev/null
+++ b/extractor-api-lib/src/extractor_api_lib/extractors/information_extractor.py
@@ -0,0 +1,35 @@
+"""Module for the base class of information extractors."""
+
+from abc import ABC, abstractmethod
+
+
+from extractor_api_lib.models.extraction_parameters import ExtractionParameters
+from extractor_api_lib.impl.types.extractor_types import ExtractorTypes
+from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece
+
+
+class InformationExtractor(ABC):
+    """Base class for information extractors."""
+
+    @property
+    @abstractmethod
+    def extractor_type(self) -> ExtractorTypes:
+        """The type of source this extractor can handle."""
+
+    @abstractmethod
+    async def aextract_content(
+        self,
+        extraction_parameters: ExtractionParameters,
+    ) -> list[InternalInformationPiece]:
+        """
+        Extract content from a source.
+
+        Parameters
+        ----------
+        extraction_parameters : ExtractionParameters
+            The parameters used to extract information from the source.
+
+        Returns
+        -------
+        list[InternalInformationPiece]
+            The extracted information.
+        """
diff --git a/extractor-api-lib/src/extractor_api_lib/document_parser/information_extractor.py b/extractor-api-lib/src/extractor_api_lib/extractors/information_file_extractor.py
similarity index 78%
rename from extractor-api-lib/src/extractor_api_lib/document_parser/information_extractor.py
rename to extractor-api-lib/src/extractor_api_lib/extractors/information_file_extractor.py
index 0c3c4ce..7897c19 100644
--- a/extractor-api-lib/src/extractor_api_lib/document_parser/information_extractor.py
+++ b/extractor-api-lib/src/extractor_api_lib/extractors/information_file_extractor.py
@@ -3,13 +3,13 @@
 from abc import ABC, abstractmethod
 from pathlib import Path
 
-from extractor_api_lib.file_services.file_service import FileService
 from extractor_api_lib.impl.types.file_type import FileType
-from extractor_api_lib.models.dataclasses.information_piece import InformationPiece
+from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece
+from extractor_api_lib.file_services.file_service import FileService
 
 
-class InformationExtractor(ABC):
-    """Base class for Information extractors."""
+class InformationFileExtractor(ABC):
+    """Base class for Information file extractors."""
 
     def __init__(self, file_service: FileService):
         """Initialize the InformationExtractor.
@@ -34,7 +34,7 @@ def compatible_file_types(self) -> list[FileType]:
         """
 
     @abstractmethod
-    def extract_content(self, file_path: Path) -> list[InformationPiece]:
+    async def aextract_content(self, file_path: Path, name: str) -> list[InternalInformationPiece]:
         """
         Extract content from given file.
@@ -42,6 +42,8 @@ def extract_content(self, file_path: Path) -> list[InformationPiece]: ---------- file_path : Path Path to the file the information should be extracted from. + name : str + Name of the document. Returns ------- diff --git a/extractor-api-lib/src/extractor_api_lib/file_services/__init__.py b/extractor-api-lib/src/extractor_api_lib/file_services/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/default_confluence_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/default_confluence_extractor.py deleted file mode 100644 index b752f6c..0000000 --- a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/default_confluence_extractor.py +++ /dev/null @@ -1,57 +0,0 @@ -"""Module for the DefaultConfluenceExtractor class.""" - -from langchain_community.document_loaders import ConfluenceLoader - -from extractor_api_lib.api_endpoints.confluence_extractor import ConfluenceExtractor -from extractor_api_lib.impl.mapper.confluence_langchain_document2information_piece import ( - ConfluenceLangchainDocument2InformationPiece, -) -from extractor_api_lib.models.confluence_parameters import ConfluenceParameters -from extractor_api_lib.models.information_piece import InformationPiece - - -class DefaultConfluenceExtractor(ConfluenceExtractor): - """Default implementation of the FileExtractor interface.""" - - MIN_PAGE_CONTENT_LENGTH = 10 - - def __init__( - self, - mapper: ConfluenceLangchainDocument2InformationPiece, - ): - """ - Initialize the DefaultConfluenceExtractor. - - Parameters - ---------- - mapper : ConfluenceLangchainDocument2InformationPiece - An instance of ConfluenceLangchainDocument2InformationPiece used for mapping langchain documents - to information pieces. - """ - self.mapper = mapper - - async def aextract_from_confluence(self, confluence_parameters: ConfluenceParameters) -> list[InformationPiece]: - """ - Asynchronously extracts information pieces from Confluence. - - Parameters - ---------- - confluence_parameters : ConfluenceParameters - The parameters required to connect to and extract data from Confluence. - - Returns - ------- - list[InformationPiece] - A list of information pieces extracted from Confluence. 
- """ - self.mapper.confluence_parameters = confluence_parameters - confluence_kwargs = {} - for ckwargs in confluence_parameters.confluence_kwargs: - confluence_kwargs[ckwargs.key] = ckwargs.value - confluence_loader_parameters = confluence_parameters.model_dump() - confluence_loader_parameters["confluence_kwargs"] = confluence_kwargs - # Drop the document_name parameter as it is not used by the ConfluenceLoader - confluence_loader_parameters.pop("document_name", None) - document_loader = ConfluenceLoader(**confluence_loader_parameters) - documents = document_loader.load() - return [self.mapper.map_document2informationpiece(x) for x in documents] diff --git a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/default_file_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/default_file_extractor.py deleted file mode 100644 index 787997b..0000000 --- a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/default_file_extractor.py +++ /dev/null @@ -1,65 +0,0 @@ -"""Module for the DefaultFileExtractor class.""" - -import tempfile -from pathlib import Path - -from extractor_api_lib.api_endpoints.file_extractor import FileExtractor -from extractor_api_lib.document_parser.information_extractor import InformationExtractor -from extractor_api_lib.file_services.file_service import FileService -from extractor_api_lib.impl.mapper.internal2external_information_piece import ( - Internal2ExternalInformationPiece, -) -from extractor_api_lib.models.extraction_request import ExtractionRequest -from extractor_api_lib.models.information_piece import InformationPiece - - -class DefaultFileExtractor(FileExtractor): - """Default implementation of the FileExtractor interface.""" - - def __init__( - self, - information_extractor: InformationExtractor, - file_service: FileService, - mapper: Internal2ExternalInformationPiece, - ): - """ - Initialize the DefaultFileExtractor. - - Parameters - ---------- - information_extractor : InformationExtractor - An instance of InformationExtractor to extract information from files. - file_service : FileService - An instance of FileService to handle file operations. - mapper : Internal2ExternalInformationPiece - An instance of Internal2ExternalInformationPiece to map internal information to external format. - """ - self.information_extractor = information_extractor - self.file_service = file_service - self.mapper = mapper - - async def aextract_information( - self, - extraction_request: ExtractionRequest, - ) -> list[InformationPiece]: - """ - Extract information from a document specified in the extraction request. - - Parameters - ---------- - extraction_request : ExtractionRequest - The request containing details about the document to be extracted, including its path on S3. - - Returns - ------- - list[InformationPiece] - A list of extracted information pieces from the document, where each piece contains non-null page content. 
-        """
-        with tempfile.TemporaryDirectory() as temp_dir:
-            temp_file_path = Path(temp_dir) / extraction_request.path_on_s3
-
-            with open(temp_file_path, "wb") as temp_file:
-                self.file_service.download_file(extraction_request.path_on_s3, temp_file)
-
-            results = self.information_extractor.extract_content(temp_file_path)
-            return [self.mapper.map_internal_to_external(x) for x in results if x.page_content is not None]
diff --git a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_file_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_file_extractor.py
new file mode 100644
index 0000000..fee7db2
--- /dev/null
+++ b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_file_extractor.py
@@ -0,0 +1,80 @@
+"""Module for the GeneralFileExtractor class."""
+
+import logging
+from pathlib import Path
+import tempfile
+import traceback
+
+
+from extractor_api_lib.api_endpoints.file_extractor import FileExtractor
+from extractor_api_lib.impl.mapper.internal2external_information_piece import Internal2ExternalInformationPiece
+from extractor_api_lib.models.extraction_request import ExtractionRequest
+from extractor_api_lib.file_services.file_service import FileService
+from extractor_api_lib.extractors.information_file_extractor import InformationFileExtractor
+from extractor_api_lib.models.information_piece import InformationPiece
+
+logger = logging.getLogger(__name__)
+
+
+class GeneralFileExtractor(FileExtractor):
+    """A class to extract information from documents using available extractors.
+
+    This class serves as a general extractor that utilizes a list of available
+    information extractors to extract content from documents. It determines the
+    appropriate extractor based on the file type of the document.
+    """
+
+    def __init__(
+        self,
+        file_service: FileService,
+        available_extractors: list[InformationFileExtractor],
+        mapper: Internal2ExternalInformationPiece,
+    ):
+        """
+        Initialize the GeneralFileExtractor.
+
+        Parameters
+        ----------
+        file_service : FileService
+            An instance of FileService to handle file operations.
+        available_extractors : list of InformationFileExtractor
+            A list of available information file extractors to be used by the GeneralFileExtractor.
+        mapper : Internal2ExternalInformationPiece
+            Mapper that converts the internal representation of an information piece to the external one.
+        """
+        self._file_service = file_service
+        self._available_extractors = available_extractors
+        self._mapper = mapper
+
+    async def aextract_information(self, extraction_request: ExtractionRequest) -> list[InformationPiece]:
+        """
+        Extract information from the file referenced by the extraction request.
+
+        Parameters
+        ----------
+        extraction_request : ExtractionRequest
+            The request containing the S3 path and the document name of the file.
+
+        Returns
+        -------
+        list[InformationPiece]
+            The extracted information.
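+
+        Examples
+        --------
+        A rough sketch of the expected call; the values are illustrative only:
+
+        >>> request = ExtractionRequest(path_on_s3="upload/report.pdf", document_name="report")
+        >>> pieces = await general_file_extractor.aextract_information(request)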
+        """
+        try:
+            with tempfile.TemporaryDirectory() as temp_dir:
+                temp_file_path = Path(temp_dir) / Path(extraction_request.path_on_s3).name
+                with open(temp_file_path, "wb") as temp_file:
+                    self._file_service.download_file(extraction_request.path_on_s3, temp_file)
+                logger.debug("Temporary file created at %s and content written.", temp_file_path)
+                file_type = str(temp_file_path).split(".")[-1].upper()
+                correct_extractors = [
+                    x for x in self._available_extractors if file_type in [y.value for y in x.compatible_file_types]
+                ]
+                if not correct_extractors:
+                    raise ValueError(f"No extractor found for file-ending {file_type}")
+                results = await correct_extractors[-1].aextract_content(
+                    temp_file_path, extraction_request.document_name
+                )
+                return [self._mapper.map_internal_to_external(x) for x in results if x.page_content is not None]
+        except Exception as e:
+            logger.error("Error during document parsing: %s %s", e, traceback.format_exc())
+            raise
diff --git a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py
new file mode 100644
index 0000000..10d8cd5
--- /dev/null
+++ b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py
@@ -0,0 +1,60 @@
+"""Module for the GeneralSourceExtractor class."""
+
+import logging
+
+from extractor_api_lib.models.extraction_parameters import ExtractionParameters
+from extractor_api_lib.extractors.information_extractor import InformationExtractor
+from extractor_api_lib.models.information_piece import InformationPiece
+from extractor_api_lib.impl.mapper.internal2external_information_piece import Internal2ExternalInformationPiece
+from extractor_api_lib.api_endpoints.source_extractor import SourceExtractor
+
+
+logger = logging.getLogger(__name__)
+
+
+class GeneralSourceExtractor(SourceExtractor):
+    """A class to extract information from non-file sources using the available extractors.
+
+    This class serves as a general extractor that utilizes a list of available
+    information extractors to extract content from sources. It determines the
+    appropriate extractor based on the source type of the extraction parameters.
+    """
+
+    def __init__(self, available_extractors: list[InformationExtractor], mapper: Internal2ExternalInformationPiece):
+        """
+        Initialize the GeneralSourceExtractor.
+
+        Parameters
+        ----------
+        available_extractors : list of InformationExtractor
+            A list of available information extractors to be used by the GeneralSourceExtractor.
+        mapper : Internal2ExternalInformationPiece
+            Mapper for mapping the internal representation to the external one.
+        """
+        self._mapper = mapper
+        self._available_extractors = available_extractors
+
+    async def aextract_information(
+        self,
+        extraction_parameters: ExtractionParameters,
+    ) -> list[InformationPiece]:
+        """
+        Extract information from a source using the given parameters.
+
+        Parameters
+        ----------
+        extraction_parameters : ExtractionParameters
+            The parameters used to extract information from the source.
+
+        Returns
+        -------
+        list[InformationPiece]
+            A list of extracted information pieces.
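+
+        Raises
+        ------
+        ValueError
+            If none of the registered extractors matches the given ``source_type``.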
+ """ + correct_extractors = [ + x for x in self._available_extractors if extraction_parameters.source_type == x.extractor_type + ] + if not correct_extractors: + raise ValueError(f"No extractor found for type {extraction_parameters.source_type}") + results = await correct_extractors[-1].aextract_content(extraction_parameters) + return [self._mapper.map_internal_to_external(x) for x in results if x.page_content is not None] diff --git a/extractor-api-lib/src/extractor_api_lib/impl/document_parser/__init__.py b/extractor-api-lib/src/extractor_api_lib/impl/document_parser/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/extractor-api-lib/src/extractor_api_lib/impl/document_parser/general_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/document_parser/general_extractor.py deleted file mode 100644 index 05946bf..0000000 --- a/extractor-api-lib/src/extractor_api_lib/impl/document_parser/general_extractor.py +++ /dev/null @@ -1,66 +0,0 @@ -"""Module for the GeneralExtractor class.""" - -from pathlib import Path - -from extractor_api_lib.document_parser.information_extractor import InformationExtractor -from extractor_api_lib.file_services.file_service import FileService -from extractor_api_lib.impl.types.file_type import FileType -from extractor_api_lib.models.dataclasses.information_piece import InformationPiece - - -class GeneralExtractor(InformationExtractor): - """A class to extract information from documents using available extractors. - - This class serves as a general extractor that utilizes a list of available - information extractors to extract content from documents. It determines the - appropriate extractor based on the file type of the document. - """ - - def __init__(self, file_service: FileService, available_extractors: list[InformationExtractor]): - """ - Initialize the GeneralExtractor. - - Parameters - ---------- - file_service : FileService - An instance of FileService to handle file operations. - available_extractors : list of InformationExtractor - A list of available information extractors to be used by the GeneralExtractor. - """ - super().__init__(file_service=file_service) - - self._available_extractors = available_extractors - - @property - def compatible_file_types(self) -> list[FileType]: - """ - List of compatible file types for the document parser. - - Returns - ------- - list[FileType] - A list containing the compatible file types. By default, it returns a list with FileType.NONE. - """ - return [FileType.NONE] - - def extract_content(self, file_path: Path) -> list[InformationPiece]: - """ - Extract content from given file. - - Parameters - ---------- - file_path : Path - Path to the file the information should be extracted from. - - Returns - ------- - list[InformationPiece] - The extracted information. 
-        """
-        file_type = str(file_path).split(".")[-1].upper()
-        correct_extractors = [
-            x for x in self._available_extractors if file_type in [y.value for y in x.compatible_file_types]
-        ]
-        if not correct_extractors:
-            raise ValueError(f"No extractor found for file-ending {file_type}")
-        return correct_extractors[-1].extract_content(file_path)
diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py b/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py
index d4a3760..b1aa8c1 100644
--- a/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py
+++ b/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py
@@ -1,15 +1,15 @@
 """Module for the implementation of the ExtractorApi interface."""
 
-from dependency_injector.wiring import Provide, inject
 from fastapi import Depends
+from dependency_injector.wiring import Provide, inject
 
-from extractor_api_lib.api_endpoints.confluence_extractor import ConfluenceExtractor
 from extractor_api_lib.api_endpoints.file_extractor import FileExtractor
-from extractor_api_lib.apis.extractor_api_base import BaseExtractorApi
-from extractor_api_lib.dependency_container import DependencyContainer
-from extractor_api_lib.models.confluence_parameters import ConfluenceParameters
+from extractor_api_lib.api_endpoints.source_extractor import SourceExtractor
+from extractor_api_lib.models.extraction_parameters import ExtractionParameters
 from extractor_api_lib.models.extraction_request import ExtractionRequest
 from extractor_api_lib.models.information_piece import InformationPiece
+from extractor_api_lib.apis.extractor_api_base import BaseExtractorApi
+from extractor_api_lib.dependency_container import DependencyContainer
 
 
 class ExtractorApiImpl(BaseExtractorApi):
@@ -19,7 +19,7 @@ class ExtractorApiImpl(BaseExtractorApi):
     async def extract_from_file_post(
         self,
         extraction_request: ExtractionRequest,
-        file_extractor: FileExtractor = Depends(Provide[DependencyContainer.file_extractor]),
+        extractor: FileExtractor = Depends(Provide[DependencyContainer.general_file_extractor]),
     ) -> list[InformationPiece]:
         """
         Extract information from a file based on the provided extraction request.
@@ -28,35 +28,34 @@ async def extract_from_file_post(
         ----------
         extraction_request : ExtractionRequest
             The request containing details about the extraction process.
-        file_extractor : FileExtractor, optional
-            The file extractor dependency, by default Depends(Provide[DependencyContainer.file_extractor]).
+        extractor : FileExtractor, optional
+            The file extractor dependency.
 
         Returns
         -------
         list[InformationPiece]
             A list of extracted information pieces.
         """
-        return await file_extractor.aextract_information(extraction_request)
+        return await extractor.aextract_information(extraction_request)
 
     @inject
-    async def extract_from_confluence_post(
+    async def extract_from_source(
         self,
-        confluence_parameters: ConfluenceParameters,
-        confluence_extractor: ConfluenceExtractor = Depends(Provide[DependencyContainer.confluence_extractor]),
+        extraction_parameters: ExtractionParameters,
+        extractor: SourceExtractor = Depends(Provide[DependencyContainer.source_extractor]),
     ) -> list[InformationPiece]:
         """
-        Extract information from Confluence asynchronously.
+        Extract information from a source (e.g. Confluence) asynchronously.
 
         Parameters
         ----------
-        confluence_parameters : ConfluenceParameters
-            Parameters required to extract information from Confluence.
- confluence_extractor : ConfluenceExtractor, optional - The Confluence extractor instance (default is provided by DependencyContainer). + extraction_parameters : ExtractionParameters + Parameters required to extract information from source. + extractor : SourceExtractor, optional + The source extractor instance. Returns ------- list[InformationPiece] - A list of extracted information pieces from the configured Confluence space. + A list of extracted information pieces. """ - return await confluence_extractor.aextract_from_confluence(confluence_parameters) + return await extractor.aextract_information(extraction_parameters) diff --git a/admin-api-lib/src/admin_api_lib/impl/utils/__init__.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/__init__.py similarity index 100% rename from admin-api-lib/src/admin_api_lib/impl/utils/__init__.py rename to extractor-api-lib/src/extractor_api_lib/impl/extractors/__init__.py diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py new file mode 100644 index 0000000..f1c15a6 --- /dev/null +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py @@ -0,0 +1,62 @@ +"""Module for the DefaultConfluenceExtractor class.""" + +from langchain_community.document_loaders import ConfluenceLoader + +from extractor_api_lib.impl.types.extractor_types import ExtractorTypes +from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece +from extractor_api_lib.models.extraction_parameters import ExtractionParameters +from extractor_api_lib.extractors.information_extractor import InformationExtractor +from extractor_api_lib.impl.mapper.confluence_langchain_document2information_piece import ( + ConfluenceLangchainDocument2InformationPiece, +) + + +class ConfluenceExtractor(InformationExtractor): + """Implementation of the InformationExtractor interface for confluence.""" + + def __init__( + self, + mapper: ConfluenceLangchainDocument2InformationPiece, + ): + """ + Initialize the ConfluenceExtractor. + + Parameters + ---------- + mapper : ConfluenceLangchainDocument2InformationPiece + An instance of ConfluenceLangchainDocument2InformationPiece used for mapping langchain documents + to information pieces. + """ + self.mapper = mapper + + @property + def extractor_type(self) -> ExtractorTypes: + return ExtractorTypes.CONFLUENCE + + async def aextract_content( + self, + extraction_parameters: ExtractionParameters, + ) -> list[InternalInformationPiece]: + """ + Asynchronously extracts information pieces from Confluence. + + Parameters + ---------- + extraction_parameters : ExtractionParameters + The parameters required to connect to and extract data from Confluence. + + Returns + ------- + list[InternalInformationPiece] + A list of information pieces extracted from Confluence. 
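+
+        Notes
+        -----
+        The ``kwargs`` key-value pairs are passed through to the langchain
+        ``ConfluenceLoader``, so they typically include at least ``url``,
+        ``token`` and ``space_key``. A sketch with illustrative values only:
+
+        >>> parameters = ExtractionParameters(
+        ...     document_name="my-space",
+        ...     source_type="confluence",
+        ...     kwargs=[
+        ...         KeyValuePair(key="url", value="https://example.atlassian.net/wiki"),
+        ...         KeyValuePair(key="token", value="<api-token>"),
+        ...         KeyValuePair(key="space_key", value="SPACE"),
+        ...     ],
+        ... )
+        >>> pieces = await confluence_extractor.aextract_content(parameters)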
+ """ + # Convert list of key value pairs to dict + confluence_loader_parameters = { + x.key: int(x.value) if x.value.isdigit() else x.value for x in extraction_parameters.kwargs + } + # Drop the document_name parameter as it is not used by the ConfluenceLoader + if "document_name" in confluence_loader_parameters: + confluence_loader_parameters.pop("document_name", None) + document_loader = ConfluenceLoader(**confluence_loader_parameters) + documents = document_loader.load() + return [self.mapper.map_document2informationpiece(x, extraction_parameters.document_name) for x in documents] diff --git a/admin-api-lib/tests/settings/__init__.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/__init__.py similarity index 100% rename from admin-api-lib/tests/settings/__init__.py rename to extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/__init__.py diff --git a/extractor-api-lib/src/extractor_api_lib/impl/document_parser/ms_docs_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/ms_docs_extractor.py similarity index 89% rename from extractor-api-lib/src/extractor_api_lib/impl/document_parser/ms_docs_extractor.py rename to extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/ms_docs_extractor.py index 8bb23ca..5201c62 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/document_parser/ms_docs_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/ms_docs_extractor.py @@ -10,18 +10,19 @@ from unstructured.partition.docx import partition_docx from unstructured.partition.pptx import partition_pptx -from extractor_api_lib.document_parser.information_extractor import InformationExtractor + from extractor_api_lib.file_services.file_service import FileService +from extractor_api_lib.extractors.information_file_extractor import InformationFileExtractor from extractor_api_lib.impl.types.content_type import ContentType from extractor_api_lib.impl.types.file_type import FileType from extractor_api_lib.impl.utils.utils import hash_datetime -from extractor_api_lib.models.dataclasses.information_piece import InformationPiece +from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece from extractor_api_lib.table_converter.dataframe_converter import DataframeConverter logger = logging.getLogger(__name__) -class MSDocsExtractor(InformationExtractor): +class MSDocsExtractor(InformationFileExtractor): """Extractor for Microsoft Documents (DOCX and PPTX) using unstructured library.""" def __init__(self, file_service: FileService, dataframe_converter: DataframeConverter): @@ -50,7 +51,7 @@ def compatible_file_types(self) -> list[FileType]: """ return [FileType.DOCX, FileType.PPTX] - def extract_content(self, file_path: Path) -> list[InformationPiece]: + async def aextract_content(self, file_path: Path, name: str) -> list[InternalInformationPiece]: """ Extract content from a given file based on its extension. @@ -58,7 +59,8 @@ def extract_content(self, file_path: Path) -> list[InformationPiece]: ---------- file_path : Path The path to the file from which content is to be extracted. - + name : str + Name of the document. 
Returns ------- list[InformationPiece] @@ -92,8 +94,8 @@ def extract_content(self, file_path: Path) -> list[InformationPiece]: return self._process_elements(elements, file_path.name) - def _process_elements(self, elements: list[Element], document_name: str) -> list[InformationPiece]: - processed_elements: list[InformationPiece] = [] + def _process_elements(self, elements: list[Element], document_name: str) -> list[InternalInformationPiece]: + processed_elements: list[InternalInformationPiece] = [] page_content_lines: list[tuple[str, str]] = [] current_page: int = 1 old_page: int = 1 @@ -118,7 +120,7 @@ def _process_element( self, el: Element, page_content_lines: list[tuple[str, str]], - processed_elements: list[InformationPiece], + processed_elements: list[InternalInformationPiece], document_name: str, current_page: int, ) -> None: @@ -154,7 +156,7 @@ def _process_table(self, el: Element, page_content_lines: list[tuple[str, str]]) def _create_text_piece( self, document_name: str, page: int, page_content_lines: list[tuple[str, str]] - ) -> InformationPiece: + ) -> InternalInformationPiece: content = "\n".join([content for _, content in page_content_lines]) return self._create_information_piece(document_name, page, content, ContentType.TEXT) @@ -165,7 +167,7 @@ def _create_information_piece( content: str, content_type: ContentType, additional_meta: Optional[dict[str, Any]] = None, - ) -> InformationPiece: + ) -> InternalInformationPiece: metadata = { "document": document_name, "page": page, @@ -174,7 +176,7 @@ def _create_information_piece( } if additional_meta: metadata.update(additional_meta) - return InformationPiece( + return InternalInformationPiece( type=content_type, metadata=metadata, page_content=content, diff --git a/extractor-api-lib/src/extractor_api_lib/impl/document_parser/pdf_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/pdf_extractor.py similarity index 94% rename from extractor-api-lib/src/extractor_api_lib/impl/document_parser/pdf_extractor.py rename to extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/pdf_extractor.py index beaee14..928998f 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/document_parser/pdf_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/pdf_extractor.py @@ -14,19 +14,20 @@ from pdf2image import convert_from_path from pdfplumber.page import Page -from extractor_api_lib.document_parser.information_extractor import InformationExtractor -from extractor_api_lib.file_services.file_service import FileService + from extractor_api_lib.impl.settings.pdf_extractor_settings import PDFExtractorSettings from extractor_api_lib.impl.types.content_type import ContentType from extractor_api_lib.impl.types.file_type import FileType from extractor_api_lib.impl.utils.utils import hash_datetime -from extractor_api_lib.models.dataclasses.information_piece import InformationPiece +from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece from extractor_api_lib.table_converter.dataframe_converter import DataframeConverter +from extractor_api_lib.file_services.file_service import FileService +from extractor_api_lib.extractors.information_file_extractor import InformationFileExtractor logger = logging.getLogger(__name__) -class PDFExtractor(InformationExtractor): +class PDFExtractor(InformationFileExtractor): """PDFExtractor is a class responsible for extracting information from PDF files. 
 It converts PDF pages to images, identifies table/figure coordinates, and extracts
@@ -35,7 +36,7 @@ class PDFExtractor(InformationExtractor):
     Attributes
     ----------
     TITLE_PATTERN : re.Pattern
         Regular expression pattern to identify titles in the text.
     TITLE_PATTERN_MULTILINE : re.Pattern
         Regular expression pattern to identify titles in the text with multiline support.
     """
@@ -86,7 +87,7 @@ def _create_information_piece(
         content_type: ContentType,
         information_id: str,
         additional_meta: Optional[dict] = None,
-    ) -> InformationPiece:
+    ) -> InternalInformationPiece:
         metadata = {
             "document": document_name,
             "page": page,
@@ -96,19 +97,21 @@ def _create_information_piece(
         }
         if additional_meta:
             metadata = metadata | additional_meta
-        return InformationPiece(
+        return InternalInformationPiece(
             type=content_type,
             metadata=metadata,
             page_content=content,
         )
 
-    def extract_content(self, file_path: Path) -> list[InformationPiece]:
+    async def aextract_content(self, file_path: Path, name: str) -> list[InternalInformationPiece]:
         """Extract content from given file.
 
         Parameters
         ----------
         file_path : Path
             Path to the file the information should be extracted from.
+        name : str
+            Name of the document.
 
         Returns
         -------
@@ -134,7 +137,7 @@ def extract_content(self, file_path: Path) -> list[InformationPiece]:
                 page=page,
                 temp_dir=temp_dir,
                 title=current_title,
-                document_name=file_path.name,
+                document_name=name,
             )
             pdf_elements += new_pdf_elements
 
@@ -147,7 +150,7 @@ def _extract_tabluar_data(
         self,
         page: Page,
         document_name: str,
         text_x_tolerance: int = 1,
         text_y_tolerance: int = 1,
-    ) -> list[InformationPiece]:
+    ) -> list[InternalInformationPiece]:
         return_value = []
         pdfplumber_tables = page.find_tables()
         table_strings = []
diff --git a/extractor-api-lib/src/extractor_api_lib/impl/document_parser/xml_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/xml_extractor.py
similarity index 84%
rename from extractor-api-lib/src/extractor_api_lib/impl/document_parser/xml_extractor.py
rename to extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/xml_extractor.py
index 3478cab..d72292a 100644
--- a/extractor-api-lib/src/extractor_api_lib/impl/document_parser/xml_extractor.py
+++ b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/xml_extractor.py
@@ -5,20 +5,21 @@
 from pathlib import Path
 from typing import Any, Optional
 
+
 from unstructured.documents.elements import Element
 from unstructured.partition.xml import partition_xml
 
-from extractor_api_lib.document_parser.information_extractor import InformationExtractor
 from extractor_api_lib.file_services.file_service import FileService
+from extractor_api_lib.extractors.information_file_extractor import InformationFileExtractor
 from extractor_api_lib.impl.types.content_type import ContentType
 from extractor_api_lib.impl.types.file_type import FileType
 from extractor_api_lib.impl.utils.utils import hash_datetime
-from extractor_api_lib.models.dataclasses.information_piece import InformationPiece
+from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece
 
 logger = logging.getLogger(__name__)
 
 
-class XMLExtractor(InformationExtractor):
+class XMLExtractor(InformationFileExtractor):
     """Extractor for XML documents using unstructured library."""
 
     def __init__(self, file_service: FileService):
@@ -43,7 +44,7 @@ def compatible_file_types(self) -> list[FileType]:
         """
         return [FileType.XML]
 
-    def extract_content(self, 
file_path: Path) -> list[InformationPiece]: + async def aextract_content(self, file_path: Path, name: str) -> list[InternalInformationPiece]: """ Extract content from an XML file and processes the elements. @@ -51,6 +52,8 @@ def extract_content(self, file_path: Path) -> list[InformationPiece]: ---------- file_path : Path The path to the XML file to be processed. + name : str + Name of the document. Returns ------- @@ -60,8 +63,8 @@ def extract_content(self, file_path: Path) -> list[InformationPiece]: elements = partition_xml(filename=file_path.as_posix(), xml_keep_tags=False) return self._process_elements(elements, file_path.name) - def _process_elements(self, elements: list[Element], document_name: str) -> list[InformationPiece]: - processed_elements: list[InformationPiece] = [] + def _process_elements(self, elements: list[Element], document_name: str) -> list[InternalInformationPiece]: + processed_elements: list[InternalInformationPiece] = [] content_lines: list[tuple[str, str]] = [] for el in elements: @@ -86,7 +89,7 @@ def _sanitize_text(self, text: str) -> str: text = re.sub(r"\s+", " ", text) return text.strip() - def _create_text_piece(self, document_name: str, content_lines: list[tuple[str, str]]) -> InformationPiece: + def _create_text_piece(self, document_name: str, content_lines: list[tuple[str, str]]) -> InternalInformationPiece: content = "\n".join([content for _, content in content_lines]) return self._create_information_piece(document_name, content, ContentType.TEXT) @@ -96,7 +99,7 @@ def _create_information_piece( content: str, content_type: ContentType, additional_meta: Optional[dict[str, Any]] = None, - ) -> InformationPiece: + ) -> InternalInformationPiece: metadata = { "document": document_name, "id": hash_datetime(), @@ -104,7 +107,7 @@ def _create_information_piece( } if additional_meta: metadata.update(additional_meta) - return InformationPiece( + return InternalInformationPiece( type=content_type, metadata=metadata, page_content=content, diff --git a/extractor-api-lib/src/extractor_api_lib/impl/file_services/__init__.py b/extractor-api-lib/src/extractor_api_lib/impl/file_services/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/extractor-api-lib/src/extractor_api_lib/impl/mapper/confluence_langchain_document2information_piece.py b/extractor-api-lib/src/extractor_api_lib/impl/mapper/confluence_langchain_document2information_piece.py index 96e6efe..a7bcb0d 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/mapper/confluence_langchain_document2information_piece.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/mapper/confluence_langchain_document2information_piece.py @@ -2,10 +2,8 @@ from langchain_core.documents import Document as LangchainDocument -from extractor_api_lib.models.confluence_parameters import ConfluenceParameters +from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece from extractor_api_lib.models.content_type import ContentType -from extractor_api_lib.models.information_piece import InformationPiece -from extractor_api_lib.models.key_value_pair import KeyValuePair as MetaInformationPiece class ConfluenceLangchainDocument2InformationPiece: @@ -35,35 +33,9 @@ class ConfluenceLangchainDocument2InformationPiece: USE_CASE_RELATED_KEY = "related" DOCUMENT_KEY = "document" - def __init__(self) -> None: - """Initialize the ConfluenceLangchainDocument2InformationPiece instance.""" - self._confluence_parameters = None - - @property - def confluence_parameters(self): - """ - Property 
that returns the Confluence parameters. - - Returns - ------- - dict - A dictionary containing the Confluence parameters. - """ - return self._confluence_parameters - - @confluence_parameters.setter - def confluence_parameters(self, confluence_parameters: ConfluenceParameters): - """ - Set the confluence parameters. - - Parameters - ---------- - confluence_parameters : ConfluenceParameters - The confluence parameters to be set. - """ - self._confluence_parameters = confluence_parameters - - def map_document2informationpiece(self, document: LangchainDocument) -> InformationPiece: + def map_document2informationpiece( + self, document: LangchainDocument, document_name: str + ) -> InternalInformationPiece: """ Map a LangchainDocument to an InformationPiece. @@ -82,27 +54,18 @@ def map_document2informationpiece(self, document: LangchainDocument) -> Informat ValueError If Confluence parameters are not set before mapping documents. """ - if self._confluence_parameters is None: - raise ValueError("Confluence parameters must be set before mapping documents") - - meta = self._map_meta(document.metadata) - return InformationPiece(page_content=document.page_content, type=ContentType.TEXT, metadata=meta) + meta = self._map_meta(document.metadata, document_name) + return InternalInformationPiece(page_content=document.page_content, type=ContentType.TEXT, metadata=meta) - def _map_meta(self, internal: dict) -> list[MetaInformationPiece]: - metadata = [] + def _map_meta(self, internal: dict, document_name: str) -> dict: + metadata = {} for key, value in internal.items(): - metadata.append( - MetaInformationPiece( - key=self.USE_CASE_DOCUMENT_URL_KEY if key == self.CONFLUENCE_LOADER_SOURCE_URL_KEY else key, - value=value, - ) - ) - page_title_matches = [m.value for m in metadata if m.key == self.CONFLUENCE_LOADER_TITLE_KEY] + metadata[self.USE_CASE_DOCUMENT_URL_KEY if key == self.CONFLUENCE_LOADER_SOURCE_URL_KEY else key] = value + + page_title_matches = [v for k, v in metadata.items() if k == self.CONFLUENCE_LOADER_TITLE_KEY] page_title = page_title_matches[0] if page_title_matches else "Unknown Title" - metadata.append(MetaInformationPiece(key=self.USER_CASE_PAGE_KEY, value=page_title)) - metadata.append( - MetaInformationPiece(key=self.DOCUMENT_KEY, value=self._confluence_parameters.document_name) - ) - metadata.append(MetaInformationPiece(key=self.USE_CASE_RELATED_KEY, value=[])) + metadata[self.USER_CASE_PAGE_KEY] = page_title + metadata[self.DOCUMENT_KEY] = document_name + metadata[self.USE_CASE_RELATED_KEY] = [] return metadata diff --git a/extractor-api-lib/src/extractor_api_lib/impl/mapper/internal2external_information_piece.py b/extractor-api-lib/src/extractor_api_lib/impl/mapper/internal2external_information_piece.py index a4da430..6c4d6b8 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/mapper/internal2external_information_piece.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/mapper/internal2external_information_piece.py @@ -2,12 +2,8 @@ from extractor_api_lib.impl.types.content_type import ContentType as InternalContentType from extractor_api_lib.models.content_type import ContentType as ExternalContentType -from extractor_api_lib.models.dataclasses.information_piece import ( - InformationPiece as InternalInformationPiece, -) -from extractor_api_lib.models.information_piece import ( - InformationPiece as ExternalInformationPiece, -) +from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece +from 
extractor_api_lib.models.information_piece import InformationPiece from extractor_api_lib.models.key_value_pair import KeyValuePair as MetaInformationPiece @@ -27,7 +23,7 @@ class Internal2ExternalInformationPiece: InternalContentType.TABLE: ExternalContentType.TABLE, } - def map_internal_to_external(self, internal: InternalInformationPiece) -> ExternalInformationPiece: + def map_internal_to_external(self, internal: InternalInformationPiece) -> InformationPiece: """Map an InternalInformationPiece object to an ExternalInformationPiece object. Parameters @@ -42,7 +38,7 @@ def map_internal_to_external(self, internal: InternalInformationPiece) -> Extern """ information_type = self._map_information_type(internal.type) meta = self._map_meta(internal.metadata) - return ExternalInformationPiece(page_content=internal.page_content, type=information_type, metadata=meta) + return InformationPiece(page_content=internal.page_content, type=information_type, metadata=meta) def _map_information_type(self, internal: InternalContentType) -> ExternalContentType: return self.TYPE_LOOKUP_TABLE[internal] diff --git a/extractor-api-lib/src/extractor_api_lib/impl/types/extractor_types.py b/extractor-api-lib/src/extractor_api_lib/impl/types/extractor_types.py new file mode 100644 index 0000000..8a9a403 --- /dev/null +++ b/extractor-api-lib/src/extractor_api_lib/impl/types/extractor_types.py @@ -0,0 +1,9 @@ +from enum import StrEnum + + +class ExtractorTypes(StrEnum): + """Enum describing the type of information source.""" + + FILE = "file" + CONFLUENCE = "confluence" + NONE = "None" diff --git a/extractor-api-lib/src/extractor_api_lib/models/content_type.py b/extractor-api-lib/src/extractor_api_lib/models/content_type.py index 4e362d3..ff7be41 100644 --- a/extractor-api-lib/src/extractor_api_lib/models/content_type.py +++ b/extractor-api-lib/src/extractor_api_lib/models/content_type.py @@ -13,12 +13,12 @@ from __future__ import annotations - import json import pprint import re # noqa: F401 from enum import Enum + try: from typing import Self except ImportError: diff --git a/extractor-api-lib/src/extractor_api_lib/models/dataclasses/information_piece.py b/extractor-api-lib/src/extractor_api_lib/models/dataclasses/internal_information_piece.py similarity index 92% rename from extractor-api-lib/src/extractor_api_lib/models/dataclasses/information_piece.py rename to extractor-api-lib/src/extractor_api_lib/models/dataclasses/internal_information_piece.py index 7bd609a..f0699e4 100644 --- a/extractor-api-lib/src/extractor_api_lib/models/dataclasses/information_piece.py +++ b/extractor-api-lib/src/extractor_api_lib/models/dataclasses/internal_information_piece.py @@ -6,7 +6,7 @@ @dataclasses.dataclass -class InformationPiece: +class InternalInformationPiece: """Dataclass holding the information found in a document.""" type: ContentType # noqa: A003 # type of the information diff --git a/extractor-api-lib/src/extractor_api_lib/models/extraction_parameters.py b/extractor-api-lib/src/extractor_api_lib/models/extraction_parameters.py new file mode 100644 index 0000000..e903b4e --- /dev/null +++ b/extractor-api-lib/src/extractor_api_lib/models/extraction_parameters.py @@ -0,0 +1,104 @@ +# coding: utf-8 + +""" +extractor-api-lib + +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. 
+""" # noqa: E501 + + +from __future__ import annotations +import pprint +import re # noqa: F401 +import json + + +from pydantic import BaseModel, ConfigDict, Field, StrictStr +from typing import Any, ClassVar, Dict, List, Optional +from extractor_api_lib.models.key_value_pair import KeyValuePair + +try: + from typing import Self +except ImportError: + from typing_extensions import Self + + +class ExtractionParameters(BaseModel): + """ """ # noqa: E501 + + document_name: StrictStr = Field( + description="The name that will be used to store the confluence db in the key value db and the vectordatabase (metadata.document)." + ) + kwargs: Optional[List[KeyValuePair]] = Field(default=None, description="Kwargs for the extractor") + source_type: StrictStr = Field(description="Extractortype") + __properties: ClassVar[List[str]] = ["document_name", "kwargs", "source_type"] + + model_config = { + "populate_by_name": True, + "validate_assignment": True, + "protected_namespaces": (), + } + + def to_str(self) -> str: + """Returns the string representation of the model using alias""" + return pprint.pformat(self.model_dump(by_alias=True)) + + def to_json(self) -> str: + """Returns the JSON representation of the model using alias""" + return self.model_dump_json(by_alias=True, exclude_unset=True) + + @classmethod + def from_json(cls, json_str: str) -> Self: + """Create an instance of ExtractionParameters from a JSON string""" + return cls.from_dict(json.loads(json_str)) + + def to_dict(self) -> Dict[str, Any]: + """Return the dictionary representation of the model using alias. + + This has the following differences from calling pydantic's + `self.model_dump(by_alias=True)`: + + * `None` is only added to the output dict for nullable fields that + were set at model initialization. Other fields with value `None` + are ignored. 
+ """ + _dict = self.model_dump( + by_alias=True, + exclude={}, + exclude_none=True, + ) + # override the default output from pydantic by calling `to_dict()` of each item in kwargs (list) + _items = [] + if self.kwargs: + for _item in self.kwargs: + if _item: + _items.append(_item.to_dict()) + _dict["kwargs"] = _items + return _dict + + @classmethod + def from_dict(cls, obj: Dict) -> Self: + """Create an instance of ExtractionParameters from a dict""" + if obj is None: + return None + + if not isinstance(obj, dict): + return cls.model_validate(obj) + + _obj = cls.model_validate( + { + "document_name": obj.get("document_name"), + "kwargs": ( + [KeyValuePair.from_dict(_item) for _item in obj.get("kwargs")] + if obj.get("kwargs") is not None + else None + ), + "source_type": obj.get("source_type"), + } + ) + return _obj diff --git a/extractor-api-lib/src/extractor_api_lib/models/extraction_request.py b/extractor-api-lib/src/extractor_api_lib/models/extraction_request.py index 3290aa7..3befa42 100644 --- a/extractor-api-lib/src/extractor_api_lib/models/extraction_request.py +++ b/extractor-api-lib/src/extractor_api_lib/models/extraction_request.py @@ -13,13 +13,13 @@ from __future__ import annotations - -import json import pprint import re # noqa: F401 -from typing import Any, ClassVar, Dict, List +import json + from pydantic import BaseModel, ConfigDict, StrictStr +from typing import Any, ClassVar, Dict, List try: from typing import Self @@ -31,7 +31,8 @@ class ExtractionRequest(BaseModel): """ """ # noqa: E501 path_on_s3: StrictStr - __properties: ClassVar[List[str]] = ["path_on_s3"] + document_name: StrictStr + __properties: ClassVar[List[str]] = ["path_on_s3", "document_name"] model_config = { "populate_by_name": True, @@ -78,5 +79,5 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({"path_on_s3": obj.get("path_on_s3")}) + _obj = cls.model_validate({"path_on_s3": obj.get("path_on_s3"), "document_name": obj.get("document_name")}) return _obj diff --git a/extractor-api-lib/src/extractor_api_lib/models/information_piece.py b/extractor-api-lib/src/extractor_api_lib/models/information_piece.py index 440f7a3..3ffb308 100644 --- a/extractor-api-lib/src/extractor_api_lib/models/information_piece.py +++ b/extractor-api-lib/src/extractor_api_lib/models/information_piece.py @@ -13,14 +13,13 @@ from __future__ import annotations - -import json import pprint import re # noqa: F401 -from typing import Any, ClassVar, Dict, List +import json -from pydantic import BaseModel, ConfigDict, StrictStr +from pydantic import BaseModel, ConfigDict, StrictStr +from typing import Any, ClassVar, Dict, List from extractor_api_lib.models.content_type import ContentType from extractor_api_lib.models.key_value_pair import KeyValuePair diff --git a/extractor-api-lib/src/extractor_api_lib/models/key_value_pair.py b/extractor-api-lib/src/extractor_api_lib/models/key_value_pair.py index bdc5bb2..3cba505 100644 --- a/extractor-api-lib/src/extractor_api_lib/models/key_value_pair.py +++ b/extractor-api-lib/src/extractor_api_lib/models/key_value_pair.py @@ -13,13 +13,13 @@ from __future__ import annotations - -import json import pprint import re # noqa: F401 -from typing import Any, ClassVar, Dict, List, Optional +import json + from pydantic import BaseModel, ConfigDict +from typing import Any, ClassVar, Dict, List, Optional try: from typing import Self diff --git a/extractor-api-lib/tests/dummy_test.py b/extractor-api-lib/tests/dummy5_test.py 
diff --git a/extractor-api-lib/tests/dummy_test.py b/extractor-api-lib/tests/dummy5_test.py
similarity index 100%
rename from extractor-api-lib/tests/dummy_test.py
rename to extractor-api-lib/tests/dummy5_test.py
diff --git a/rag-core-api/poetry.lock b/rag-core-api/poetry.lock
index e5ea53f..9812609 100644
--- a/rag-core-api/poetry.lock
+++ b/rag-core-api/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand.
+# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand.

 [[package]]
 name = "aiohappyeyeballs"
@@ -1959,21 +1959,21 @@ tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<10"

 [[package]]
 name = "langchain-core"
-version = "0.3.58"
+version = "0.3.63"
 description = "Building applications with LLMs through composability"
 optional = false
 python-versions = ">=3.9"
 groups = ["main"]
 files = [
-    {file = "langchain_core-0.3.58-py3-none-any.whl", hash = "sha256:266f90d2a079fe9510190ad3be88bd993baad43e6cee0f822a883767a4bfdd5b"},
-    {file = "langchain_core-0.3.58.tar.gz", hash = "sha256:6ee2282b02fa65bf4ee1afa869d431505536757ff2f1f9f0b432d8ca755d66c6"},
+    {file = "langchain_core-0.3.63-py3-none-any.whl", hash = "sha256:f91db8221b1bc6808f70b2e72fded1a94d50ee3f1dff1636fb5a5a514c64b7f5"},
+    {file = "langchain_core-0.3.63.tar.gz", hash = "sha256:e2e30cfbb7684a5a0319f6cbf065fc3c438bfd1060302f085a122527890fb01e"},
 ]

 [package.dependencies]
 jsonpatch = ">=1.33,<2.0"
-langsmith = ">=0.1.125,<0.4"
+langsmith = ">=0.1.126,<0.4"
 packaging = ">=23.2,<25"
-pydantic = {version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""}
+pydantic = ">=2.7.4"
 PyYAML = ">=5.3"
 tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<10.0.0"
 typing-extensions = ">=4.7"
@@ -3843,6 +3843,7 @@ deprecated = "^1.2.18"
 flashrank = "^0.2.10"
 langchain = "^0.3.25"
 langchain-community = "0.3.23"
+langchain-core = "0.3.63"
 langfuse = "^2.60.4"
 oauthlib = "^3.2.2"
 openai = "^1.77.0"
diff --git a/rag-core-api/pyproject.toml b/rag-core-api/pyproject.toml
index 4fd633c..2194a90 100644
--- a/rag-core-api/pyproject.toml
+++ b/rag-core-api/pyproject.toml
@@ -118,8 +118,8 @@ known_local_folder = ["rag_core_api", "rag_core_lib"]
 max-line-length = 120

 [tool.pytest.ini_options]
-log_cli = 1
+log_cli = true
 log_cli_level = "DEBUG"
-pythonpath = "src"
-testpaths = "src/tests"
+pythonpath = ["src", "tests"]
+testpaths = "tests"
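# Why the test imports below drop the "src." prefix: pytest's pythonpath ini option
# prepends the listed directories to sys.path before collection. Roughly equivalent
# to the following (a sketch, assuming the rag-core-api layout shown in this diff):
import sys
sys.path[:0] = ["src", "tests"]    # what pythonpath = ["src", "tests"] effectively does
from rag_core_api.main import app  # now resolves as a top-level package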
diff --git a/rag-core-api/tests/rag_api_test.py b/rag-core-api/tests/rag_api_test.py
index 372709c..2cbdf8e 100644
--- a/rag-core-api/tests/rag_api_test.py
+++ b/rag-core-api/tests/rag_api_test.py
@@ -14,23 +14,23 @@
 from qdrant_client import QdrantClient
 from qdrant_client.http import models

-from .mock_environment_variables import mock_environment_variables
-from .mock_logging_directory import mock_logging_config
+from mock_environment_variables import mock_environment_variables
+from mock_logging_directory import mock_logging_config

 mock_environment_variables()
 mock_logging_config()

-from src.rag_core_api.main import app
-from src.rag_core_api.models.chat_request import ChatRequest
-from src.rag_core_api.models.chat_history import ChatHistory
-from src.rag_core_api.models.chat_history_message import ChatHistoryMessage
-from src.rag_core_api.models.chat_role import ChatRole
-from src.rag_core_api.models.information_piece import InformationPiece
-from src.rag_core_api.models.content_type import ContentType
-from src.rag_core_api.models.key_value_pair import KeyValuePair
-from src.rag_core_api.models.delete_request import DeleteRequest
-from src.rag_core_api.impl.settings.fake_embedder_settings import FakeEmbedderSettings
-from src.rag_core_api.impl.settings.error_messages import ErrorMessages
+from rag_core_api.main import app
+from rag_core_api.models.chat_request import ChatRequest
+from rag_core_api.models.chat_history import ChatHistory
+from rag_core_api.models.chat_history_message import ChatHistoryMessage
+from rag_core_api.models.chat_role import ChatRole
+from rag_core_api.models.information_piece import InformationPiece
+from rag_core_api.models.content_type import ContentType
+from rag_core_api.models.key_value_pair import KeyValuePair
+from rag_core_api.models.delete_request import DeleteRequest
+from rag_core_api.impl.settings.fake_embedder_settings import FakeEmbedderSettings
+from rag_core_api.impl.settings.error_messages import ErrorMessages


 @pytest_asyncio.fixture
diff --git a/rag-core-lib/poetry.lock b/rag-core-lib/poetry.lock
index 90b3fb9..4487b8e 100644
--- a/rag-core-lib/poetry.lock
+++ b/rag-core-lib/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand.
+# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand.

 [[package]]
 name = "aiohappyeyeballs"
@@ -1623,21 +1623,21 @@ tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<10"

 [[package]]
 name = "langchain-core"
-version = "0.3.58"
+version = "0.3.63"
 description = "Building applications with LLMs through composability"
 optional = false
 python-versions = ">=3.9"
 groups = ["main"]
 files = [
-    {file = "langchain_core-0.3.58-py3-none-any.whl", hash = "sha256:266f90d2a079fe9510190ad3be88bd993baad43e6cee0f822a883767a4bfdd5b"},
-    {file = "langchain_core-0.3.58.tar.gz", hash = "sha256:6ee2282b02fa65bf4ee1afa869d431505536757ff2f1f9f0b432d8ca755d66c6"},
+    {file = "langchain_core-0.3.63-py3-none-any.whl", hash = "sha256:f91db8221b1bc6808f70b2e72fded1a94d50ee3f1dff1636fb5a5a514c64b7f5"},
+    {file = "langchain_core-0.3.63.tar.gz", hash = "sha256:e2e30cfbb7684a5a0319f6cbf065fc3c438bfd1060302f085a122527890fb01e"},
 ]

 [package.dependencies]
 jsonpatch = ">=1.33,<2.0"
-langsmith = ">=0.1.125,<0.4"
+langsmith = ">=0.1.126,<0.4"
 packaging = ">=23.2,<25"
-pydantic = {version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""}
+pydantic = ">=2.7.4"
 PyYAML = ">=5.3"
 tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<10.0.0"
 typing-extensions = ">=4.7"
@@ -3384,4 +3384,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.1"
 python-versions = "^3.13"
-content-hash = "2aa5df2f5304dfb56d7adfeeb4f8817ecf9d7eaaadc5af9127875a5aa442c7d0"
+content-hash = "265d9eb8b910f4831f5e5e7e78a0e9b3b010793fed03d30a96393a2f8c1792db"
diff --git a/rag-core-lib/pyproject.toml b/rag-core-lib/pyproject.toml
index c63b316..2ca85e3 100644
--- a/rag-core-lib/pyproject.toml
+++ b/rag-core-lib/pyproject.toml
@@ -21,6 +21,7 @@ requests-oauthlib = "^2.0.0"
 langfuse = "^2.60.4"
 deprecated = "^1.2.18"
 openai = "^1.77.0"
+langchain-core = "0.3.63"


 [tool.poetry.group.dev.dependencies]
diff --git a/rag-core-lib/tests/dummy_test.py b/rag-core-lib/tests/dummy6_test.py
similarity index 100%
rename from rag-core-lib/tests/dummy_test.py
rename to rag-core-lib/tests/dummy6_test.py