From 41d3b18430c155588c88ad12edc16afecd904489 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Wed, 11 Sep 2024 18:35:55 -0400 Subject: [PATCH 1/6] spelling: asynchronous Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- USAGE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/USAGE.md b/USAGE.md index 63ba6954..3d063782 100644 --- a/USAGE.md +++ b/USAGE.md @@ -31,7 +31,7 @@ if res.elements is not None:
-The same SDK client can also be used to make asychronous requests by importing asyncio. +The same SDK client can also be used to make asynchronous requests by importing asyncio. ```python # Asynchronous Example import asyncio From 1175dd54f508c86f6c7ae906742a991269d5a921 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Wed, 11 Sep 2024 18:36:15 -0400 Subject: [PATCH 2/6] spelling: concurrency Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- src/unstructured_client/_hooks/custom/form_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/unstructured_client/_hooks/custom/form_utils.py b/src/unstructured_client/_hooks/custom/form_utils.py index 54fb06b3..b60d2c0e 100644 --- a/src/unstructured_client/_hooks/custom/form_utils.py +++ b/src/unstructured_client/_hooks/custom/form_utils.py @@ -125,7 +125,7 @@ def get_split_pdf_allow_failed_param( def get_split_pdf_concurrency_level_param( form_data: FormData, key: str, fallback_value: int, max_allowed: int ) -> int: - """Retrieves the value for concurreny level that should be used for splitting pdf. + """Retrieves the value for concurrency level that should be used for splitting pdf. In case given the number is not a valid integer or less than 1, it will use the default value. 
From 533b6f93a1c5c9a4c2c28186ef370bf6b0324a44 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Wed, 11 Sep 2024 18:36:26 -0400 Subject: [PATCH 3/6] spelling: encounter Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4ce3a008..daf608ad 100755 --- a/README.md +++ b/README.md @@ -91,7 +91,7 @@ See [page splitting](https://docs.unstructured.io/api-reference/api-services/sdk In order to speed up processing of large PDF files, the client splits up PDFs into smaller files, sends these to the API concurrently, and recombines the results. `split_pdf_page` can be set to `False` to disable this. The amount of workers utilized for splitting PDFs is dictated by the `split_pdf_concurrency_level` parameter, with a default of 5 and a maximum of 15 to keep resource usage and costs in check. The splitting process leverages `asyncio` to manage concurrency effectively. -The size of each batch of pages (ranging from 2 to 20) is internally determined based on the concurrency level and the total number of pages in the document. Because the splitting process uses `asyncio` the client can encouter event loop issues if it is nested in another async runner, like running in a `gevent` spawned task. Instead, this is safe to run in multiprocessing workers (e.g., using `multiprocessing.Pool` with `fork` context). +The size of each batch of pages (ranging from 2 to 20) is internally determined based on the concurrency level and the total number of pages in the document. Because the splitting process uses `asyncio` the client can encounter event loop issues if it is nested in another async runner, like running in a `gevent` spawned task. Instead, this is safe to run in multiprocessing workers (e.g., using `multiprocessing.Pool` with `fork` context). 
Example: ```python From 857867fe72984866d2b6af44cac0ff0ae00e4ebd Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Wed, 11 Sep 2024 20:14:00 -0400 Subject: [PATCH 4/6] spelling: github Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index daf608ad..b661f698 100755 --- a/README.md +++ b/README.md @@ -369,9 +369,9 @@ There are two important files used by `make client-generate`: 1. `openapi.json` which is actually not stored here, [but fetched from unstructured-api](https://api.unstructured.io/general/openapi.json), represents the API that is supported on backend. 2. `overlay_client.yaml` is a handcrafted diff that when applied over above, produces `openapi_client.json` which is used to generate SDK. -Once PR with changes is merged, Github CI will autogenerate the Speakeasy client in a new PR, using +Once PR with changes is merged, GitHub CI will autogenerate the Speakeasy client in a new PR, using the `openapi.json` and `overlay_client.yaml` You will have to manually bring back the human created lines in it. -Feel free to open a PR or a Github issue as a proof of concept and we'll do our best to include it in a future release! +Feel free to open a PR or a GitHub issue as a proof of concept and we'll do our best to include it in a future release! 
### SDK Created by [Speakeasy](https://www.speakeasyapi.dev/docs/sdk-design/python/methodology-python) From d05c9c9e0c7798aabf08c09337853a687a6bf112 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Wed, 11 Sep 2024 20:14:35 -0400 Subject: [PATCH 5/6] spelling: params Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- src/unstructured_client/utils/queryparams.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/unstructured_client/utils/queryparams.py b/src/unstructured_client/utils/queryparams.py index 130b31e2..87f0df46 100644 --- a/src/unstructured_client/utils/queryparams.py +++ b/src/unstructured_client/utils/queryparams.py @@ -66,10 +66,10 @@ def _populate_query_params( f_name = field.alias if field.alias is not None else name serialization = metadata.serialization if serialization is not None: - serialized_parms = _get_serialized_params( + serialized_params = _get_serialized_params( metadata, f_name, value, param_field_types[name] ) - for key, value in serialized_parms.items(): + for key, value in serialized_params.items(): if key in query_param_values: query_param_values[key].extend(value) else: From 931394b4258a44b2e3c64ed7b10cd92a07427a1b Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Wed, 11 Sep 2024 18:37:20 -0400 Subject: [PATCH 6/6] spelling: resulting Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- overlay_client.yaml | 2 +- src/unstructured_client/models/shared/partition_parameters.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/overlay_client.yaml b/overlay_client.yaml index f36cdc73..c5e0ad8f 100644 --- a/overlay_client.yaml +++ b/overlay_client.yaml @@ -38,7 +38,7 @@ actions: "split_pdf_allow_failed": { "title": "Split Pdf Allow Failed", - "description": "When `split_pdf_page` is set to `True`, this parameter defines the behavior when some of the parallel requests fail. 
By default `split_pdf_allow_failed` is set to `False` and any failed request send to the API will make the whole process break and raise an Exception. If `split_pdf_allow_failed` is set to `True`, the errors encountered while sending parallel requests will not break the processing - the resuling list of Elements will miss the data from errored pages.", + "description": "When `split_pdf_page` is set to `True`, this parameter defines the behavior when some of the parallel requests fail. By default `split_pdf_allow_failed` is set to `False` and any failed request sent to the API will make the whole process break and raise an Exception. If `split_pdf_allow_failed` is set to `True`, the errors encountered while sending parallel requests will not break the processing - the resulting list of Elements will miss the data from errored pages.", "type": "boolean", "default": false, } diff --git a/src/unstructured_client/models/shared/partition_parameters.py b/src/unstructured_client/models/shared/partition_parameters.py index 898fad70..205d8d71 100644 --- a/src/unstructured_client/models/shared/partition_parameters.py +++ b/src/unstructured_client/models/shared/partition_parameters.py @@ -89,7 +89,7 @@ class PartitionParametersTypedDict(TypedDict): skip_infer_table_types: NotRequired[List[str]] r"""The document types that you want to skip table extraction with. Default: []""" split_pdf_allow_failed: NotRequired[bool] - r"""When `split_pdf_page` is set to `True`, this parameter defines the behavior when some of the parallel requests fail. By default `split_pdf_allow_failed` is set to `False` and any failed request send to the API will make the whole process break and raise an Exception. 
If `split_pdf_allow_failed` is set to `True`, the errors encountered while sending parallel requests will not break the processing - the resuling list of Elements will miss the data from errored pages.""" + r"""When `split_pdf_page` is set to `True`, this parameter defines the behavior when some of the parallel requests fail. By default `split_pdf_allow_failed` is set to `False` and any failed request sent to the API will make the whole process break and raise an Exception. If `split_pdf_allow_failed` is set to `True`, the errors encountered while sending parallel requests will not break the processing - the resulting list of Elements will miss the data from errored pages.""" split_pdf_concurrency_level: NotRequired[int] r"""When `split_pdf_page` is set to `True`, this parameter specifies the number of workers used for sending requests when the PDF is split on the client side. It's an internal parameter for the Python client and is not sent to the backend.""" split_pdf_page: NotRequired[bool] @@ -152,7 +152,7 @@ class PartitionParameters(BaseModel): skip_infer_table_types: Annotated[Optional[List[str]], FieldMetadata(multipart=True)] = None r"""The document types that you want to skip table extraction with. Default: []""" split_pdf_allow_failed: Annotated[Optional[bool], FieldMetadata(multipart=True)] = False - r"""When `split_pdf_page` is set to `True`, this parameter defines the behavior when some of the parallel requests fail. By default `split_pdf_allow_failed` is set to `False` and any failed request send to the API will make the whole process break and raise an Exception. If `split_pdf_allow_failed` is set to `True`, the errors encountered while sending parallel requests will not break the processing - the resuling list of Elements will miss the data from errored pages.""" + r"""When `split_pdf_page` is set to `True`, this parameter defines the behavior when some of the parallel requests fail. 
By default `split_pdf_allow_failed` is set to `False` and any failed request sent to the API will make the whole process break and raise an Exception. If `split_pdf_allow_failed` is set to `True`, the errors encountered while sending parallel requests will not break the processing - the resulting list of Elements will miss the data from errored pages.""" split_pdf_concurrency_level: Annotated[Optional[int], FieldMetadata(multipart=True)] = 5 r"""When `split_pdf_page` is set to `True`, this parameter specifies the number of workers used for sending requests when the PDF is split on the client side. It's an internal parameter for the Python client and is not sent to the backend.""" split_pdf_page: Annotated[Optional[bool], FieldMetadata(multipart=True)] = True