diff --git a/README.md b/README.md index 4ce3a008..b661f698 100755 --- a/README.md +++ b/README.md @@ -91,7 +91,7 @@ See [page splitting](https://docs.unstructured.io/api-reference/api-services/sdk In order to speed up processing of large PDF files, the client splits up PDFs into smaller files, sends these to the API concurrently, and recombines the results. `split_pdf_page` can be set to `False` to disable this. The amount of workers utilized for splitting PDFs is dictated by the `split_pdf_concurrency_level` parameter, with a default of 5 and a maximum of 15 to keep resource usage and costs in check. The splitting process leverages `asyncio` to manage concurrency effectively. -The size of each batch of pages (ranging from 2 to 20) is internally determined based on the concurrency level and the total number of pages in the document. Because the splitting process uses `asyncio` the client can encouter event loop issues if it is nested in another async runner, like running in a `gevent` spawned task. Instead, this is safe to run in multiprocessing workers (e.g., using `multiprocessing.Pool` with `fork` context). +The size of each batch of pages (ranging from 2 to 20) is internally determined based on the concurrency level and the total number of pages in the document. Because the splitting process uses `asyncio` the client can encounter event loop issues if it is nested in another async runner, like running in a `gevent` spawned task. Instead, this is safe to run in multiprocessing workers (e.g., using `multiprocessing.Pool` with `fork` context). Example: ```python @@ -369,9 +369,9 @@ There are two important files used by `make client-generate`: 1. `openapi.json` which is actually not stored here, [but fetched from unstructured-api](https://api.unstructured.io/general/openapi.json), represents the API that is supported on backend. 2. 
`overlay_client.yaml` is a handcrafted diff that when applied over above, produces `openapi_client.json` which is used to generate SDK. -Once PR with changes is merged, Github CI will autogenerate the Speakeasy client in a new PR, using +Once PR with changes is merged, GitHub CI will autogenerate the Speakeasy client in a new PR, using the `openapi.json` and `overlay_client.yaml` You will have to manually bring back the human created lines in it. -Feel free to open a PR or a Github issue as a proof of concept and we'll do our best to include it in a future release! +Feel free to open a PR or a GitHub issue as a proof of concept and we'll do our best to include it in a future release! ### SDK Created by [Speakeasy](https://www.speakeasyapi.dev/docs/sdk-design/python/methodology-python) diff --git a/USAGE.md b/USAGE.md index 63ba6954..3d063782 100644 --- a/USAGE.md +++ b/USAGE.md @@ -31,7 +31,7 @@ if res.elements is not None:
-The same SDK client can also be used to make asychronous requests by importing asyncio. +The same SDK client can also be used to make asynchronous requests by importing asyncio. ```python # Asynchronous Example import asyncio diff --git a/overlay_client.yaml b/overlay_client.yaml index f36cdc73..c5e0ad8f 100644 --- a/overlay_client.yaml +++ b/overlay_client.yaml @@ -38,7 +38,7 @@ actions: "split_pdf_allow_failed": { "title": "Split Pdf Allow Failed", - "description": "When `split_pdf_page` is set to `True`, this parameter defines the behavior when some of the parallel requests fail. By default `split_pdf_allow_failed` is set to `False` and any failed request send to the API will make the whole process break and raise an Exception. If `split_pdf_allow_failed` is set to `True`, the errors encountered while sending parallel requests will not break the processing - the resuling list of Elements will miss the data from errored pages.", + "description": "When `split_pdf_page` is set to `True`, this parameter defines the behavior when some of the parallel requests fail. By default `split_pdf_allow_failed` is set to `False` and any failed request sent to the API will make the whole process break and raise an Exception. 
If `split_pdf_allow_failed` is set to `True`, the errors encountered while sending parallel requests will not break the processing - the resulting list of Elements will miss the data from errored pages.", "type": "boolean", "default": false, } diff --git a/src/unstructured_client/_hooks/custom/form_utils.py b/src/unstructured_client/_hooks/custom/form_utils.py index 54fb06b3..b60d2c0e 100644 --- a/src/unstructured_client/_hooks/custom/form_utils.py +++ b/src/unstructured_client/_hooks/custom/form_utils.py @@ -125,7 +125,7 @@ def get_split_pdf_allow_failed_param( def get_split_pdf_concurrency_level_param( form_data: FormData, key: str, fallback_value: int, max_allowed: int ) -> int: - """Retrieves the value for concurreny level that should be used for splitting pdf. + """Retrieves the value for concurrency level that should be used for splitting pdf. In case given the number is not a valid integer or less than 1, it will use the default value. diff --git a/src/unstructured_client/models/shared/partition_parameters.py b/src/unstructured_client/models/shared/partition_parameters.py index 898fad70..205d8d71 100644 --- a/src/unstructured_client/models/shared/partition_parameters.py +++ b/src/unstructured_client/models/shared/partition_parameters.py @@ -89,7 +89,7 @@ class PartitionParametersTypedDict(TypedDict): skip_infer_table_types: NotRequired[List[str]] r"""The document types that you want to skip table extraction with. Default: []""" split_pdf_allow_failed: NotRequired[bool] - r"""When `split_pdf_page` is set to `True`, this parameter defines the behavior when some of the parallel requests fail. By default `split_pdf_allow_failed` is set to `False` and any failed request send to the API will make the whole process break and raise an Exception. 
If `split_pdf_allow_failed` is set to `True`, the errors encountered while sending parallel requests will not break the processing - the resuling list of Elements will miss the data from errored pages.""" + r"""When `split_pdf_page` is set to `True`, this parameter defines the behavior when some of the parallel requests fail. By default `split_pdf_allow_failed` is set to `False` and any failed request sent to the API will make the whole process break and raise an Exception. If `split_pdf_allow_failed` is set to `True`, the errors encountered while sending parallel requests will not break the processing - the resulting list of Elements will miss the data from errored pages.""" split_pdf_concurrency_level: NotRequired[int] r"""When `split_pdf_page` is set to `True`, this parameter specifies the number of workers used for sending requests when the PDF is split on the client side. It's an internal parameter for the Python client and is not sent to the backend.""" split_pdf_page: NotRequired[bool] @@ -152,7 +152,7 @@ class PartitionParameters(BaseModel): skip_infer_table_types: Annotated[Optional[List[str]], FieldMetadata(multipart=True)] = None r"""The document types that you want to skip table extraction with. Default: []""" split_pdf_allow_failed: Annotated[Optional[bool], FieldMetadata(multipart=True)] = False - r"""When `split_pdf_page` is set to `True`, this parameter defines the behavior when some of the parallel requests fail. By default `split_pdf_allow_failed` is set to `False` and any failed request send to the API will make the whole process break and raise an Exception. If `split_pdf_allow_failed` is set to `True`, the errors encountered while sending parallel requests will not break the processing - the resuling list of Elements will miss the data from errored pages.""" + r"""When `split_pdf_page` is set to `True`, this parameter defines the behavior when some of the parallel requests fail. 
By default `split_pdf_allow_failed` is set to `False` and any failed request sent to the API will make the whole process break and raise an Exception. If `split_pdf_allow_failed` is set to `True`, the errors encountered while sending parallel requests will not break the processing - the resulting list of Elements will miss the data from errored pages.""" split_pdf_concurrency_level: Annotated[Optional[int], FieldMetadata(multipart=True)] = 5 r"""When `split_pdf_page` is set to `True`, this parameter specifies the number of workers used for sending requests when the PDF is split on the client side. It's an internal parameter for the Python client and is not sent to the backend.""" split_pdf_page: Annotated[Optional[bool], FieldMetadata(multipart=True)] = True diff --git a/src/unstructured_client/utils/queryparams.py b/src/unstructured_client/utils/queryparams.py index 130b31e2..87f0df46 100644 --- a/src/unstructured_client/utils/queryparams.py +++ b/src/unstructured_client/utils/queryparams.py @@ -66,10 +66,10 @@ def _populate_query_params( f_name = field.alias if field.alias is not None else name serialization = metadata.serialization if serialization is not None: - serialized_parms = _get_serialized_params( + serialized_params = _get_serialized_params( metadata, f_name, value, param_field_types[name] ) - for key, value in serialized_parms.items(): + for key, value in serialized_params.items(): if key in query_param_values: query_param_values[key].extend(value) else: