@@ -6,9 +6,12 @@ import * as Core from '../core';
66
77export class Parse extends APIResource {
88 /**
9- * Parse a file into a structured Markdown representation. The file size must be
10- * less than 100MB and the number of pages must be less than 400.
9+ * Parse a file into structured Markdown and/or JSON. Files must be less than
10+ * 100MB and 400 pages. We use LibreOffice to convert DOC(X) and PPT(X) files to
11+ * PDF, which may affect page count.
1112 *
13+ * See our [blog post](https://contextual.ai/blog/document-parser-for-rag) and
14+ * [code examples](https://github.com/ContextualAI/examples/blob/main/03-standalone-api/04-parse/parse.ipynb).
1215 * Email [parse-feedback@contextual.ai](mailto:parse-feedback@contextual.ai) with
1316 * any feedback or questions.
1417 */
@@ -92,6 +95,11 @@ export interface ParseJobResultsResponse {
9295 */
9396 status : 'pending' | 'processing' | 'retrying' | 'completed' | 'failed' | 'cancelled' ;
9497
98+ /**
99+ * Document-level metadata parsed from the document
100+ */
101+ document_metadata ?: ParseJobResultsResponse . DocumentMetadata ;
102+
95103 /**
96104 * The parsed, structured Markdown of the input file. Only present if
97105 * `markdown-document` was among the requested output types.
@@ -103,137 +111,143 @@ export interface ParseJobResultsResponse {
103111 * requested) and/or per-page `ParsedBlock`s (if `blocks-per-page` was requested).
104112 */
105113 pages ?: Array < ParseJobResultsResponse . Page > ;
106-
107- /**
108- * The table of contents representing the document's heading hierarchy. Only
109- * present if `enable_document_hierarchy` was set to true in the parse request.
110- */
111- table_of_contents ?: ParseJobResultsResponse . TableOfContents ;
112114}
113115
114116export namespace ParseJobResultsResponse {
115117 /**
116- * Per-page parse results.
118+ * Document-level metadata parsed from the document
117119 */
118- export interface Page {
120+ export interface DocumentMetadata {
119121 /**
120- * The index of the parsed page (zero-indexed)
122+ * Hierarchy of the document, as both heading blocks and a markdown table of
123+ * contents
121124 */
122- index : number ;
123-
124- /**
125- * The parsed, structured blocks of this page. Present if `blocks-per-page` was
126- * among the requested output types.
127- */
128- blocks ?: Array < Page . Block > ;
129-
130- /**
131- * The parsed, structured Markdown of this page. Present if `markdown-per-page` was
132- * among the requested output types.
133- */
134- markdown ?: string ;
125+ hierarchy ?: DocumentMetadata . Hierarchy ;
135126 }
136127
137- export namespace Page {
128+ export namespace DocumentMetadata {
138129 /**
139- * One logical block of content from a parsed page.
130+ * Hierarchy of the document, as both heading blocks and a markdown table of
131+ * contents
140132 */
141- export interface Block {
133+ export interface Hierarchy {
142134 /**
143- * Unique ID of the block
135+ * Heading blocks which define the hierarchy of the document
144136 */
145- id : string ;
137+ blocks ?: Array < Hierarchy . Block > ;
146138
147139 /**
148- * The normalized bounding box of the block, as relative percentages of the page
149- * width and height
140+ * Markdown representation of the table of contents for this document
150141 */
151- bounding_box : Block . BoundingBox ;
152-
153- /**
154- * The Markdown representation of the block
155- */
156- markdown : string ;
142+ table_of_contents ?: string ;
143+ }
157144
145+ export namespace Hierarchy {
158146 /**
159- * The type of the block
147+ * One logical block of content from a parsed page.
160148 */
161- type : 'heading' | 'text' | 'table' | 'figure' ;
149+ export interface Block {
150+ /**
151+ * Unique ID of the block
152+ */
153+ id : string ;
162154
163- /**
164- * The confidence level of this block categorized as 'low', 'medium', or 'high'.
165- * Only available for blocks of type 'table' currently.
166- */
167- confidence_level ?: 'low' | 'medium' | 'high' ;
155+ /**
156+ * The normalized bounding box of the block, as relative percentages of the page
157+ * width and height
158+ */
159+ bounding_box : Block . BoundingBox ;
168160
169- /**
170- * The level of the block in the document hierarchy, starting at 0 for the
171- * root-level title block. Only present if `enable_document_hierarchy` was set to
172- * true in the request.
173- */
174- hierarchy_level ?: number ;
161+ /**
162+ * The Markdown representation of the block
163+ */
164+ markdown : string ;
175165
176- /**
177- * The page (0-indexed) that this block belongs to. Only set for heading blocks
178- * that are returned in the table of contents.
179- */
180- page_index ?: number ;
166+ /**
167+ * The type of the block
168+ */
169+ type : 'heading' | 'text' | 'table' | 'figure' ;
181170
182- /**
183- * The IDs of the parent in the document hierarchy, sorted from root-level to
184- * bottom. For root-level heading blocks, this will be an empty list. Only present
185- * if `enable_document_hierarchy` was set to true in the request.
186- */
187- parent_ids ?: Array < string > ;
188- }
171+ /**
172+ * The confidence level of this block categorized as 'low', 'medium', or 'high'.
173+ * Only available for blocks of type 'table' currently.
174+ */
175+ confidence_level ?: 'low' | 'medium' | 'high' ;
189176
190- export namespace Block {
191- /**
192- * The normalized bounding box of the block, as relative percentages of the page
193- * width and height
194- */
195- export interface BoundingBox {
196177 /**
197- * The x-coordinate of the top-left corner of the bounding box
178+ * The level of the block in the document hierarchy, starting at 0 for the
179+ * root-level title block. Only present if `enable_document_hierarchy` was set to
180+ * true in the request.
198181 */
199- x0 : number ;
182+ hierarchy_level ? : number ;
200183
201184 /**
202- * The x-coordinate of the bottom-right corner of the bounding box
185+ * The page (0-indexed) that this block belongs to. Only set for heading blocks
186+ * that are returned in the table of contents.
203187 */
204- x1 : number ;
188+ page_index ? : number ;
205189
206190 /**
207- * The y-coordinate of the top-left corner of the bounding box
191+ * The IDs of the parent in the document hierarchy, sorted from root-level to
192+ * bottom. For root-level heading blocks, this will be an empty list. Only present
193+ * if `enable_document_hierarchy` was set to true in the request.
208194 */
209- y0 : number ;
195+ parent_ids ?: Array < string > ;
196+ }
210197
198+ export namespace Block {
211199 /**
212- * The y-coordinate of the bottom-right corner of the bounding box
200+ * The normalized bounding box of the block, as relative percentages of the page
201+ * width and height
213202 */
214- y1 : number ;
203+ export interface BoundingBox {
204+ /**
205+ * The x-coordinate of the top-left corner of the bounding box
206+ */
207+ x0 : number ;
208+
209+ /**
210+ * The x-coordinate of the bottom-right corner of the bounding box
211+ */
212+ x1 : number ;
213+
214+ /**
215+ * The y-coordinate of the top-left corner of the bounding box
216+ */
217+ y0 : number ;
218+
219+ /**
220+ * The y-coordinate of the bottom-right corner of the bounding box
221+ */
222+ y1 : number ;
223+ }
215224 }
216225 }
217226 }
218227
219228 /**
220- * The table of contents representing the document's heading hierarchy. Only
221- * present if `enable_document_hierarchy` was set to true in the parse request.
229+ * Per-page parse results.
222230 */
223- export interface TableOfContents {
231+ export interface Page {
232+ /**
233+ * The index of the parsed page (zero-indexed)
234+ */
235+ index : number ;
236+
224237 /**
225- * Heading blocks that define the hierarchy of the document
238+ * The parsed, structured blocks of this page. Present if `blocks-per-page` was
239+ * among the requested output types.
226240 */
227- blocks ?: Array < TableOfContents . Block > ;
241+ blocks ?: Array < Page . Block > ;
228242
229243 /**
230- * Markdown representation of the table of contents that can be pre-pended to the
231- * markdown document .
244+ * The parsed, structured Markdown of this page. Present if `markdown-per-page` was
245+ * among the requested output types.
232246 */
233247 markdown ?: string ;
234248 }
235249
236- export namespace TableOfContents {
250+ export namespace Page {
237251 /**
238252 * One logical block of content from a parsed page.
239253 */
@@ -372,43 +386,43 @@ export interface ParseCreateParams {
372386 raw_file : Core . Uploadable ;
373387
374388 /**
375- * Controls parsing heading levels (e.g. H1, H2, H3) at higher quality. Adds a
376- * table of contents to the output with the structure of the entire parsed
377- * document . Not permitted in ' basic' parsing_mode, or if page_range is not
378- * continuous and/or does not start from page zero.
389+ * Adds a table of contents to the output with the structure of the entire parsed
390+ * document. This feature is in beta. Controls parsing heading levels (e.g. H1, H2,
391+ * H3) at higher quality . Not permitted in ` basic` parsing_mode, or if page_range
392+ * is not continuous and/or does not start from page zero.
379393 */
380394 enable_document_hierarchy ?: boolean ;
381395
382396 /**
383397 * Controls whether tables are split into multiple tables by row with the headers
384398 * propagated. Use for improving LLM comprehension of very large tables. Not
385- * permitted in ' basic' parsing_mode.
399+ * permitted in `basic` parse_mode.
386400 */
387401 enable_split_tables ?: boolean ;
388402
389403 /**
390- * Controls how thorough figure captions are. ' concise' is short and minimizes
391- * chances of hallucinations. ' detailed' is more thorough and can include
392- * commentary. Not permitted in ' basic' parsing_mode.
404+ * Controls how thorough figure captions are. `concise` is short and minimizes
405+ * chances of hallucinations. `detailed` is more thorough and can include
406+ * commentary; this mode is in beta. Not permitted in `basic` parse_mode.
393407 */
394408 figure_caption_mode ?: 'concise' | 'detailed' ;
395409
396410 /**
397411 * Threshold number of table cells beyond which large tables are split if
398- * `enable_split_tables` is True. Not permitted in ' basic' parsing_mode.
412+ * `enable_split_tables` is true. Not permitted in `basic` parse_mode.
399413 */
400414 max_split_table_cells ?: number ;
401415
402416 /**
403417 * Optional string representing page range to be parsed. Format: comma-separated
404- * indexes (0-based) e.g. ' 0,1,2,5,6' or ranges ( inclusive of both ends) e.g.
405- * ' 0-2,5,6'
418+ * indexes (0-based, e.g. `0,1,2,5,6`), or ranges inclusive of both ends (e.g.
419+ * `0-2,5,6`)
406420 */
407421 page_range ?: string ;
408422
409423 /**
410- * The settings to use for parsing. ' basic' is for simple, text-only documents.
411- * ' standard' is for complex documents with images, complex hierarchy, and/or no
424+ * The settings to use for parsing. `basic` is for simple, text-only documents.
425+ * `standard` is for complex documents with images, complex hierarchy, and/or no
412426 * natively encoded textual data (e.g. for scanned documents).
413427 */
414428 parse_mode ?: 'basic' | 'standard' ;
@@ -417,11 +431,11 @@ export interface ParseCreateParams {
417431export interface ParseJobResultsParams {
418432 /**
419433 * The desired output format(s) of the parsed file. Must be `markdown-document`,
420- * `markdown-per-page`, and/or `blocks-per-page`. `markdown-document` parses the
421- * whole document into a single concatenated markdown output. `markdown-per-page`
422- * provides markdown output per page. `blocks-per-page` provides a structured JSON
434+ * `markdown-per-page`, and/or `blocks-per-page`. Specify multiple values to get
435+ * multiple formats in the response. `markdown-document` parses the whole document
436+ * into a single concatenated markdown output. `markdown-per-page` provides
437+ * markdown output per page. `blocks-per-page` provides a structured JSON
423438 * representation of the content blocks on each page, sorted by reading order.
424- * Specify multiple values to get multiple formats in the response.
425439 */
426440 output_types ?: Array < 'markdown-document' | 'markdown-per-page' | 'blocks-per-page' > ;
427441}
0 commit comments