diff --git a/lib/pdf/mineru-cloud.ts b/lib/pdf/mineru-cloud.ts new file mode 100644 index 00000000..3d1a34e0 --- /dev/null +++ b/lib/pdf/mineru-cloud.ts @@ -0,0 +1,339 @@ +/** + * MinerU Precision Parsing API (v4) + * https://mineru.net/api/v4 + * + * Flow: POST /file-urls/batch → PUT presigned URL → poll /extract-results/batch/{id} → download ZIP + */ + +import JSZip from 'jszip'; +import type { PDFParserConfig } from './types'; +import type { ParsedPdfContent } from '@/lib/types/pdf'; +import { createLogger } from '@/lib/logger'; +import { extractMinerUResult } from './mineru-parser'; + +const log = createLogger('MinerUCloudV4'); + +export const MINERU_CLOUD_V4_API_BASE = 'https://mineru.net/api/v4'; + +const TIMEOUTS = { + batch: 60_000, + upload: 180_000, + poll: 30_000, + zip: 180_000, +} as const; + +const POLL_INTERVAL_MS = 2500; +const POLL_MAX_MS = 15 * 60 * 1000; + +const MIME_MAP: Record = { + png: 'image/png', + jpg: 'image/jpeg', + jpeg: 'image/jpeg', + webp: 'image/webp', + gif: 'image/gif', +}; + +const sleep = (ms: number) => new Promise((r) => setTimeout(r, ms)); + +function extToMime(ext: string): string { + return MIME_MAP[ext.toLowerCase()] ?? 'application/octet-stream'; +} + +function urlPathForLog(url: string): string { + try { + const { hostname, pathname } = new URL(url); + return `${hostname}${pathname}`; + } catch { + return url; + } +} + +function isRetryable(err: unknown): boolean { + if (!(err instanceof Error)) return false; + const msg = err.message.toLowerCase(); + return ['fetch failed', 'econnreset', 'etimedout', 'timeout', 'aborted', 'enotfound'].some((s) => + msg.includes(s), + ); +} + +async function fetchWithRetry(fn: () => Promise, context: string, attempts = 4): Promise { + let lastErr: unknown; + + for (let i = 1; i <= attempts; i++) { + try { + return await fn(); + } catch (err) { + lastErr = err; + if (!isRetryable(err) || i === attempts) break; + log.warn(`[MinerU v4] ${context} — retry ${i}/${attempts}:`, err); + await sleep(400 * i); + } + } + + const msg = lastErr instanceof Error ? lastErr.message : String(lastErr); + throw new Error(`MinerU v4 ${context} failed: ${msg}`); +} + +// ── API response helpers ────────────────────────────────────────────────────── + +interface MinerUEnvelope { + code: number; + msg: string; + trace_id?: string; + data: T; +} + +function unwrapMinerUResponse(json: MinerUEnvelope, context: string): T { + if (json.code !== 0) { + throw new Error(`MinerU ${context}: ${json.msg || 'unknown error'} (code ${json.code})`); + } + return json.data; +} + +async function readMinerUJson(res: Response, context: string): Promise { + const text = await res.text(); + + let json: MinerUEnvelope; + try { + json = JSON.parse(text) as MinerUEnvelope; + } catch { + throw new Error(`MinerU ${context}: invalid JSON (HTTP ${res.status}): ${text.slice(0, 500)}`); + } + + if (!res.ok) { + throw new Error(`MinerU ${context}: HTTP ${res.status} — ${json.msg || text.slice(0, 300)}`); + } + + return unwrapMinerUResponse(json, context); +} + +// ── Filename sanitization ───────────────────────────────────────────────────── + +export function sanitizePdfFileNameForMinerU(name: string | undefined): string { + const fallback = 'document.pdf'; + const raw = (name ?? fallback).split(/[/\\]/).pop()?.trim() ?? fallback; + const trimmed = raw.slice(0, 240); + + if (!trimmed.toLowerCase().endsWith('.pdf')) return fallback; + if (trimmed.includes('..') || trimmed.includes('/') || trimmed.includes('\\')) return fallback; + + return trimmed || fallback; +} + +export function isMinerUCloudV4BaseUrl(baseUrl: string): boolean { + try { + const { hostname, pathname } = new URL(baseUrl.trim()); + return hostname.toLowerCase() === 'mineru.net' && pathname.replace(/\/+$/, '') === '/api/v4'; + } catch { + return false; + } +} + +// ── ZIP parsing ─────────────────────────────────────────────────────────────── + +type BatchExtractRow = { + file_name?: string; + state?: string; + full_zip_url?: string; + err_msg?: string; +}; + +async function parseMinerUZip(zipUrl: string): Promise { + log.info(`[MinerU v4] Downloading result ZIP: ${urlPathForLog(zipUrl)}`); + + const zipRes = await fetchWithRetry( + () => fetch(zipUrl, { signal: AbortSignal.timeout(TIMEOUTS.zip) }), + 'ZIP download', + ); + + if (!zipRes.ok) { + const text = await zipRes.text().catch(() => zipRes.statusText); + throw new Error(`MinerU ZIP download failed (${zipRes.status}): ${text.slice(0, 300)}`); + } + + const zipBuf = Buffer.from(await zipRes.arrayBuffer()); + let zip: Awaited>; + try { + zip = await JSZip.loadAsync(zipBuf); + } catch (e) { + throw new Error(`MinerU ZIP parse failed: ${e instanceof Error ? e.message : String(e)}`); + } + + const filePaths = Object.keys(zip.files).filter((p) => !zip.files[p].dir); + const fullMdPath = filePaths.find((p) => /(^|\/)full\.md$/i.test(p)); + const contentListPath = filePaths.find( + (p) => p.endsWith('_content_list.json') || /(^|\/)content_list\.json$/i.test(p), + ); + + if (!fullMdPath) throw new Error('MinerU ZIP: full.md not found'); + + const mdContent = await zip.file(fullMdPath)!.async('string'); + const dirPrefix = fullMdPath.includes('/') + ? fullMdPath.slice(0, fullMdPath.lastIndexOf('/') + 1) + : ''; + + let contentList: unknown; + if (contentListPath) { + const raw = await zip.file(contentListPath)!.async('string'); + try { + contentList = JSON.parse(raw); + } catch { + log.warn('[MinerU v4] content_list JSON parse failed, continuing with markdown only'); + } + } + + async function readImage(relPath: string): Promise { + const normalized = relPath.replace(/^\.?\//, ''); + for (const candidate of [dirPrefix + normalized, normalized]) { + const entry = zip.file(candidate); + if (!entry) continue; + const buf = await entry.async('nodebuffer'); + const ext = candidate.split('.').pop() ?? 'png'; + return `data:${extToMime(ext)};base64,${buf.toString('base64')}`; + } + log.warn(`[MinerU v4] Image not found in ZIP: ${relPath}`); + return null; + } + + const imageData: Record = {}; + if (Array.isArray(contentList)) { + for (const item of contentList as Array>) { + const imgPath = item.img_path; + if (typeof imgPath === 'string' && imgPath) { + const dataUrl = await readImage(imgPath); + if (dataUrl) imageData[imgPath] = dataUrl; + } + } + } + + return extractMinerUResult({ + md_content: mdContent, + images: imageData, + content_list: contentList, + }); +} + +// ── Main export ─────────────────────────────────────────────────────────────── + +/** + * Upload a PDF via MinerU cloud v4 presigned URLs, poll until done, return parsed content. + */ +export async function parseWithMinerUCloudV4( + config: PDFParserConfig, + pdfBuffer: Buffer, + uploadFileName: string, +): Promise { + const apiRoot = config.baseUrl!.trim().replace(/\/+$/, ''); + const token = config.apiKey!.trim(); + const modelVersion = config.mineruModelVersion === 'pipeline' ? 'pipeline' : 'vlm'; + + const authHeaders = { + Authorization: `Bearer ${token}`, + 'Content-Type': 'application/json', + Accept: 'application/json', + }; + + // Step 1: Create batch and get presigned upload URL + log.info(`[MinerU v4] Creating batch for "${uploadFileName}" (model: ${modelVersion})`); + + const batchData = await fetchWithRetry(async () => { + const res = await fetch(`${apiRoot}/file-urls/batch`, { + method: 'POST', + headers: authHeaders, + body: JSON.stringify({ + files: [{ name: uploadFileName }], + model_version: modelVersion, + enable_formula: true, + enable_table: true, + }), + signal: AbortSignal.timeout(TIMEOUTS.batch), + }); + return readMinerUJson<{ batch_id: string; file_urls?: string[]; files?: string[] }>( + res, + 'file-urls/batch', + ); + }, 'create batch'); + + const uploadUrls = batchData.file_urls ?? batchData.files; + if (!batchData.batch_id || !uploadUrls?.length) { + throw new Error('MinerU batch response missing batch_id or upload URLs'); + } + + // Step 2: Upload PDF to presigned URL + log.info( + `[MinerU v4] Uploading ${pdfBuffer.byteLength} bytes to ${urlPathForLog(uploadUrls[0])}`, + ); + + const putRes = await fetchWithRetry( + () => + fetch(uploadUrls[0], { + method: 'PUT', + body: new Uint8Array(pdfBuffer), + signal: AbortSignal.timeout(TIMEOUTS.upload), + redirect: 'manual', + // No Content-Type — presigned OSS URLs are sensitive to headers in the signature + }), + 'presigned upload', + 5, + ); + + if (!putRes.ok) { + const text = await putRes.text().catch(() => putRes.statusText); + throw new Error(`MinerU upload failed (${putRes.status}): ${text.slice(0, 400)}`); + } + + // Give the backend a moment to register the upload + await sleep(1500); + + // Step 3: Poll for completion + const deadline = Date.now() + POLL_MAX_MS; + let lastState = ''; + + while (Date.now() < deadline) { + const statusData = await fetchWithRetry( + async () => { + log.debug?.(`[MinerU v4] Polling batch ${batchData.batch_id}`); + const res = await fetch(`${apiRoot}/extract-results/batch/${batchData.batch_id}`, { + headers: { Authorization: `Bearer ${token}`, Accept: 'application/json' }, + signal: AbortSignal.timeout(TIMEOUTS.poll), + }); + return readMinerUJson<{ extract_result?: BatchExtractRow | BatchExtractRow[] }>( + res, + 'extract-results/batch', + ); + }, + 'poll batch', + 3, + ); + + const rows = statusData.extract_result; + const list: BatchExtractRow[] = Array.isArray(rows) ? rows : rows ? [rows] : []; + const row = + list.find((r) => r.file_name === uploadFileName) || + list.find((r) => r.file_name?.toLowerCase() === uploadFileName.toLowerCase()) || + list[0]; + + if (!row?.state) { + log.warn('[MinerU v4] Poll returned no result row yet'); + await sleep(POLL_INTERVAL_MS); + continue; + } + + if (row.state !== lastState) { + lastState = row.state; + log.info(`[MinerU v4] Batch ${batchData.batch_id} → ${row.state}`); + } + + if (row.state === 'failed') { + throw new Error(`MinerU parsing failed: ${row.err_msg || 'unknown error'}`); + } + + if (row.state === 'done' && row.full_zip_url) { + return parseMinerUZip(row.full_zip_url); + } + + await sleep(POLL_INTERVAL_MS); + } + + throw new Error(`MinerU timed out after ${POLL_MAX_MS / 1000}s (batch: ${batchData.batch_id})`); +} diff --git a/lib/pdf/mineru-parser.ts b/lib/pdf/mineru-parser.ts new file mode 100644 index 00000000..c5255286 --- /dev/null +++ b/lib/pdf/mineru-parser.ts @@ -0,0 +1,123 @@ +/** + * MinerU result parser + * Used by both self-hosted and cloud v4 paths. + */ + +import type { ParsedPdfContent } from '@/lib/types/pdf'; +import { createLogger } from '@/lib/logger'; + +const log = createLogger('MinerUResult'); + +type ImageMeta = { + pageIdx: number; + bbox: number[]; + caption?: string; +}; + +type ContentItem = Record; + +function parseContentList(raw: unknown): ContentItem[] | null { + if (Array.isArray(raw)) return raw as ContentItem[]; + + if (typeof raw === 'string') { + try { + const parsed = JSON.parse(raw); + return Array.isArray(parsed) ? parsed : null; + } catch { + log.warn('[MinerU] content_list is not valid JSON, skipping layout metadata'); + return null; + } + } + + return null; +} + +function normalizeImageData(raw: unknown): Record { + if (!raw || typeof raw !== 'object') return {}; + + return Object.fromEntries( + Object.entries(raw as Record).map(([key, value]) => [ + key, + value.startsWith('data:') ? value : `data:image/png;base64,${value}`, + ]), + ); +} + +function buildImageMetaLookup(contentList: ContentItem[]): Map { + const lookup = new Map(); + + for (const item of contentList) { + if (item.type !== 'image' || !item.img_path) continue; + + const meta: ImageMeta = { + pageIdx: (item.page_idx as number) ?? 0, + bbox: (item.bbox as number[]) || [0, 0, 1000, 1000], + caption: Array.isArray(item.image_caption) ? (item.image_caption[0] as string) : undefined, + }; + + const imgPath = item.img_path as string; + lookup.set(imgPath, meta); + + // Also index by basename so we can match `images/foo.png` → `foo.png` + const basename = imgPath.split('/').pop(); + if (basename && basename !== imgPath) { + lookup.set(basename, meta); + } + } + + return lookup; +} + +/** + * Normalize MinerU API / ZIP output into ParsedPdfContent. + * Used by both self-hosted and cloud v4 paths. + */ +export function extractMinerUResult(fileResult: Record): ParsedPdfContent { + const markdown = (fileResult.md_content as string) || ''; + const imageData = normalizeImageData(fileResult.images); + const contentList = parseContentList(fileResult.content_list); + + const pageCount = contentList + ? new Set(contentList.map((item) => item.page_idx).filter((v) => v != null)).size + : 0; + + const metaLookup = contentList ? buildImageMetaLookup(contentList) : new Map(); + + const imageMapping: Record = {}; + const pdfImages: Array<{ + id: string; + src: string; + pageNumber: number; + description?: string; + width?: number; + height?: number; + }> = []; + + for (const [index, [key, base64Url]] of Object.entries(imageData).entries()) { + const imageId = key.startsWith('img_') ? key : `img_${index + 1}`; + const meta = metaLookup.get(key) ?? metaLookup.get(`images/${key}`); + + imageMapping[imageId] = base64Url; + pdfImages.push({ + id: imageId, + src: base64Url, + pageNumber: meta ? meta.pageIdx + 1 : 0, + description: meta?.caption, + width: meta ? meta.bbox[2] - meta.bbox[0] : undefined, + height: meta ? meta.bbox[3] - meta.bbox[1] : undefined, + }); + } + + log.info(`[MinerU] Parsed: ${pdfImages.length} images, ${markdown.length} chars`); + + return { + text: markdown, + images: Object.values(imageMapping), + metadata: { + pageCount, + parser: 'mineru', + imageMapping, + pdfImages, + }, + }; +} diff --git a/lib/pdf/pdf-providers.ts b/lib/pdf/pdf-providers.ts index edfaea06..b3409880 100644 --- a/lib/pdf/pdf-providers.ts +++ b/lib/pdf/pdf-providers.ts @@ -143,6 +143,7 @@ import type { PDFParserConfig } from './types'; import type { ParsedPdfContent } from '@/lib/types/pdf'; import { PDF_PROVIDERS } from './constants'; import { createLogger } from '@/lib/logger'; +import { parseWithMinerUCloudV4, sanitizePdfFileNameForMinerU } from './mineru-cloud'; const log = createLogger('PDFProviders'); @@ -262,6 +263,22 @@ async function parseWithUnpdf(pdfBuffer: Buffer): Promise { }; } + +// Return MinerU v4 API base URL ("https://mineru.net/api/v4") if baseUrl is a valid mineru.net (cloud) endpoint, else null +function getMinerUCloudApiBase(baseUrl: string): string | null { + try { + const url = new URL(baseUrl.trim()); + if (url.hostname.toLowerCase() !== 'mineru.net') return null; + const path = url.pathname.replace(/\/+$/, ''); + if (path === '' || path === '/' || path.startsWith('/api/v4')) { + return `${url.origin}/api/v4`; + } + return null; + } catch { + return null; + } +} + /** * Parse PDF using self-hosted MinerU service (mineru-api) * @@ -285,6 +302,24 @@ async function parseWithMinerU( ); } + // Route to cloud v4 if baseUrl points to mineru.net (mineru.net/api/v4) + // Otherwise, route to self-hosted MinerU server + const cloudApiBase = getMinerUCloudApiBase(config.baseUrl); + if (cloudApiBase) { + // MinerU cloud Precision API v4 has an upload size limit (200MB for single files). + const MAX_BYTES = 200 * 1024 * 1024; + if (pdfBuffer.byteLength > MAX_BYTES) { + const sizeMb = (pdfBuffer.byteLength / (1024 * 1024)).toFixed(1); + throw new Error(`MinerU cloud: file too large (${sizeMb}MB, max 200MB)`); + } + if (!config.apiKey?.trim()) { + throw new Error('MinerU cloud (mineru.net) requires an API token'); + } + const uploadName = sanitizePdfFileNameForMinerU(config.sourceFileName); + log.info('[MinerU] Using cloud v4 API:', cloudApiBase, 'file:', uploadName); + return parseWithMinerUCloudV4({ ...config, baseUrl: cloudApiBase }, pdfBuffer, uploadName); + } + log.info('[MinerU] Parsing PDF with MinerU server:', config.baseUrl); const fileName = 'document.pdf'; @@ -460,4 +495,4 @@ export async function getCurrentPDFConfig(): Promise { } // Re-export from constants for convenience -export { getAllPDFProviders, getPDFProvider } from './constants'; +export { getAllPDFProviders, getPDFProvider } from './constants'; \ No newline at end of file diff --git a/lib/pdf/types.ts b/lib/pdf/types.ts index 8173daed..cd89135d 100644 --- a/lib/pdf/types.ts +++ b/lib/pdf/types.ts @@ -29,3 +29,10 @@ export interface PDFParserConfig { } // Note: ParsedPdfContent is imported from @/lib/types/pdf to avoid duplication +export interface PDFParserConfig { + providerId: PDFProviderId; + apiKey?: string; + baseUrl?: string; + sourceFileName?: string; + mineruModelVersion?: 'pipeline' | 'vlm'; +} \ No newline at end of file