-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.py
199 lines (171 loc) · 7.14 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
from fastapi import FastAPI, File, Form, UploadFile, HTTPException
from fastapi.responses import PlainTextResponse
from fastapi.responses import FileResponse
from tempfile import mkdtemp
import asyncio
import pypandoc
import logging
import os
import subprocess
import sys
from docling_core.types.doc.base import ImageRefMode
app = FastAPI()
logging.basicConfig(level=logging.INFO)
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
def init_docling_converter(extract_images: bool = True):
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
PdfPipelineOptions,
RapidOcrOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
pipeline_options = PdfPipelineOptions()
ocr_options = RapidOcrOptions(force_full_page_ocr=True)
pipeline_options.ocr_options = ocr_options
pipeline_options.generate_page_images = extract_images
return DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_options=pipeline_options,
)
}
)
converter = None # 将在每次请求时根据参数初始化
@app.get("/", response_class=PlainTextResponse)
async def root():
return "hello, world!"
@app.get("/ping", response_class=PlainTextResponse)
async def pong():
return "pong"
@app.post("/convert")
async def convert(
from_format: str = Form(...),
to_format: str = Form(...),
file: UploadFile = File(None),
content: str = Form(None),
use_ocr: bool = Form(False),
extract_images: bool = Form(True)
):
# print("收到请求 from_format: ", from_format, " to_format: ", to_format , " file: ", file, " content: ", content)
logging.info(f"收到请求 from_format: {from_format}, to_format: {to_format}, file: {file}, content: {content}, use_ocr: {use_ocr}, extract_images: {extract_images}")
global converter
converter = init_docling_converter(extract_images)
if to_format == "pdf":
raise HTTPException(status_code=400, detail="Conversion to pdf is not supported")
if file is None and content is None:
raise HTTPException(status_code=400, detail="Either a file or content must be provided.")
tmpdir = mkdtemp()
input_path = os.path.join(tmpdir, f'input.{from_format}')
intermediate_path = os.path.join(tmpdir, f'intermediate.docx')
output_path = os.path.join(tmpdir, f'output.{to_format}')
try:
await save_uploaded_file(file, content, input_path)
if from_format == "pdf":
if use_ocr and to_format == "html":
await convert_pdf_with_docling(input_path, output_path)
logging.info("Docling OCR转换成功")
final_path = output_path
else:
await convert_pdf_to_docx(input_path, intermediate_path)
logging.info("PDF转换成功")
if to_format == "docx":
final_path = intermediate_path
else:
await convert_with_pandoc(from_format, intermediate_path, to_format, output_path)
logging.info("Pandoc转换成功")
final_path = output_path
else:
await convert_with_pandoc(from_format, input_path, to_format, output_path)
logging.info("Pandoc转换成功")
final_path = output_path
response = FileResponse(path=final_path, filename=f'converted.{to_format}', media_type='application/octet-stream')
asyncio.create_task(delete_files_async([input_path, intermediate_path, output_path], 60)) # Delay in seconds
return response
except Exception as e:
asyncio.create_task(delete_files_async([input_path, intermediate_path, output_path], 0)) # Immediate cleanup
logging.error(f"Error during conversion: {e}")
raise HTTPException(status_code=500, detail=str(e))
async def save_uploaded_file(file, content, path):
if file:
content = await file.read()
elif content:
content = content.encode()
with open(path, 'wb') as f:
f.write(content)
async def convert_pdf_to_docx(input_path, output_path):
# 设置超时时间(秒)
timeout = 60
# 使用子进程执行转换,这样可以完全控制进程生命周期
process = None
try:
cmd = [
sys.executable,
"-c",
f"""
import sys
from pdf2docx import Converter
import traceback
try:
cv = Converter('{input_path}')
cv.convert('{output_path}')
cv.close()
except Exception as e:
print(f"转换错误: {{type(e).__name__}}: {{str(e)}}", file=sys.stderr)
sys.exit(1)
"""
]
# 启动进程
process = subprocess.Popen(
cmd,
stdout=None,
stderr=None,
text=True
)
# 使用asyncio在事件循环中等待进程完成,并设置超时
try:
loop = asyncio.get_event_loop()
await asyncio.wait_for(
loop.run_in_executor(None, process.communicate),
timeout=timeout
)
# 检查进程返回码
if process.returncode != 0:
raise Exception(f"PDF转换失败,返回码: {process.returncode}")
except asyncio.TimeoutError:
if process.poll() is None:
process.kill()
raise Exception(f"PDF转换超时,超过了{timeout}秒的限制")
except Exception as e:
# 确保进程被终止
if process and process.poll() is None:
process.kill()
logging.error(f"PDF转换出错: {e}")
raise
async def convert_with_pandoc(from_format, input_path, to_format, output_path):
if from_format == 'txt' and to_format == 'html':
with open(input_path, 'r', encoding='utf-8') as file:
lines = file.readlines()
text = ''.join(line.strip() + ' \n' for line in lines)
output = pypandoc.convert_text(text, 'html', format='markdown')
with open(output_path, 'w', encoding='utf-8') as file:
file.write(output)
else:
pdoc_args = ["--embed-resources", "--request-header", "User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"]
pypandoc.convert_file(input_path, to_format, outputfile=output_path, extra_args=pdoc_args)
async def convert_pdf_with_docling(input_path, output_path):
try:
output_dir = os.path.dirname(output_path)
os.makedirs(output_dir, exist_ok=True)
doc = converter.convert(input_path).document
content = doc.export_to_html(image_mode=ImageRefMode.EMBEDDED)
content = content.replace("Powered by Docling", "")
with open(output_path, 'w', encoding='utf-8') as f:
f.write(content)
except Exception as e:
logging.error(f"Docling转换出错: {e}")
raise
async def delete_files_async(file_paths, delay):
await asyncio.sleep(delay)
for path in file_paths:
if os.path.exists(path):
os.remove(path)