|
| 1 | +#!/usr/bin/env python3 |
| 2 | + |
| 3 | +import fitz #pip install pymupdf |
| 4 | +import re |
| 5 | +import os |
| 6 | + |
| 7 | + |
def _first_attr(obj, *names):
    """Return the first attribute of ``obj`` that exists among ``names``.

    PyMuPDF renamed several camelCase methods to snake_case; looking the
    method up by either name keeps the caller working on old and new releases.

    Raises:
        AttributeError: If ``obj`` has none of the requested attributes.
    """
    for name in names:
        attr = getattr(obj, name, None)
        if attr is not None:
            return attr
    raise AttributeError(
        "{!r} has none of the attributes: {}".format(obj, ", ".join(names))
    )


def find_imag(path, img_path):
    """Extract the embedded images of a PDF and save each one as a PNG file.

    Every cross-reference entry of the document is inspected; entries whose
    object definition contains both ``/Type /XObject`` and ``/Subtype /Image``
    are rendered to a pixmap and written to ``img_path`` under the name
    ``<sanitized path>_img<N>.png``. A summary is printed to stdout before and
    after the extraction.

    Args:
        path: Path of the PDF file to scan.
        img_path: Directory receiving the PNG files; created if it is missing.

    Returns:
        The number of images that were written.
    """
    # Compile once: both patterns are applied to every object of the file.
    xobject_re = re.compile(r"/Type(?= */XObject)")
    image_re = re.compile(r"/Subtype(?= */Image)")

    # Flatten the source path into a single file-name component. Both kinds of
    # path separator are replaced (a leftover "/" would make os.path.join
    # create nested names, or drop img_path entirely for absolute paths) and
    # the drive colon is removed.
    stem = path.translate(str.maketrans({"\\": "_", "/": "_", ":": None}))

    img_count = 0
    pdf = fitz.open(path)
    try:
        # Prefer the current PyMuPDF names, fall back to the legacy aliases.
        xref_length = _first_attr(pdf, "xref_length", "_getXrefLength")
        xref_object = _first_attr(pdf, "xref_object", "_getXrefString")

        len_xref = xref_length()
        print("文件名:{}, 页数: {}, 对象: {}".format(path, len(pdf), len_xref - 1))

        if img_path:
            os.makedirs(img_path, exist_ok=True)

        # The loop starts at 1, so entry 0 of the xref table is never read.
        for xref in range(1, len_xref):
            text = xref_object(xref)

            # Skip anything that is not declared as an image XObject.
            if not (xobject_re.search(text) and image_re.search(text)):
                continue
            img_count += 1

            # Build a pixmap from the image stored at this xref.
            pix = fitz.Pixmap(pdf, xref)
            target = os.path.join(
                img_path, "{}_img{}.png".format(stem, img_count)
            )

            # pix.n includes the alpha channel, so the number of colour
            # components is n - alpha. Four or more means CMYK, which has to
            # be converted to RGB before it can be stored as PNG.
            if pix.n - pix.alpha >= 4:
                pix = fitz.Pixmap(fitz.csRGB, pix)

            _first_attr(pix, "save", "writePNG")(target)
            pix = None  # release the pixmap before handling the next image
    finally:
        # Always release the document, even when an image fails to convert.
        pdf.close()

    print("提取了{}张图片".format(img_count))
    return img_count
| 49 | + |
| 50 | + |
if __name__ == "__main__":
    # Script entry point: scan "test.pdf" in the working directory and place
    # the extracted images in the "img" directory next to it.
    pdf_path = "test.pdf"
    img_path = "img"
    # Whatever find_imag returns is kept in the module-level name `m`.
    m = find_imag(pdf_path, img_path)
0 commit comments