Extract images from PDF without resampling, in python?
如何以原始分辨率和格式从pdf文档中提取所有图像? (意味着将tiff提取为tiff,将jpeg提取为jpeg等,而无需重新采样)。 布局并不重要,我不在乎源图像是否位于页面上。
我使用的是python 2.7,但可以根据需要使用3.x。
在带有PyPDF2和Pillow库的Python中,它很简单:
import PyPDF2

from PIL import Image

# Extract every image XObject from the first page of "input.pdf" and write it
# to the current directory, named after the XObject key (e.g. "/Im0" ->
# "Im0.png"). Flate-encoded images are rebuilt with Pillow; DCT (JPEG) and
# JPX (JPEG 2000) streams are written out unchanged.
if __name__ == '__main__':
    # Keep the PDF open until all image data has been read, then close it
    # deterministically instead of leaking the handle.
    with open("input.pdf", "rb") as pdf_file:
        input1 = PyPDF2.PdfFileReader(pdf_file)
        page0 = input1.getPage(0)
        xObject = page0['/Resources']['/XObject'].getObject()

        for obj in xObject:
            image = xObject[obj]
            if image['/Subtype'] != '/Image':
                continue

            size = (image['/Width'], image['/Height'])
            data = image.getData()
            if image['/ColorSpace'] == '/DeviceRGB':
                mode = "RGB"
            else:
                mode = "P"

            # An image stream without a /Filter entry is skipped instead of
            # aborting the whole run with a KeyError.
            image_filter = image.get('/Filter')
            if image_filter == '/FlateDecode':
                img = Image.frombytes(mode, size, data)
                img.save(obj[1:] + ".png")
            elif image_filter == '/DCTDecode':
                with open(obj[1:] + ".jpg", "wb") as img:
                    img.write(data)
            elif image_filter == '/JPXDecode':
                with open(obj[1:] + ".jp2", "wb") as img:
                    img.write(data)
通常以PDF格式将图像按原样存储。例如,插入jpg的PDF在中间的某个位置会有一定范围的字节,而这些字节在提取时是有效的jpg文件。您可以使用它非常简单地从PDF中提取字节范围。我前一段时间写过有关此示例的示例代码:从PDF提取JPG。
在带有用于CCITTFaxDecode过滤器的PyPDF2的Python中:
"""
Links:
PDF format: http://www.adobe.com/content/dam/Adobe/en/devnet/acrobat/pdfs/pdf_reference_1-7.pdf
CCITT Group 4: https://www.itu.int/rec/dologin_pub.asp?lang=e&id=T-REC-T.6-198811-I!!PDF-E&type=items
Extract images from pdf: http://stackoverflow.com/questions/2693820/extract-images-from-pdf-without-resampling-in-python
Extract images coded with CCITTFaxDecode in .net: http://stackoverflow.com/questions/2641770/extracting-image-from-pdf-with-ccittfaxdecode-filter
TIFF format and tags: http://www.awaresystems.be/imaging/tiff/faq.html
"""
import struct


def tiff_header_for_CCITT(width, height, img_size, CCITT_group=4):
    """Build a little-endian TIFF header for one strip of CCITT fax data.

    The returned bytes are meant to be written directly in front of the raw
    CCITT-encoded stream; StripOffsets points at the first byte after the
    header.

    Args:
        width: image width in pixels.
        height: image height in pixels (also used as RowsPerStrip).
        img_size: length in bytes of the encoded image data that follows.
        CCITT_group: value stored in the Compression tag (3 or 4).

    Returns:
        The packed header as bytes.
    """
    # Header (8 bytes) + entry count (2) + 8 entries of 12 bytes + offset of
    # the next IFD. TIFF defines that trailing offset as 4 bytes, hence 'l'
    # rather than 'h'; with a 2-byte field a reader would pick up the first
    # two bytes of image data as part of the offset.
    tiff_header_struct = '<' + '2s' + 'h' + 'l' + 'h' + 'hhll' * 8 + 'l'
    return struct.pack(
        tiff_header_struct,
        b'II',  # Byte order indication: little endian
        42,  # Version number (always 42)
        8,  # Offset to first IFD
        8,  # Number of tags in IFD
        256, 4, 1, width,  # ImageWidth, LONG, 1, width
        257, 4, 1, height,  # ImageLength, LONG, 1, length
        258, 3, 1, 1,  # BitsPerSample, SHORT, 1, 1
        259, 3, 1, CCITT_group,  # Compression, SHORT, 1, 4 = CCITT Group 4 fax encoding
        262, 3, 1, 0,  # PhotometricInterpretation, SHORT, 1, 0 = WhiteIsZero
        273, 4, 1, struct.calcsize(tiff_header_struct),  # StripOffsets, LONG, 1, len of header
        278, 4, 1, height,  # RowsPerStrip, LONG, 1, length
        279, 4, 1, img_size,  # StripByteCounts, LONG, 1, size of image
        0  # last IFD
    )


def main(pdf_filename='scan.pdf'):
    """Write every CCITTFaxDecode image in *pdf_filename* out as a .tiff file.

    Files are named after the XObject key (e.g. "/Im0" -> "Im0.tiff") and are
    written to the current directory.
    NOTE(review): the same key on different pages yields the same file name,
    so later pages overwrite earlier ones - confirm whether that matters.
    """
    # Imported here so the header helper stays usable without PyPDF2.
    import PyPDF2

    with open(pdf_filename, 'rb') as pdf_file:
        cond_scan_reader = PyPDF2.PdfFileReader(pdf_file)

        for i in range(0, cond_scan_reader.getNumPages()):
            page = cond_scan_reader.getPage(i)
            xObject = page['/Resources']['/XObject'].getObject()

            for obj in xObject:
                image = xObject[obj]
                if image['/Subtype'] != '/Image':
                    continue
                if image['/Filter'] != '/CCITTFaxDecode':
                    continue

                # The CCITTFaxDecode filter decodes image data that has been
                # encoded using either Group 3 or Group 4 CCITT facsimile
                # (fax) encoding. CCITT encoding is designed to achieve
                # efficient compression of monochrome (1 bit per pixel) image
                # data at relatively low resolutions, and so is useful only
                # for bitmap image data, not for color images, grayscale
                # images, or general data.
                #
                # K < 0 --- Pure two-dimensional encoding (Group 4)
                # K = 0 --- Pure one-dimensional encoding (Group 3, 1-D)
                # K > 0 --- Mixed one- and two-dimensional encoding (Group 3, 2-D)
                #
                # Any negative K means Group 4, not only -1.
                if image['/DecodeParms']['/K'] < 0:
                    CCITT_group = 4
                else:
                    CCITT_group = 3

                width = image['/Width']
                height = image['/Height']
                data = image._data  # sorry, getData() does not work for CCITTFaxDecode
                img_size = len(data)
                tiff_header = tiff_header_for_CCITT(width, height, img_size, CCITT_group)
                img_name = obj[1:] + '.tiff'
                with open(img_name, 'wb') as img_file:
                    img_file.write(tiff_header + data)
                # To load the result with Pillow instead of writing a file:
                #     import io
                #     from PIL import Image
                #     im = Image.open(io.BytesIO(tiff_header + data))


if __name__ == '__main__':
    main()
您可以使用模块PyMuPDF。这会将所有图像输出为.png文件,但是开箱即用且速度很快。
import fitz

# Save every image referenced by every page of "file.pdf" as
# "p<page index>-<xref>.png" in the current directory.
doc = fitz.open("file.pdf")
for page_index in range(len(doc)):
    for image_entry in doc.getPageImageList(page_index):
        xref = image_entry[0]
        target = "p%s-%s.png" % (page_index, xref)
        pix = fitz.Pixmap(doc, xref)
        if pix.n >= 5:
            # CMYK: convert to RGB first
            pix = fitz.Pixmap(fitz.csRGB, pix)
        # otherwise this is GRAY or RGB and can be written as-is
        pix.writePNG(target)
        pix = None  # drop the reference so the pixmap can be released
在这里查看更多资源
Libpoppler附带了一个名为"pdfimages"的工具,可以完成此任务。
(在ubuntu系统上,它在poppler-utils软件包中)
http://poppler.freedesktop.org/
http://en.wikipedia.org/wiki/Pdfimages
Windows二进制文件:http://blog.alivate.com.au/poppler-windows/
我更喜欢minecart,因为它非常易于使用。以下代码段显示了如何从pdf中提取图像:
# pip install minecart
import minecart

# Show every image of every page of 'Invoices.pdf' as a PIL image.
# The file is kept open while the document is being read and closed afterwards.
with open('Invoices.pdf', 'rb') as pdffile:
    doc = minecart.Document(pdffile)

    page = doc.get_page(0)  # getting a single page

    # iterating through all pages
    for page in doc.iter_pages():
        # Walk all images of the page; indexing images[0] would fail on a
        # page without images and ignore any further images.
        for image in page.images:
            im = image.as_pil()  # requires pillow
            # NOTE(review): display() is not defined in this snippet -
            # presumably the IPython/Jupyter helper; outside a notebook use
            # im.show() or im.save(...) instead.
            display(im)
我从@sylvain的代码开始
存在一些缺陷,例如getData的异常
这是我的代码:
import PyPDF2

from PIL import Image

import sys
from os import path
import warnings
warnings.filterwarnings("ignore")

# Running total of image files written by recurse().
number = 0


def recurse(page, xObject):
    """Write all images reachable from *xObject* to disk.

    Args:
        page: page number, used only to build the output file name.
        xObject: a page or XObject dictionary; its /Resources -> /XObject
            entries are scanned, and non-image entries are scanned
            recursively.

    Output files are named "<pdf path without extension> - p. <page> - <key>"
    plus an extension chosen from the stream filter. The module-level
    counter `number` is incremented once per file written.
    """
    global number

    try:
        xObject = xObject['/Resources']['/XObject'].getObject()
    except KeyError:
        # Nothing nested here (no /Resources or no /XObject): this also ends
        # the recursion for non-image XObjects without resources.
        return

    for obj in xObject:
        entry = xObject[obj]

        if entry['/Subtype'] == '/Image':
            size = (entry['/Width'], entry['/Height'])
            data = entry._data
            if entry['/ColorSpace'] == '/DeviceRGB':
                mode = "RGB"
            else:
                mode = "P"

            # Use the page number passed in rather than the global loop
            # variable of the calling script.
            imagename = "%s - p. %s - %s" % (abspath[:-4], page, obj[1:])

            if entry['/Filter'] == '/FlateDecode':
                img = Image.frombytes(mode, size, data)
                img.save(imagename + ".png")
                number += 1
            elif entry['/Filter'] == '/DCTDecode':
                with open(imagename + ".jpg", "wb") as img:
                    img.write(data)
                number += 1
            elif entry['/Filter'] == '/JPXDecode':
                with open(imagename + ".jp2", "wb") as img:
                    img.write(data)
                number += 1
        else:
            recurse(page, entry)


try:
    # Too few arguments and non-numeric page numbers both raise ValueError.
    _, filename, *pages = sys.argv
    pages = [int(page_arg) for page_arg in pages]
    abspath = path.abspath(filename)
except ValueError:
    print('Usage : PDF_extract_images file.pdf page1 page2 page3 …')
    sys.exit(1)

with open(filename, "rb") as pdf_file:
    reader = PyPDF2.PdfFileReader(pdf_file)
    for p in pages:
        # Page numbers on the command line are 1-based.
        page0 = reader.getPage(p - 1)
        recurse(p, page0)

print('%s extracted images'% number)
经过一番搜索,我发现以下脚本非常适合我的PDF。它只能处理JPG,但可以与我不受保护的文件完美配合。也是不需要任何外部库的。
不客气,该脚本源自Ned Batchelder,而不是我。
Python3代码:从pdf中提取jpg。又快又脏
import sys

# Byte markers that open and close a JPG, plus the adjustments applied to the
# positions where they are found.
SOI_MARKER = b"\xff\xd8"
EOI_MARKER = b"\xff\xd9"
SOI_OFFSET = 0
EOI_OFFSET = 2  # include the two bytes of the end marker itself

# Read the whole PDF (first command-line argument) into memory.
with open(sys.argv[1], "rb") as pdf_handle:
    pdf_handle.seek(0)
    pdf_bytes = pdf_handle.read()

search_from = 0
jpg_count = 0

# Scan for each "stream" keyword; a stream counts as a JPG when the start
# marker occurs within the 20 bytes after the keyword.
stream_pos = pdf_bytes.find(b"stream", search_from)
while stream_pos >= 0:
    jpg_start = pdf_bytes.find(SOI_MARKER, stream_pos, stream_pos + 20)
    if jpg_start >= 0:
        jpg_end = pdf_bytes.find(b"endstream", jpg_start)
        if jpg_end < 0:
            raise Exception("Didn't find end of stream!")
        # The end marker is searched for starting 20 bytes before "endstream".
        jpg_end = pdf_bytes.find(EOI_MARKER, jpg_end - 20)
        if jpg_end < 0:
            raise Exception("Didn't find end of JPG!")

        jpg_start += SOI_OFFSET
        jpg_end += EOI_OFFSET
        print("JPG %d from %d to %d" % (jpg_count, jpg_start, jpg_end))
        with open("jpg%d.jpg" % jpg_count, "wb") as jpg_handle:
            jpg_handle.write(pdf_bytes[jpg_start:jpg_end])

        jpg_count += 1
        search_from = jpg_end
    else:
        # Not a JPG stream: continue searching just past this keyword.
        search_from = stream_pos + 20
    stream_pos = pdf_bytes.find(b"stream", search_from)
更简单的解决方案:
使用poppler-utils软件包。要安装它,请使用Homebrew(Homebrew是macOS专用的,但是您可以在此处找到适用于Windows或Linux的poppler-utils软件包:https://poppler.freedesktop.org/)。下面的第一行代码使用Homebrew安装poppler-utils。安装后,第二行(从命令行运行)从PDF文件提取图像并将其命名为"image*"。要从Python内部运行此程序,请使用os或subprocess模块。第三行是使用os模块的代码,下面是使用subprocess模块的示例(Python 3.5或更高版本的run()函数)。此处提供更多信息:https://www.cyberciti.biz/faq/easily-extract-images-from-pdf-file/
import os

# Run the pdfimages command-line tool through the system shell.
command = 'pdfimages file.pdf image'
os.system(command)
或者
import subprocess

# Run pdfimages directly with an argument list: no shell is involved, so the
# file names are passed through without shell parsing or quoting issues.
# check=True raises CalledProcessError on a non-zero exit status instead of
# silently ignoring a failed extraction.
subprocess.run(['pdfimages', 'file.pdf', 'image'], check=True)
我在服务器上安装了ImageMagick,然后通过
#!/usr/bin/python

import sys
import os
import subprocess
import settings

# Directory that holds the input PDFs and receives the generated images.
IMAGE_PATH = os.path.join(settings.MEDIA_ROOT , 'pdf_input' )


def extract_images(pdf):
    """Run ImageMagick `convert` on a PDF stored in IMAGE_PATH.

    Args:
        pdf: file name of the PDF, relative to IMAGE_PATH.

    Returns:
        The exit status of the `convert` process.

    The output target is 'temp.png' inside IMAGE_PATH.
    """
    output = 'temp.png'
    # Build the argument list directly; splitting a joined command string on
    # whitespace breaks as soon as a path contains a space.
    cmd = [
        'convert',
        os.path.join(IMAGE_PATH, pdf),
        os.path.join(IMAGE_PATH, output),
    ]
    proc = subprocess.Popen(cmd, stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
    # Drain the pipe and wait for completion: an unread PIPE can stall the
    # child once the pipe buffer fills, and an unwaited child is never reaped.
    proc.communicate()
    return proc.returncode
这将为每个页面创建一个图像,并将它们存储为temp-0.png,temp-1.png...。
只有当您的PDF仅包含图片而没有文本时,这才算得上是"提取"。
这是我的2019年版本,它递归地从PDF获取所有图像并使用PIL读取它们。
与Python 2/3兼容。 我还发现zlib有时可能会压缩PDF中的图像,因此我的代码支持解压缩。
#!/usr/bin/env python3
try:
    from StringIO import StringIO
except ImportError:
    from io import BytesIO as StringIO
from PIL import Image
from PyPDF2 import PdfFileReader, generic
import zlib


def get_color_mode(obj):
    """Return a mode string for an image object's /ColorSpace.

    Args:
        obj: image XObject dictionary.

    Returns:
        "RGB", "CMYK" or "P", or None when there is no /ColorSpace entry or
        it is not one of the handled kinds (DeviceRGB, DeviceCMYK,
        DeviceGray, ICCBased with 1, 3 or 4 components).
    """
    try:
        cspace = obj['/ColorSpace']
    except KeyError:
        return None

    if cspace == '/DeviceRGB':
        return "RGB"
    elif cspace == '/DeviceCMYK':
        return "CMYK"
    elif cspace == '/DeviceGray':
        # NOTE(review): "P" is Pillow's palette mode; "L" may be the intended
        # mode for grayscale data - confirm before relying on this value.
        return "P"

    if isinstance(cspace, generic.ArrayObject) and cspace[0] == '/ICCBased':
        # /N is read from the ICC profile stream and mapped to a mode below.
        color_map = obj['/ColorSpace'][1].getObject()['/N']
        if color_map == 1:
            return "P"
        elif color_map == 3:
            return "RGB"
        elif color_map == 4:
            return "CMYK"
    return None


def get_object_images(x_obj):
    """Collect images from an /XObject dictionary, recursing into nested ones.

    Args:
        x_obj: resolved /XObject dictionary of a page or of a nested object.

    Returns:
        A list of (mode, (width, height), data) tuples. `data` is the stream
        content, zlib-decompressed when /Filter contains /FlateDecode.
    """
    images = []
    for obj_name in x_obj:
        sub_obj = x_obj[obj_name]

        if '/Resources' in sub_obj and '/XObject' in sub_obj['/Resources']:
            images += get_object_images(sub_obj['/Resources']['/XObject'].getObject())
        elif sub_obj.get('/Subtype') == '/Image':
            # Work on a local copy of the stream bytes: writing the
            # decompressed data back into sub_obj._data would make a second
            # pass over the same object try to decompress it again.
            data = sub_obj._data
            if '/FlateDecode' in sub_obj.get('/Filter', ''):
                data = zlib.decompress(data)

            images.append((
                get_color_mode(sub_obj),
                (sub_obj['/Width'], sub_obj['/Height']),
                data
            ))

    return images


def get_pdf_images(pdf_fp):
    """Return all images found in the PDF at path *pdf_fp*.

    Args:
        pdf_fp: path of the PDF file.

    Returns:
        A list of (mode, size, data) tuples as produced by
        get_object_images(); an empty list when the file cannot be opened or
        parsed. Pages without /Resources or /XObject are skipped.
    """
    images = []

    try:
        pdf_file = open(pdf_fp, "rb")
    except (IOError, OSError):
        return images

    # Keep the file open while pages are read, and always close it afterwards.
    with pdf_file:
        try:
            pdf_in = PdfFileReader(pdf_file)
        except Exception:
            # Best effort: an unreadable PDF yields no images. Unlike a bare
            # except, this does not swallow KeyboardInterrupt / SystemExit.
            return images

        for p_n in range(pdf_in.numPages):
            page = pdf_in.getPage(p_n)

            try:
                page_x_obj = page['/Resources']['/XObject'].getObject()
            except KeyError:
                continue

            images += get_object_images(page_x_obj)

    return images


if __name__ == "__main__":

    pdf_fp = "test.pdf"

    for image in get_pdf_images(pdf_fp):
        (mode, size, data) = image
        try:
            img = Image.open(StringIO(data))
        except Exception as e:
            print ("Failed to read image with PIL: {}".format(e))
            continue
        # Do whatever you want with the image
截至2019年2月,@ sylvain提供的解决方案(至少在我的设置上)未经少量修改就无法使用:
import traceback

import PyPDF2
from PIL import Image

# Extract the images of every page and write them to the current directory
# as "p<page number, 3 digits>-<XObject key>" plus an extension chosen from
# the stream filter.
# NOTE(review): `src` is not defined in this snippet - presumably the path of
# the input PDF; it must be set before this code runs.
# The print calls take a single argument so they behave the same on
# Python 2 and Python 3.
with open(src, "rb") as pdf_file:
    input1 = PyPDF2.PdfFileReader(pdf_file)
    nPages = input1.getNumPages()
    print(nPages)

    for i in range(nPages):
        print(i)
        page0 = input1.getPage(i)
        try:
            xObject = page0['/Resources']['/XObject'].getObject()
        except Exception:
            # Best effort: a page whose XObjects cannot be read is treated as
            # having no images (Ctrl-C is no longer swallowed here).
            xObject = []

        for obj in xObject:
            entry = xObject[obj]
            if entry['/Subtype'] != '/Image':
                continue

            size = (entry['/Width'], entry['/Height'])
            data = entry.getData()
            try:
                if entry['/ColorSpace'] == '/DeviceRGB':
                    mode = "RGB"
                elif entry['/ColorSpace'] == '/DeviceCMYK':
                    mode = "CMYK"
                    # will cause errors when saving
                else:
                    mode = "P"

                fn = 'p%03d-%s' % (i + 1, obj[1:])
                print('\t' + fn)
                # /Filter is tested with `in`, so it matches both a single
                # name and a list of filter names.
                if '/FlateDecode' in entry['/Filter']:
                    img = Image.frombytes(mode, size, data)
                    img.save(fn + ".png")
                elif '/DCTDecode' in entry['/Filter']:
                    with open(fn + ".jpg", "wb") as img:
                        img.write(data)
                elif '/JPXDecode' in entry['/Filter']:
                    with open(fn + ".jp2", "wb") as img:
                        img.write(data)
                elif '/LZWDecode' in entry['/Filter']:
                    with open(fn + ".tif", "wb") as img:
                        img.write(data)
                else:
                    print('Unknown format: %s' % (entry['/Filter'],))
            except Exception:
                # Report the failure and carry on with the next image.
                traceback.print_exc()
您也可以在Ubuntu中使用pdfimages。
使用以下命令安装poppler lib。
# Install the poppler command-line utilities.
sudo apt install poppler-utils

# Install the Python bindings for poppler.
sudo apt-get install python-poppler

# Extract the images of file.pdf into files whose names start with "image".
pdfimages file.pdf image
创建的文件列表为(例如,.pdf中有两个图像)
image-000.png
image-001.png
有用 !现在,您可以使用
我在这里将所有这些加到了PyPDFTK中。
我自己的贡献是处理/Indexed(索引色)图像:
# Save Flate-encoded image XObjects as PNG, expanding /Indexed (palette)
# images to RGB first.
# NOTE(review): xObject, pdf, img_modes, Image, filename_prefix and i are not
# defined in this fragment; they must come from the surrounding code.
for obj in xObject:
    if xObject[obj]['/Subtype'] == '/Image':
        size = (xObject[obj]['/Width'], xObject[obj]['/Height'])
        color_space = xObject[obj]['/ColorSpace']
        # An /Indexed colour space is an array rather than a plain name, so
        # its four elements are resolved and unpacked; `lookup` supplies the
        # palette data used further down.
        if isinstance(color_space, pdf.generic.ArrayObject) and color_space[0] == '/Indexed':
            color_space, base, hival, lookup = [v.getObject() for v in color_space] # pg 262
        # presumably img_modes maps colour-space names to PIL mode strings -
        # verify against its definition
        mode = img_modes[color_space]

        if xObject[obj]['/Filter'] == '/FlateDecode':
            data = xObject[obj].getData()
            img = Image.frombytes(mode, size, data)
            if color_space == '/Indexed':
                # Attach the palette from the lookup stream, then convert the
                # palette image to RGB before saving.
                img.putpalette(lookup.getData())
                img = img.convert('RGB')
            # File name: prefix followed by i formatted with a width of 4.
            img.save("{}{:04}.png".format(filename_prefix, i))
请注意,找到/Indexed图像时,/ColorSpace是一个ArrayObject而不是字符串,因此不能直接将其与字符串进行比较。
我的第一个直觉是将它们另存为GIF(这是一种索引格式),但是我的测试表明PNG较小,并且外观相同。
使用Foxit Reader PDF Printer打印到PDF时,我发现了这些类型的图像。