本文的方法主要实现批处理pdf2txt。强推方法二!!!
方法一:使用pdfminer3k
参考来自GitHub的代码。
"""Batch-convert PDF files to text with pdfminer3k, running OCR on embedded images.

Usage: python <this script> <target_dir>

Every ``*.pdf`` found under ``target_dir`` (recursively) gets a sibling
``.txt`` file written next to it.
"""
import importlib
import os
import shutil
import sys

import numpy as np
import pytesseract
from aip import AipOcr
from PIL import Image
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTFigure, LTImage, LTChar, LTTextLine
from pdfminer.pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdftypes import LITERALS_DCT_DECODE, LITERALS_FLATE_DECODE

# Kept from the original script.
# NOTE(review): looks like a leftover of the Python 2 "reload(sys)" idiom - confirm it is still needed.
importlib.reload(sys)

TMPDIR = 'tmp/'      # scratch directory for images extracted from the PDF
PARSEIMG = True      # when True, embedded images are saved and OCR'ed
OCR_ONLINE = False   # True -> Baidu OCR, False -> local tesseract

# Marker line appended after each OCR'ed image; adjust() drops it again.
SEPARATOR = '-----------'


######################################
# tesseract OCR
def img_to_str_tesseract(image_path, lang='chi_sim'):
    """Return the text pytesseract recognises in the image at *image_path*.

    *lang* is passed straight to ``pytesseract.image_to_string``.
    """
    # Context manager so the image file handle is released right away.
    with Image.open(image_path) as img:
        return pytesseract.image_to_string(img, lang)


######################################
# Baidu OCR
# Credentials for the Baidu client; fill these in before setting OCR_ONLINE = True.
config = {
    'appId': '',
    'apiKey': '',
    'secretKey': ''
}

client = AipOcr(**config)


def img_to_str_baidu(image_path):
    """Return the text recognised by ``client.basicGeneral`` for the image.

    The ``words`` field of every entry in ``words_result`` is joined with
    newlines; an empty string is returned when the response carries no
    ``words_result`` key.
    """
    with open(image_path, 'rb') as fp:
        image = fp.read()
    result = client.basicGeneral(image)
    if 'words_result' in result:
        return '\n'.join(w['words'] for w in result['words_result'])
    return ""


######################################
# PDF parsing
def _create_bmp(data, bits, width, height):
    """Build BMP file bytes via pdfminer's ``create_bmp`` helper."""
    # The original script called create_bmp() without importing it.
    # NOTE(review): presumably it lives in pdfminer.utils in pdfminer3k - confirm.
    # Imported lazily so a missing helper only affects BMP images.
    from pdfminer.utils import create_bmp
    return create_bmp(data, bits, width, height)


def write_image(image, outdir):
    """Save an ``LTImage`` into *outdir* and return ``(path, byte_count)``.

    A stream whose single filter is a DCT filter is written as-is with a
    ``.jpg`` extension; DeviceRGB / DeviceGray images are wrapped as ``.bmp``;
    anything else is dumped with an ``.img`` extension.
    """
    stream = image.stream
    filters = stream.get_filters()
    if len(filters) == 1 and filters[0] in LITERALS_DCT_DECODE:
        ext = '.jpg'
        data = stream.get_rawdata()
    elif image.colorspace is LITERAL_DEVICE_RGB:
        ext = '.bmp'
        data = _create_bmp(stream.get_data(), stream.bits * 3, image.width, image.height)
    elif image.colorspace is LITERAL_DEVICE_GRAY:
        ext = '.bmp'
        data = _create_bmp(stream.get_data(), stream.bits, image.width, image.height)
    else:
        ext = '.img'
        data = stream.get_data()
    path = os.path.join(outdir, image.name + ext)
    with open(path, 'wb') as fp:
        fp.write(data)
    return path, len(data)


def write_file(path, text, ftype, debug=False):
    """Write *text* to *path* using open-mode *ftype* (e.g. ``'a'`` or ``'w'``).

    Text modes are written as UTF-8 so the result does not depend on the
    platform's default encoding; binary modes are left untouched.
    """
    encoding = None if 'b' in ftype else 'utf-8'
    with open(path, ftype, encoding=encoding) as f:
        if debug:
            print("write", len(text))
        f.write(text)


def adjust(inpath, outpath):
    """Remove superfluous line breaks from *inpath* and write the result to *outpath*.

    A line at least as long as the median line length has its trailing
    newline stripped, so it is glued to the following line. Shorter lines are
    kept unchanged, except the image separator line, which is dropped.
    """
    with open(inpath, encoding='utf-8') as f:
        lines = f.readlines()
    # Median line length; guard the empty file so np.median does not warn.
    length = np.median([len(line) for line in lines]) if lines else 0
    pieces = []
    for line in lines:
        if len(line) >= length and line[-1] == '\n':
            pieces.append(line[:-1])  # strip the trailing line break
        elif line == SEPARATOR + '\n':
            continue
        else:
            pieces.append(line)
    write_file(outpath, ''.join(pieces), 'w')


def parse_section(layout, outpath, debug=False):
    """Append the text of every element in *layout* to *outpath*.

    Horizontal text boxes are written directly, figures are descended into
    recursively, and (when PARSEIMG is set) images are saved to TMPDIR,
    OCR'ed, and followed by a separator line.
    """
    for x in layout:
        if isinstance(x, LTTextBoxHorizontal):  # text
            write_file(outpath, x.get_text(), 'a')
        elif isinstance(x, LTFigure):
            parse_section(x, outpath)
        elif isinstance(x, LTImage) and PARSEIMG:  # image
            path, length = write_image(x, TMPDIR)
            if length > 0:
                ocr = img_to_str_baidu if OCR_ONLINE else img_to_str_tesseract
                write_file(outpath, ocr(path), 'a')
                write_file(outpath, '\n' + SEPARATOR + '\n', 'a')


def remove(path):
    """Delete *path* (a file or a whole directory tree); do nothing if it is absent."""
    if not os.path.exists(path):
        return
    if os.path.isfile(path):
        os.remove(path)
    else:
        shutil.rmtree(path)


def parse(inpath, outpath):
    """Extract the text of the PDF at *inpath* into *outpath*.

    TMPDIR is recreated empty and any previous *outpath* is deleted first.

    Raises:
        PDFTextExtractionNotAllowed: if the document reports it is not extractable.
    """
    remove(TMPDIR)   # start from an empty scratch directory
    os.mkdir(TMPDIR)
    remove(outpath)  # parse_section appends, so stale output must go

    # Pages are read lazily from the file, so it stays open for the whole loop.
    with open(inpath, 'rb') as fp:
        parser = PDFParser(fp)
        # NOTE(review): the parser is passed as PDFDocument's only argument, as in
        # the original script - confirm against the pdfminer3k API.
        doc = PDFDocument(parser)
        parser.set_document(doc)  # link parser and document both ways
        doc.set_parser(parser)
        doc.initialize("")        # empty password

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for idx, page in enumerate(doc.get_pages()):
            interpreter.process_page(page)
            layout = device.get_result()
            print("parse", idx)
            parse_section(layout, outpath)


def main():
    """Convert every PDF under the directory given as the first CLI argument."""
    if len(sys.argv) < 2:
        print("usage: python", sys.argv[0], "<target_dir>")
        sys.exit(1)
    target_path = sys.argv[1]
    for base_path, _folder_list, file_list in os.walk(target_path):
        for file_name in file_list:
            stem, ext = os.path.splitext(file_name)
            if ext.lower() != '.pdf':  # not a PDF file
                continue
            # os.walk yields bare names: join with base_path so files in the
            # target directory and its subfolders are actually found.
            pdffile = os.path.join(base_path, file_name)
            # Swap only the extension; str.replace('pdf', ...) would also
            # rewrite any other "pdf" inside the path.
            tmpfile = os.path.join(base_path, stem + '.tmp')
            txtfile = os.path.join(base_path, stem + '.txt')
            try:
                parse(pdffile, tmpfile)
                adjust(tmpfile, txtfile)
            except Exception as e:
                # Batch boundary: report the failure and carry on with the next file.
                print(file_name, " error!", e)
            finally:
                remove(tmpfile)  # drop the intermediate file, also on failure


if __name__ == '__main__':
    main()
笔者根据自己的需要进行过调整。
!坑
一定不要安装pdfminer,要装的是pdfminer3k啊大兄弟。如果不幸已经装了pdfminer,可以先uninstall pdfminer和pdfminer3k,然后再install pdfminer3k;如果提示有已经存在的包,就根据提示把它们全部删掉,再重新install pdfminer3k。
跪求大神解答为何pdfminer和pdfminer3k会导包混乱
方法二:借助xpdf
参考自知乎,根据自己的需要和pdfminer3k代码进行优化:
"""Batch-convert the PDFs under ./resourses to text files in ./results via xpdf's pdftotext."""
import os
import subprocess
from os.path import isfile, join

import numpy as np

ef = r'./xpdf/pdftotext.exe'   # pdftotext executable
cfg = r'./xpdf/xpdfrc'         # xpdf configuration file passed via -cfg


def convert(file_name_pdf):
    """Run pdftotext on ``./resourses/<file_name_pdf>`` and return its output as ``str``.

    pdftotext is invoked with ``-f 1 -l 1000 -raw`` and ``-`` as the output
    file, and its captured stdout is decoded as UTF-8.

    Raises:
        subprocess.CalledProcessError: if pdftotext exits with a non-zero status.
    """
    file_name_pdf = join(r'./resourses', file_name_pdf)
    # NOTE(review): the original author warns that every file argument of this
    # command must be a full path, yet relative paths are used - confirm.
    bo = subprocess.check_output(
        [ef, '-f', '1', '-l', '1000', '-cfg', cfg, '-raw', file_name_pdf, '-'])
    return bo.decode('utf-8')


def write_file(bo, file_name, method="wb"):
    """Write *bo* to ``./results/<file_name>`` using open-mode *method*.

    Missing directories (``./results`` itself or sub-folders contained in
    *file_name*) are created first.
    """
    file_name = join(r'./results/', file_name)
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    with open(file_name, method) as f:
        f.write(bo)


def adjust(inpath, outpath):
    """Remove superfluous line breaks from ``./results/<inpath>``.

    A line at least as long as the median line length has its trailing
    newline stripped, so it is glued to the following line. Shorter lines are
    kept unchanged, except a ``-----------`` separator line, which is dropped.
    The result is written UTF-8 encoded to ``./results/<outpath>``.
    """
    inpath = join(r'./results/', inpath)
    with open(inpath, encoding='utf-8') as f:
        lines = f.readlines()
    # Median line length; guard the empty file so np.median does not warn.
    length = np.median([len(line) for line in lines]) if lines else 0
    pieces = []
    for line in lines:
        if len(line) >= length and line[-1] == '\n':
            pieces.append(line[:-1])  # strip the trailing line break
        elif line == '-----------\n':
            continue
        else:
            pieces.append(line)
    write_file(''.join(pieces).encode('utf-8'), outpath)


def rm(inpath):
    """Delete ``./results/<inpath>``."""
    inpath = join(r'./results/', inpath)
    os.remove(inpath)


def main():
    """Convert every PDF below ./resourses and print a success/error summary."""
    su_count = 0
    er_count = 0
    count = 0
    target_path = r'./resourses'
    for base_path, _folder_list, file_list in os.walk(target_path):
        for file_name in file_list:
            if os.path.splitext(file_name)[1].lower() != '.pdf':  # not a PDF file
                continue
            # Path relative to target_path, so PDFs found in subfolders by
            # os.walk resolve correctly inside convert().
            pdffile = os.path.relpath(join(base_path, file_name), target_path)
            # Swap only the extension; str.replace('pdf', ...) would also
            # rewrite any other "pdf" inside the name.
            stem = os.path.splitext(pdffile)[0]
            tmpfile = stem + '.tmp'
            txtfile = stem + '.txt'
            count += 1
            try:
                bo = convert(pdffile).encode('utf-8')
                write_file(bo, tmpfile)
                adjust(tmpfile, txtfile)
                su_count += 1
                print(count, "-->", file_name, " success!\n ")
            except Exception as e:
                # Batch boundary: report the cause and continue with the next file.
                er_count += 1
                print(count, "-->", file_name, " error!\n ", e)
            # Drop the intermediate file whether or not the conversion succeeded.
            if isfile(join(r'./results/', tmpfile)):
                rm(tmpfile)
    print("\ncount: ", count, "\n", "success: ", su_count, "\n", "error: ", er_count)


if __name__ == '__main__':
    main()
感谢乐于在网络上分享的大神们,终于不用一篇一篇转了!
链接在这:
链接:https://pan.baidu.com/s/1QW3XMAvf8qJlaLHxmUBEXg
提取码:tw95
使用的小伙伴记得提前看README.md,我踩过的坑你们一定不能再踩了,挥泪~~