使用python实现pdf2txt

本文的方法主要实现批处理pdf2txt。强推方法二！！！

方法一：使用pdfminer3k

参考来自GitHub的代码。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172

######################################
# tesseract OCR

from PIL import Image
import pytesseract

def img_to_str_tesseract(image_path, lang='chi_sim'):
    """Run local Tesseract OCR on an image file and return the result.

    Args:
        image_path: Path of the image file to recognize.
        lang: Tesseract language code handed to pytesseract
            (defaults to ``'chi_sim'``).

    Returns:
        Whatever ``pytesseract.image_to_string`` returns for the image.
    """
    # Open the image in a context manager so its file handle is released as
    # soon as OCR finishes; the batch job calls this once per embedded image.
    with Image.open(image_path) as img:
        return pytesseract.image_to_string(img, lang)

######################################
# 百度 OCR

from aip import AipOcr

# Keyword arguments for the Baidu OCR client. All three values are empty
# placeholders here -- presumably they must be filled in with real
# credentials before the online OCR path is used (confirm against the
# Baidu AipOcr documentation).
config = {
'appId': '',
'apiKey': '',
'secretKey': ''
}
# Module-level Baidu OCR client, built once from `config` when the module
# is loaded; img_to_str_baidu() sends its requests through it.
client = AipOcr(**config)

def img_to_str_baidu(image_path):
    """Recognize the text in an image through the Baidu OCR client.

    The image bytes are sent to ``client.basicGeneral``. When the response
    contains a ``'words_result'`` entry, the ``'words'`` value of every item
    is joined with newlines; otherwise an empty string is returned.
    """
    with open(image_path, 'rb') as image_file:
        payload = image_file.read()

    response = client.basicGeneral(payload)
    if 'words_result' not in response:
        return ""
    return '\n'.join(entry['words'] for entry in response['words_result'])

######################################
# 解析PDF文件

from pdfminer.pdftypes import LITERALS_DCT_DECODE, LITERALS_FLATE_DECODE
from pdfminer.pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB
from pdfminer.pdfparser import PDFParser,PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTFigure, LTImage, LTChar, LTTextLine
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
import os
import sys
import numpy as np
import importlib
importlib.reload(sys)

# Scratch directory for images extracted from the PDF: parse() wipes and
# recreates it, and parse_section() passes it to write_image().
TMPDIR = 'tmp/'
# When true, parse_section() saves LTImage elements and runs OCR on them.
PARSEIMG = True
# OCR backend switch used by parse_section():
# True -> img_to_str_baidu, False -> img_to_str_tesseract.
OCR_ONLINE = False

def write_image(image, outdir):
    """Save an image extracted from the PDF into *outdir*.

    The file extension and payload are chosen from the image stream:

    * exactly one filter and it is a DCT filter -> raw stream data, ``.jpg``
    * DeviceRGB colorspace  -> BMP built from the decoded data, ``.bmp``
    * DeviceGray colorspace -> BMP built from the decoded data, ``.bmp``
    * anything else         -> decoded stream data, ``.img``

    Args:
        image: Layout image object exposing ``stream``, ``colorspace``,
            ``width``, ``height`` and ``name``.
        outdir: Directory the image file is written to.

    Returns:
        A ``(path, size)`` tuple: the path of the written file and the
        number of bytes written.
    """
    stream = image.stream
    filters = stream.get_filters()
    # NOTE(review): create_bmp is not defined in the visible part of this
    # file; the RGB/gray branches rely on it being provided elsewhere.
    if len(filters) == 1 and filters[0] in LITERALS_DCT_DECODE:
        ext = '.jpg'
        data = stream.get_rawdata()
    elif image.colorspace is LITERAL_DEVICE_RGB:
        ext = '.bmp'
        data = create_bmp(stream.get_data(), stream.bits*3, image.width, image.height)
    elif image.colorspace is LITERAL_DEVICE_GRAY:
        ext = '.bmp'
        data = create_bmp(stream.get_data(), stream.bits, image.width, image.height)
    else:
        ext = '.img'
        data = stream.get_data()
    name = image.name+ext
    path = os.path.join(outdir, name)
    # Context manager guarantees the handle is closed even if write() fails.
    with open(path, 'wb') as fp:
        fp.write(data)
    return path, len(data)

def write_file(path, text, ftype, debug=False):
    """Write *text* to *path*, opening the file with mode *ftype*.

    When *debug* is true, the length of *text* is printed before writing.
    """
    with open(path, ftype) as handle:
        if debug:
            print("write", len(text))
        handle.write(text)

def adjust(inpath, outpath):
    """Remove superfluous line breaks from *inpath* and write to *outpath*.

    A line at least as long as the median line length is treated as a
    hard-wrapped line: its trailing newline is dropped so that it joins the
    following line. Separator lines (``'-----------\\n'``) are discarded.
    All other lines are copied unchanged.

    Args:
        inpath: Path of the text file to clean up.
        outpath: Path the cleaned text is written to (via ``write_file``).
    """
    # Read everything up front and close the input right away.
    with open(inpath) as f:
        lines = f.readlines()

    pieces = []
    if lines:  # np.median of an empty list would be NaN plus a warning
        length = np.median([len(line) for line in lines])  # median line length
        for line in lines:
            # Test for the separator first: when the median length is 12 or
            # less, the separator itself would otherwise count as a "long"
            # line and leak into the output.
            if line == '-----------\n':
                continue
            if len(line) >= length and line[-1] == '\n':
                pieces.append(line[:-1])  # drop the trailing newline
            else:
                pieces.append(line)
    write_file(outpath, ''.join(pieces), 'w')
    return

def parse_section(layout, outpath, debug = False):
    """Append the content of every element in *layout* to *outpath*.

    * Horizontal text boxes: their text is appended as-is.
    * Figures: processed recursively.
    * Images (only when PARSEIMG is set): saved into TMPDIR; if the saved
      file is non-empty, it is OCR'd with the backend selected by
      OCR_ONLINE, and the recognized text plus a separator line are
      appended.

    Elements of any other type are ignored.
    """
    for element in layout:
        if isinstance(element, LTTextBoxHorizontal):  # text
            write_file(outpath, element.get_text(), 'a')
            continue
        if isinstance(element, LTFigure):
            parse_section(element, outpath)
            continue
        if not (isinstance(element, LTImage) and PARSEIMG):
            continue

        # image
        path, length = write_image(element, TMPDIR)
        if not length > 0:
            continue
        recognize = img_to_str_baidu if OCR_ONLINE else img_to_str_tesseract
        write_file(outpath, recognize(path), 'a')
        write_file(outpath, '\n' + '-----------' + '\n', 'a')

def remove(path):
    """Delete *path*, whether it is a file, a symlink or a directory tree.

    A missing path is silently ignored. Symbolic links are unlinked and
    never followed, so a link that points at a directory outside the tree
    does not get its target's contents deleted, and a dangling link does
    not block removal of the directory that contains it.

    Args:
        path: File, symlink or directory to delete.
    """
    # lexists() also reports dangling symlinks, which exists() would hide.
    if not os.path.lexists(path):
        return
    # Anything that is not a real directory (including links to
    # directories) is removed as a single entry.
    if os.path.islink(path) or not os.path.isdir(path):
        os.remove(path)
        return
    for entry in os.listdir(path):
        remove(os.path.join(path, entry))
    os.rmdir(path)

def parse(inpath, outpath):
    """Extract the content of the PDF at *inpath* into the file *outpath*.

    The scratch directory TMPDIR is wiped and recreated, any existing
    *outpath* is deleted, and then every page is laid out and handed to
    ``parse_section``, which appends its content to *outpath*.

    Args:
        inpath: Path of the PDF file to read.
        outpath: Path of the text file to (re)create.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is not
            extractable.
    """
    remove(TMPDIR)   # clear the scratch directory
    os.mkdir(TMPDIR)
    remove(outpath)  # clear a previous output file

    # Keep the PDF open for the whole run (pages are read while iterating)
    # and close it on every exit path, including errors.
    with open(inpath, 'rb') as fp:
        parser = PDFParser(fp)       # PDF parser
        doc = PDFDocument(parser)    # PDF document object
        parser.set_document(doc)     # link parser and document
        doc.set_parser(parser)
        doc.initialize("")
        if not doc.is_extractable:   # does the PDF allow text extraction?
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()  # shared resource manager
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)  # page interpreter

        for idx, page in enumerate(doc.get_pages()):
            interpreter.process_page(page)
            layout = device.get_result()
            print("parse", idx)
            parse_section(layout, outpath)

if __name__ == '__main__':
    # Batch mode: convert every PDF found under the directory given as the
    # first command-line argument. Each .txt is written next to its PDF.
    if len(sys.argv) < 2:
        sys.exit('usage: {} <directory containing PDF files>'.format(sys.argv[0]))
    target_path = sys.argv[1]
    for base_path, folder_list, file_list in os.walk(target_path):
        for file_name in file_list:
            stem, ext = os.path.splitext(file_name)
            if ext.lower() != '.pdf':
                # not a PDF file
                continue
            # Build paths from the directory os.walk is currently visiting so
            # PDFs outside the working directory are found, and swap only the
            # extension (str.replace would rewrite every "pdf" in the name).
            pdffile = os.path.join(base_path, file_name)
            tmpfile = os.path.join(base_path, stem + '.tmp')
            txtfile = os.path.join(base_path, stem + '.txt')
            try:
                parse(pdffile, tmpfile)
                adjust(tmpfile, txtfile)
            except Exception as e:
                # Keep the batch going, but say why this file failed.
                print(file_name, " error!", repr(e))
            finally:
                remove(tmpfile)  # delete the tmp file, even after a failure

笔者根据自己的需要进行过调整。

！坑

一定不要安装pdfminer，要装的是pdfminer3k啊大兄弟。如果不幸已经安装了pdfminer，可以先uninstall pdfminer和pdfminer3k，然后再install pdfminer3k；如果安装时提示有已经存在的包，就根据提示把这些包全部删掉，再重新install pdfminer3k。

跪求大神解答为何pdfminer和pdfminer3k会导包混乱

方法二：借助xpdf

参考自知乎，根据自己的需要和pdfminer3k代码进行优化：

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68

import numpy as np
import os
import subprocess
from os.path import isfile,join

# Path of the xpdf pdftotext executable that convert() runs.
ef = r'./xpdf/pdftotext.exe'
# Path of the xpdf configuration file, passed to pdftotext with -cfg.
cfg = r'./xpdf/xpdfrc'

def convert(file_name_pdf):
    """Run pdftotext on a PDF from ``./resourses`` and return its text.

    The tool is invoked for pages 1 to 1000 in raw mode with the configured
    xpdfrc, writing to stdout (``-``); the captured bytes are decoded as
    UTF-8.
    """
    pdf_path = join(r'./resourses', file_name_pdf)
    # Original author's note: every file argument in this command must be a
    # full path, otherwise the call fails.
    command = [ef, '-f', '1', '-l', '1000', '-cfg', cfg, '-raw', pdf_path, '-']
    raw_output = subprocess.check_output(command)
    return raw_output.decode('utf-8')

def write_file(bo, file_name, method="wb"):
    """Write *bo* to ``./results/<file_name>``, opened with mode *method*."""
    target = join(r'./results/', file_name)
    with open(target, method) as out:
        out.write(bo)

def adjust(inpath, outpath):
    """Remove superfluous line breaks from ``./results/<inpath>``.

    A line at least as long as the median line length is treated as a
    hard-wrapped line: its trailing newline is dropped so that it joins the
    following line. Separator lines (``'-----------\\n'``) are discarded.
    All other lines are copied unchanged. The result is UTF-8 encoded and
    handed to ``write_file`` under the name *outpath*.

    Args:
        inpath: Name of the UTF-8 text file inside ``./results/`` to read.
        outpath: File name passed on to ``write_file`` for the output.
    """
    inpath = join(r'./results/', inpath)
    # Read everything up front and close the input right away.
    with open(inpath, encoding='utf-8') as f:
        lines = f.readlines()

    pieces = []
    if lines:  # np.median of an empty list would be NaN plus a warning
        length = np.median([len(line) for line in lines])  # median line length
        for line in lines:
            # Test for the separator first: when the median length is 12 or
            # less, the separator itself would otherwise count as a "long"
            # line and leak into the output.
            if line == '-----------\n':
                continue
            if len(line) >= length and line[-1] == '\n':
                pieces.append(line[:-1])  # drop the trailing newline
            else:
                pieces.append(line)
    write_file(''.join(pieces).encode('utf-8'), outpath)

def rm(inpath):
    """Delete the file named *inpath* from the ``./results/`` directory."""
    os.remove(join(r'./results/', inpath))

if __name__ == '__main__':
    # Batch mode: convert every PDF under ./resourses into a .txt file in
    # ./results/ and report per-file status plus a final tally.
    su_count = 0
    er_count = 0
    count = 0
    target_path = r'./resourses'
    # write_file() writes into ./results/; make sure it exists.
    os.makedirs(r'./results/', exist_ok=True)
    # NOTE: os.walk also descends into sub-directories, but convert()
    # resolves names against ./resourses only, so PDFs in nested folders
    # end up counted as errors.
    for base_path, folder_list, file_list in os.walk(target_path):
        for file_name in file_list:
            stem, ext = os.path.splitext(file_name)
            if ext.lower() != '.pdf':
                # not a PDF file
                continue
            count += 1
            try:
                pdffile = file_name
                # Swap only the extension; str.replace('pdf', ...) would
                # rewrite every "pdf" occurring in the file name.
                tmpfile = stem + '.tmp'
                txtfile = stem + '.txt'
                bo = convert(pdffile).encode('utf-8')
                write_file(bo, tmpfile)
                adjust(tmpfile, txtfile)
                rm(tmpfile)
                su_count += 1
                print(count,"-->",file_name," success!\n ")
            except Exception as e:
                er_count += 1
                # Keep the batch going, but say why this file failed.
                print(count,"-->",file_name," error!\n ", repr(e))

    print("\ncount: ",count,"\n","success: ",su_count,"\n","error: ",er_count)

感谢乐于在网络上分享的大神们，终于不用一篇一篇转了！

链接在这：
链接：https://pan.baidu.com/s/1QW3XMAvf8qJlaLHxmUBEXg
提取码：tw95

使用的小伙伴记得提前看README.md，我踩过的坑你们一定不能再踩了，挥泪~~