PDF文本转换
pdfminer
命令
1 2 | !pip install pdfminer.six !python /usr/local/bin/pdf2txt.py -o data.txt data.pdf |
Python
from pdfminer.high_level import extract_text

# Read data.pdf and keep its extracted text in `text`.
text = extract_text(
    "data.pdf",
)
pdfbox
命令
1 2 3 4 5 6 7 8 | # 最新バージョンでURLが変わる !wget https://www-eu.apache.org/dist/pdfbox/2.0.21/pdfbox-app-2.0.21.jar -O pdfbox-app.jar # テキスト変換 !java -jar pdfbox-app.jar ExtractText -sort -encoding UTF-8 data.pdf # 画像 !java -jar pdfbox-app.jar PDFToImage -imageType png -dpi 300 data.pdf |
Python
1 2 3 4 5 6 | !pip install python-pdfbox import pdfbox p = pdfbox.PDFBox() p.extract_text("data.pdf", sort=True) |
Poppler
命令
1 2 3 4 5 6 7 8 9 | !apt install poppler-utils poppler-data # テキスト変換 !pdftotext -layout data.pdf !pdfinfo data.pdf # リペア !pdftocairo -pdf data.pdf data_repaired.pdf |
PDF表格转换
Tabula
命令
1 2 3 4 5 6 7 | !wget https://github.com/tabulapdf/tabula-java/releases/download/v1.0.4/tabula-1.0.4-jar-with-dependencies.jar -O tabula.jar # lattice !java -jar tabula.jar -o data.csv -p all -l data.pdf # stream !java -jar tabula.jar -o data.csv -p all -t data.pdf |
Python
1 2 3 4 5 6 7 8 | !pip install tabula-py import pandas as pd from tabula import read_pdf dfs = read_pdf("data.pdf", pages="all", lattice=True) dfs = read_pdf("data.pdf", pages="all", lattice=True, pandas_options={"header": None}) |
Camelot
命令
1 2 3 4 5 6 7 8 9 10 | !apt install python3-tk ghostscript !pip install camelot-py[cv] # !pip install camelot-py[plot] # !camelot --help !camelot -p all -o data.csv -f csv lattice data.pdf !camelot -p all -o data.csv -f csv -strip ' .\n' -split lattice -scale 40 data.pdf |
Python
import camelot

# Read the tables on every page of data.pdf with text splitting,
# stripping of " " and "\n", and a line scale of 40.
tables = camelot.read_pdf(
    "data.pdf",
    pages="all",
    split_text=True,
    strip_text=" \n",
    line_scale=40,
)
pdfplumber
1 2 | !pip install pdfplumber !apt install libmagickwand-dev ghostscript |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 | # PDFを画像変換できるように/etc/ImageMagick-6/policy.xmlを上書き %%writefile /etc/ImageMagick-6/policy.xml <?xml version="1.0" encoding="UTF-8"?> <!DOCTYPE policymap> <policymap> <policy domain="resource" name="memory" value="256MiB"/> <policy domain="resource" name="map" value="512MiB"/> <policy domain="resource" name="width" value="16KP"/> <policy domain="resource" name="height" value="16KP"/> <policy domain="resource" name="area" value="128MB"/> <policy domain="resource" name="disk" value="1GiB"/> <policy domain="delegate" rights="none" pattern="URL"/> <policy domain="delegate" rights="none" pattern="HTTPS"/> <policy domain="delegate" rights="none" pattern="HTTP"/> <policy domain="path" rights="none" pattern="@*"/> <policy domain="cache" name="shared-secret" value="passphrase" stealth="true"/> <policy domain="coder" rights="none" pattern="PS"/> <policy domain="coder" rights="none" pattern="PS2"/> <policy domain="coder" rights="none" pattern="PS3"/> <policy domain="coder" rights="none" pattern="EPS"/> <policy domain="coder" rights="read|write" pattern="PDF" /> <policy domain="coder" rights="none" pattern="XPS"/> </policymap> |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 | !pdfplumber < data.pdf > data.csv import pdfplumber import pandas as pd pdf = pdfplumber.open("data.pdf") page = pdf.pages[0] page.find_tables()[0] # 文字の位置確認 page.chars # cropでテキスト取得 crop = page.within_bbox((0, 90, page.width, 105)) s = crop.extract_text() s # PDF確認 im = page.to_image() im table_settings = { # 垂直基準 "vertical_strategy": "lines", # 垂直区切を数値指定(リスト) "explicit_vertical_lines": [], # 水平基準 "horizontal_strategy": "lines", # 水平区切を数値指定(リスト) "explicit_horizontal_lines": [], # 許容範囲内の場合、同じ水平位置または垂直位置に調整 ※上に調整? "snap_tolerance": 3, # 許容範囲内の場合、結合 "join_tolerance": 3, # テーブルの再構築を試みる前に破棄されるよりも短いエッジ? "edge_min_length": 3, # 最小の文字の高さ "min_words_vertical": 3, # 最小の文字の高さ "min_words_horizontal": 1, # 空白文字を単語の一部と認識し、区切り文字にしない "keep_blank_chars": False, # 文字の間隔が以下の場合、単語と認識 "text_tolerance": 3, "text_x_tolerance": None, "text_y_tolerance": None, # テキストの左端と右端が垂直線と完全に一致していない場合の許容値? "intersection_tolerance": 3, "intersection_x_tolerance": None, "intersection_y_tolerance": None, } # 文字確認 im.reset().draw_rects(page.extract_words()) # テーブル確認 im.reset().debug_tablefinder() with pdfplumber.open("data.pdf") as pdf: dfs = [] for page in pdf.pages: table = page.extract_table(table_settings) df_tmp = pd.DataFrame(table[1:], columns=table[0]) dfs.append(df_tmp) df = pd.concat(dfs) |
光学字符识别
tesseract-ocr
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 | !add-apt-repository ppa:alex-p/tesseract-ocr -y !apt update !apt install tesseract-ocr !apt install libtesseract-dev !tesseract -v !apt install tesseract-ocr-jpn tesseract-ocr-jpn-vert !apt install tesseract-ocr-script-jpan tesseract-ocr-script-jpan-vert !tesseract --list-langs !pip install pytesseract try: from PIL import Image except ImportError: import Image import pytesseract import cv2 import numpy as np from google.colab.patches import cv2_imshow img = cv2.imread("test.jpg") # 白黒反転 img_gray, _ = cv2.decolor(img) cv2_imshow(img_gray) |