Snippets registered in Google Colaboratory (PDF text conversion)

PDF text conversion

pdfminer

command

!pip install pdfminer.six
!python /usr/local/bin/pdf2txt.py -o data.txt data.pdf

Python

from pdfminer.high_level import extract_text

text = extract_text("data.pdf")

pdfbox

command

#NOTE: the download URL changes with every PDFBox release; replace the version number with the latest one
!wget https://www-eu.apache.org/dist/pdfbox/2.0.21/pdfbox-app-2.0.21.jar -O pdfbox-app.jar

#Text conversion
!java -jar pdfbox-app.jar ExtractText -sort -encoding UTF-8 data.pdf

#Convert each page to an image
!java -jar pdfbox-app.jar PDFToImage -imageType png -dpi 300 data.pdf

Python

!pip install python-pdfbox

import pdfbox

p = pdfbox.PDFBox()
p.extract_text("data.pdf", sort=True)

poppler

command

!apt install poppler-utils poppler-data

#Text conversion
!pdftotext -layout data.pdf

!pdfinfo data.pdf

#repair
!pdftocairo -pdf data.pdf data_repaired.pdf

PDF table conversion

tabula

command

!wget https://github.com/tabulapdf/tabula-java/releases/download/v1.0.4/tabula-1.0.4-jar-with-dependencies.jar -O tabula.jar

# lattice
!java -jar tabula.jar -o data.csv -p all -l data.pdf

# stream
!java -jar tabula.jar -o data.csv -p all -t data.pdf

Python

!pip install tabula-py

import pandas as pd
from tabula import read_pdf

dfs = read_pdf("data.pdf", pages="all", lattice=True)

dfs = read_pdf("data.pdf", pages="all", lattice=True, pandas_options={"header": None})

Camelot

command

!apt install python3-tk ghostscript
!pip install camelot-py[cv]

# !pip install camelot-py[plot]

# !camelot --help

!camelot -p all -o data.csv -f csv lattice data.pdf

!camelot -p all -o data.csv -f csv -strip ' .\n' -split lattice -scale 40 data.pdf

Python

import camelot

tables = camelot.read_pdf("data.pdf", pages="all", split_text=True, strip_text=" \n", line_scale=40)

pdfplumber

!pip install pdfplumber
!apt install libmagickwand-dev ghostscript
#Overwrite /etc/ImageMagick-6/policy.xml so that PDFs can be converted to images
%%writefile /etc/ImageMagick-6/policy.xml
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE policymap>

<policymap>

  <policy domain="resource" name="memory" value="256MiB"/>
  <policy domain="resource" name="map" value="512MiB"/>
  <policy domain="resource" name="width" value="16KP"/>
  <policy domain="resource" name="height" value="16KP"/>
  <policy domain="resource" name="area" value="128MB"/>
  <policy domain="resource" name="disk" value="1GiB"/>

  <policy domain="delegate" rights="none" pattern="URL"/>
  <policy domain="delegate" rights="none" pattern="HTTPS"/>
  <policy domain="delegate" rights="none" pattern="HTTP"/>

  <policy domain="path" rights="none" pattern="@*"/>
  <policy domain="cache" name="shared-secret" value="passphrase" stealth="true"/>

  <policy domain="coder" rights="none" pattern="PS"/>
  <policy domain="coder" rights="none" pattern="PS2"/>
  <policy domain="coder" rights="none" pattern="PS3"/>
  <policy domain="coder" rights="none" pattern="EPS"/>
  <policy domain="coder" rights="read|write" pattern="PDF" />
  <policy domain="coder" rights="none" pattern="XPS"/>
</policymap>
!pdfplumber < data.pdf > data.csv

import pdfplumber
import pandas as pd

pdf = pdfplumber.open("data.pdf")

page = pdf.pages[0]

page.find_tables()[0]

#Check the position of each character
page.chars

#Get the text inside a cropped region of the page
#The bounding box is (x0, top, x1, bottom); page.width spans the full page width
crop = page.within_bbox((0, 90, page.width, 105))

s = crop.extract_text()
s

#Render the page as an image to check it visually
im = page.to_image()
im

#Settings dict passed to page.extract_table() for table detection
table_settings = {

    #Strategy used to find the vertical cell boundaries
    "vertical_strategy": "lines",
    #Explicit x positions of vertical boundaries (list of numbers)
    "explicit_vertical_lines": [],

    #Strategy used to find the horizontal cell boundaries
    "horizontal_strategy": "lines",
    #Explicit y positions of horizontal boundaries (list of numbers)
    "explicit_horizontal_lines": [],

    #Parallel lines within this distance are snapped to the same horizontal or vertical position
    "snap_tolerance": 3,

    #Line segments on the same line whose ends are within this distance are joined into one segment
    "join_tolerance": 3,

    #Edges shorter than this are discarded before attempting to reconstruct the table
    "edge_min_length": 3,

    #Minimum number of words that must share the same alignment (used with the "text" vertical strategy)
    "min_words_vertical": 3,

    #Minimum number of words that must share the same alignment (used with the "text" horizontal strategy)
    "min_words_horizontal": 1,

    #If True, whitespace is treated as part of a word instead of as a word delimiter
    #NOTE(review): newer pdfplumber releases may expect this key as "text_keep_blank_chars" - confirm against the installed version
    "keep_blank_chars": False,

    #Characters no further apart than this are grouped into the same word
    #(the x/y variants apply the tolerance per axis)
    "text_tolerance": 3,
    "text_x_tolerance": None,
    "text_y_tolerance": None,
    
    #Orthogonal edges must be within this distance of each other to be considered intersecting
    #(the x/y variants apply the tolerance per axis)
    "intersection_tolerance": 3,
    "intersection_x_tolerance": None,
    "intersection_y_tolerance": None,
}

#Draw rectangles around the extracted words to check them
im.reset().draw_rects(page.extract_words())

#Check the table
im.reset().debug_tablefinder()

#Extract the table from every page and combine the results into one DataFrame
with pdfplumber.open("data.pdf") as pdf:

    dfs = []

    for page in pdf.pages:

        table = page.extract_table(table_settings)

        #extract_table returns None when no table is found on the page; skip such pages
        if not table:
            continue

        #The first row is used as the header, the remaining rows as data
        df_tmp = pd.DataFrame(table[1:], columns=table[0])

        dfs.append(df_tmp)

#pd.concat raises ValueError on an empty list, so fall back to an empty DataFrame
df = pd.concat(dfs) if dfs else pd.DataFrame()

OCR

tesseract-ocr

!add-apt-repository ppa:alex-p/tesseract-ocr -y
!apt update
!apt install tesseract-ocr
!apt install libtesseract-dev
!tesseract -v

!apt install tesseract-ocr-jpn  tesseract-ocr-jpn-vert
!apt install tesseract-ocr-script-jpan tesseract-ocr-script-jpan-vert
!tesseract --list-langs
!pip install pytesseract

try:
    from PIL import Image
except ImportError:
    import Image
    
import pytesseract

import cv2
import numpy as np

from google.colab.patches import cv2_imshow

img = cv2.imread("test.jpg")

#cv2.imread returns None instead of raising when the file cannot be read
if img is None:
    raise FileNotFoundError("test.jpg")

#Convert to grayscale (decolor also returns a color-boosted image, unused here)
img_gray, _ = cv2.decolor(img)

cv2_imshow(img_gray)

Recommended Posts

Snippets registered in Google Colaboratory (PDF text conversion)
Snippets (scraping) registered in Google Colaboratory
Google colaboratory
Use cartopy without bugs in Google Colaboratory
Put text scraped in Python into Google Sheets
How to load files in Google Drive with Google Colaboratory
How to use Spacy Japanese model in Google Colaboratory
I can't use the darknet command in Google Colaboratory!