Extracting text from the judgment-criteria image in Hyogo Prefecture's new coronavirus infection information
The "Currently it is a special period of infection spread" notice looks like text, but it is actually an image, so the text has to be extracted with OCR.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
# Fetch the Hyogo prefecture top page, locate the alert image, and save it locally.
url = "https://web.pref.hyogo.lg.jp/index.html"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}
# Upper bound (in seconds) per request so a stalled server cannot hang the run.
timeout = 30

r = requests.get(url, headers=headers, timeout=timeout)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

# First <img> that is a direct child of a <p> directly under div#tmp_contents.
tag = soup.select_one("div#tmp_contents > p > img")
if tag is None or not tag.get("src"):
    # Fail loudly when the page layout changed instead of crashing on None
    # or downloading the wrong resource.
    raise RuntimeError("alert image not found at 'div#tmp_contents > p > img'")

# The src attribute may be relative, so resolve it against the page URL.
link = urljoin(url, tag.get("src"))

r = requests.get(link, headers=headers, timeout=timeout)
r.raise_for_status()

# NOTE: the file name (including its trailing space) must stay identical to the
# name the OCR step passes to cv2.imread.
with open("alert.png ", mode="wb") as fw:
    fw.write(r.content)
OCR
# Environment setup (notebook shell commands): install Tesseract OCR, the
# Japanese language data, and the pytesseract Python wrapper.
# Register the alex-p/tesseract-ocr PPA as an additional package source.
!add-apt-repository ppa:alex-p/tesseract-ocr -y
# Refresh the package index so the newly added PPA is visible.
!apt update
# Install the Tesseract engine and its development files.
!apt install tesseract-ocr
!apt install libtesseract-dev
# Print the installed Tesseract version as a sanity check.
!tesseract -v
# Install the Japanese language/script data (horizontal and vertical variants).
!apt install tesseract-ocr-jpn tesseract-ocr-jpn-vert
!apt install tesseract-ocr-script-jpan tesseract-ocr-script-jpan-vert
# List the languages Tesseract can now use; "jpn" is the one requested later.
!tesseract --list-langs
# Install the Python wrapper used to call Tesseract from the notebook.
!pip install pytesseract
import pytesseract
import cv2
import numpy as np
from google.colab.patches import cv2_imshow
# Binarize the downloaded alert image and run Japanese OCR on it.

# Gray level separating "dark" from "light" pixels; shared by the pixel count
# and the binarization below so the two can never disagree.
threshold = 150

# cv2.imread returns None (it does not raise) when the file is missing or unreadable.
img_full = cv2.imread("alert.png ")
if img_full is None:
    raise FileNotFoundError("could not read image file 'alert.png '")

# A black border remains at the edge, so crop 10 px from every side.
img_bgr = img_full[10:-10, 10:-10]

# Convert to grayscale.
img_gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)

# Inspect the color of one sample pixel (notebook display expression).
img_bgr[10, 10]

# Show the grayscale image.
cv2_imshow(img_gray)

# Count dark and light pixels.
black = np.sum(img_gray <= threshold)
white = np.sum(img_gray > threshold)

# If dark pixels dominate, invert while binarizing; otherwise binarize as is.
mode = cv2.THRESH_BINARY_INV if white < black else cv2.THRESH_BINARY
ret, thresh = cv2.threshold(img_gray, threshold, 255, mode)

# Show the binarized image.
cv2_imshow(thresh)

# OCR with the Japanese model; "--psm 6" selects Tesseract page segmentation mode 6.
txt = pytesseract.image_to_string(thresh, lang="jpn", config="--psm 6").strip()

# Display the extracted text (notebook display expression).
txt
Recommended Posts