import re
import zipfile
import urllib.request
import os.path
import glob
re
: Short for "regular expression"; a module for working with regular expressions.
zipfile
: A module for working with zip files.
glob
: A module for getting file path names.

Here, Kenji Miyazawa's "Night on the Galactic Railroad" is used as the material.
URL = 'https://www.aozora.gr.jp/cards/000081/files/43737_ruby_19028.zip'
def download(URL):
    """Download a zip archive, unpack it and return a text file inside it.

    The archive is saved in the current working directory under the last
    path segment of ``URL``, extracted into a directory named after the
    archive (file name without its extension), and then deleted.

    Args:
        URL: Address of the zip archive to fetch.

    Returns:
        Path of the first ``*.txt`` file (in sorted order) found directly
        inside the extraction directory.

    Raises:
        IndexError: If the extraction directory contains no ``.txt`` file.
    """
    zip_file = re.split(r'/', URL)[-1]  # ➀ last URL segment = archive name
    urllib.request.urlretrieve(URL, zip_file)  # ➁ save the archive locally
    extract_dir = os.path.splitext(zip_file)[0]  # ➂ name without extension
    try:
        with zipfile.ZipFile(zip_file) as zip_object:  # ➃
            zip_object.extractall(extract_dir)  # ➄
    finally:
        # ➅ Remove the downloaded archive even when extraction fails, so a
        # broken download is not left behind in the working directory.
        os.remove(zip_file)
    pattern = os.path.join(extract_dir, '*.txt')  # ➆
    # ➇ glob returns matches in arbitrary order; sort so the pick is stable.
    txt_paths = sorted(glob.glob(pattern))
    if not txt_paths:
        # Same exception type that indexing an empty list raises, but with a
        # message that says what actually went wrong.
        raise IndexError('no .txt file found in {!r}'.format(extract_dir))
    return txt_paths[0]  # ➈
**1) Download the zip file**
`re.split()`
: Splits the URL string at `/` and takes the last element, the zip file name "43737_ruby_19028.zip".
`os.path.splitext()`
: Splits the zip file name at the dot (".") to get the file name without the extension, which is used as the directory name.

**2) Unzip and save the zip file**
`zipfile.ZipFile()`
: Reads the previously saved zip file and creates a zip object.
`extractall()`
: Extracts all the contents of the zip object into that directory.

**3) Get the path of the saved file**
`os.path.join()`
: Generates the path string for the text files in that directory.
`glob.glob()`
: Collects the names of all text files in the directory into a list.
`list[0]`
# : Returns the path of the first file in the list.
def convert(download_text):
    """Extract the plain body text from a downloaded Aozora Bunko text file.

    The file is read as bytes and decoded as Shift_JIS. The body is the
    third section when the text is split on runs of five or more hyphens;
    it is then cut at the bibliographic footer and at the first page-break
    annotation, and the ruby/annotation markup is stripped.

    Args:
        download_text: Path of the text file to read.

    Returns:
        The body text as one string, with ruby readings, bracketed
        annotations, ruby start markers, CRLF line breaks and full-width
        spaces removed.

    Raises:
        IndexError: If the text contains fewer than two runs of five or
            more hyphens, so there is no third section.
        UnicodeDecodeError: If the file is not valid Shift_JIS.
    """
    with open(download_text, 'rb') as f:  # ➀ handle is closed on exit
        data = f.read()
    text = data.decode('shift_jis')  # ➁

    # NOTE(review): the markers below are the full-width forms used in
    # Aozora Bunko text files (confirm against the downloaded file). Their
    # ASCII look-alikes are regex syntax, not literals: '[#...]' is a
    # character class that matches single characters, and '|' is an empty
    # alternation that removes nothing.

    # Text extraction
    text = re.split(r'\-{5,}', text)[2]  # ➂ third hyphen-delimited section
    text = re.split(r'底本：', text)[0]  # ➃ drop the source-edition footer
    text = re.split(r'［＃改ページ］', text)[0]  # ➄ stop at first page break

    # Noise removal
    text = re.sub(r'《.+?》', '', text)  # ➅ ruby readings
    text = re.sub(r'［＃.+?］', '', text)  # ➆ bracketed annotations
    text = re.sub(r'｜', '', text)  # ➇ start-of-ruby marker
    text = re.sub(r'\r\n', '', text)  # ➈ line breaks
    text = re.sub(r'\u3000', '', text)  # ➉ full-width spaces
    return text
**1) Read the file**
`open(download_text, 'rb')`
: Reads the file in `'rb'` (binary) mode.
`decode('shift_jis')`
: Decodes the bytes as shift_jis to get the text.

**2) Extract the body text with re.split()**
`(r'\-{5,}', text)[2]`
: Uses runs of five or more hyphens ("-") as the delimiter (the hyphens themselves are removed) and takes the third element.
`(r'base:', text)[0]`
: Uses "base:" as the delimiter (removing it) and takes the first element.
`(r'[# page break]', text)[0]`
: Uses "[# page break]" as the delimiter (removing it) and takes the first element.

**3) Noise removal (replacement) with re.sub()**
`'《.+?》'`
: 《Ruby》
`'[#.+?]'`
: [Note]
`'|'`
: Start position of a character string with ruby
`'\r\n'`
: Line feed code
`'\u3000'`
: Full-width space

download_file = download(URL)
# Run convert() on the downloaded file and display the resulting text.
text = convert(download_file)
print(text)
!apt install aptitude
!aptitude install mecab libmecab-dev mecab-ipadic-utf8 git make curl xz-utils file -y
!pip install mecab-python3==0.7
Create `MeCab.Tagger()` with the argument `-Owakati` and then call its `parse()` method.

import MeCab
# Build a tagger with the "-Owakati" option and run it over the cleaned text;
# the parsed result replaces `text`.
# NOTE(review): presumably the output is space-separated words, since the next
# step splits on whitespace; confirm against the MeCab documentation.
mecab = MeCab.Tagger("-Owakati")
text = mecab.parse(text)
print(text)
`split()` splits the string with spaces as delimiters.

separated_text = text.split()
print(separated_text)
# Save the text to output.txt. The encoding is given explicitly so the
# Japanese text is written the same way whatever the platform default is.
with open('output.txt', 'w', encoding='utf-8') as f:
    f.write(text)
This writes `text` to a file called `'output.txt'`. The argument `'w'` is the write mode specification.

from google.colab import files
files.download('output.txt')
`files` is a module for uploading or downloading files between Colaboratory and your local PC.

Recommended Posts