In my last web scraping code, I found it annoying that I could only scrape one page at a time. I also wanted to keep the urls I had used and stop renaming the output file every time. So this time I quickly improved those areas.
This time I added datetime to the libraries. It comes in handy because the saved file names for the daily ranking and so on change automatically with the date.
import re
import time
import datetime
import requests
import pandas as pd
from bs4 import BeautifulSoup
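For example, the date-stamped name used for the daily ranking later on can be built like this (a minimal sketch; the '_daily' suffix is just the one used in the list at the end):

# datetime.date.today() renders as 'YYYY-MM-DD', so this gives e.g. '2021-01-01_daily'
file_name = '{}_daily'.format(datetime.date.today())
print(file_name)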
I removed the deletion of alphanumeric characters and symbols from the parse function. Since it also erases the half-width spaces between English words, I decided that symbols are better handled later, during natural language processing.
# Get the website and output it in text format
def load(url):
    res = requests.get(url)
    # Raise HTTPError if the HTTP request returned a failed status code
    res.raise_for_status()
    # Get the response body in text format
    return res.text
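For a quick check, load can be pointed at any page, for example the daily ranking url used later (this needs network access):

# Fetch the daily ranking page and peek at the first characters
html = load('https://www.uta-net.com/user/ranking/daily.html')
print(html[:100])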
# Get html tags
def get_tag(html, find_tag):
    soup = BeautifulSoup(str(html), 'html.parser')
    tag = soup.find_all(find_tag)
    return tag
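Since get_tag casts its argument to str before parsing, it can be nested, e.g. pulling a elements out of each td. A minimal sketch with made-up markup:

# Find the link inside a td element (the markup is illustrative only)
sample_html = '<table><tr><td><a href="/song/12345/">test</a></td></tr></table>'
for td in get_tag(sample_html, 'td'):
    for a in get_tag(td, 'a'):
        print(a.get('href'))  # -> /song/12345/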
# Convert to a data structure that the program can handle
def parse(html):
    soup = BeautifulSoup(str(html), 'html.parser')
    # Remove html tags
    simple_row = soup.getText()
    #simple_row = simple_row.replace('\u3000', ' ')
    #simple_row = simple_row.replace('\n', ' ')
    # Delete alphanumeric characters (if needed)
    #simple_row = re.sub(r'[a-zA-Z0-9]', '', simple_row)
    # Delete symbols (if needed) *The catch is that the spaces between English words disappear when you use this
    #simple_row = re.sub(r'[<>♪`‘’“”・…_!?!-/:-@[-`{-~]', '', simple_row)
    # Delete the notice
    simple_row = re.sub(r'Note:.+', '', simple_row)
    # Full-width space to half-width (to keep the breaks in the lyrics)
    simple_row = simple_row.replace('\u3000', ' ')
    simple_row = simple_row.replace('\n', '')
    return simple_row
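A quick way to check parse is to feed it a small inline fragment; the markup below is made up for illustration (the full-width space is written as \u3000):

# The html tags and the 'Note:' part are stripped, and the full-width space becomes half-width
sample = '<div itemprop="text">Hello\u3000world<br/>Note: this part is removed</div>'
print(parse(sample))  # -> 'Hello world'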
I changed the function create_df a little.
# Acquire the information for each song
def get_info(url):
    base_url = 'https://www.uta-net.com/'
    html = load(url)
    # Store the url of each song
    song_url = []
    # Store one song
    song_info = []
    songs_info = []
    # Get the song urls
    # Store the urls found in td elements
    for td in get_tag(html, 'td'):
        # Get the a elements
        for a in get_tag(td, 'a'):
            # Whether the href attribute contains 'song'
            if 'song' in a.get('href'):
                # Add the url to the array
                song_url.append(base_url + a.get('href'))
    # Get the song information
    for i, page in enumerate(song_url):
        print('Song {}: {}'.format(i + 1, page))
        html = load(page)
        song_info = []
        #Song_Title
        for h2 in get_tag(html, 'h2'):
            # Cast to str once so the id search works
            h2 = str(h2)
            # Whether it is the class element that stores the lyrics
            if r'class="prev_pad"' in h2:
                # Remove unnecessary data
                simple_row = parse(h2)
                song_info.append(simple_row)
            else:
                for h2 in get_tag(html, 'h2'):
                    h2 = str(h2)
                    simple_row = parse(h2)
                    song_info.append(simple_row)
        #Artist
        for h3 in get_tag(html, 'h3'):
            h3 = str(h3)
            if r'itemprop="byArtist"' in h3:
                simple_row = parse(h3)
                song_info.append(simple_row)
        #Lyricist
        for h4 in get_tag(html, 'h4'):
            h4 = str(h4)
            if r'itemprop="lyricist"' in h4:
                simple_row = parse(h4)
                song_info.append(simple_row)
        #Composer
        for h4 in get_tag(html, 'h4'):
            h4 = str(h4)
            if r'itemprop="composer"' in h4:
                simple_row = parse(h4)
                song_info.append(simple_row)
        #Lyric
        for div in get_tag(html, 'div'):
            div = str(div)
            if r'itemprop="text"' in div:
                simple_row = parse(div)
                song_info.append(simple_row)
                songs_info.append(song_info)
                # Wait 1 second (reduces server load)
                time.sleep(1)
                break
    #print(songs_info)
    return songs_info
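The return value is one list per song, in the same order as the column names used in create_df below. A sketch of calling it directly (needs network access, and assumes the page layout still matches the selectors above):

# Each inner list is [Song_Title, Artist, Lyricist, Composer, Lyric]
songs_info = get_info('https://www.uta-net.com/user/ranking/daily.html')
print(len(songs_info), songs_info[0][0])  # number of songs and the first title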
Output the result to csv.
def create_df(URL):
    file_name = URL[0]
    url = URL[1]
    # Create a data frame
    df = pd.DataFrame(get_info(url))
    df = df.rename(columns={0: 'Song_Title', 1: 'Artist', 2: 'Lyricist', 3: 'Composer', 4: 'Lyric'})
    # Output a CSV file
    csv = df.to_csv("csv/{}.csv".format(file_name))
    return csv
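One caveat: to_csv writes into a csv/ subdirectory, so that folder must exist beforehand. A minimal sketch that creates it first (os is an extra import, and the file name here is made up):

import os
# Make sure the output directory that create_df writes into exists
os.makedirs('csv', exist_ok=True)
create_df(['sample_daily', 'https://www.uta-net.com/user/ranking/daily.html'])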
# Bulk acquisition
def whole(URL):
    for i in range(len(URL)):
        URLS = URL[i]
        create_df(URLS)
    return
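Calling it is just a matter of passing one of the lists defined below, for example:

# Scrape every [file_name, url] pair in the list
whole(URL_setting)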
Here is part of the list I created. The file name goes in the left position (URL_fre[0][0]) and the url goes in the right position (URL_fre[0][1]). With this, the file name can be set to whatever you like while the lyrics etc. are acquired.
# Rankings that are updated regularly
URL_fre = [['{}_daily'.format(datetime.date.today()), 'https://www.uta-net.com/user/ranking/daily.html'],
['{}_weekly'.format(datetime.date.today()), 'https://www.uta-net.com/user/ranking/weekly.html'],
['{}_monthly'.format(datetime.date.today()), 'https://www.uta-net.com/user/ranking/monthly.html']
]
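With today's date, the first entry expands to something like the following (the exact date depends on when you run it):

print(URL_fre[0])
# e.g. ['2021-01-01_daily', 'https://www.uta-net.com/user/ranking/daily.html']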
# Part of the list (artists I like)
URL_setting = [['Yorushika', 'https://www.uta-net.com/artist/22653/'],
['YOASOBI', 'https://www.uta-net.com/artist/28370/'],
['RADWIMPS', 'https://www.uta-net.com/artist/4082/']
]
Rename whichever list you want to use to URL and run the following, and you're done.
whole(URL)
Now I can collect urls in a list and scrape them all at once. Next, I'd like to move on to natural language processing!