In my last web scraping code, I found it annoying that I could only scrape one page at a time. I also wanted to keep the urls I had used and stop renaming the output file every time. So this time I quickly improved those areas.
This time I added datetime to the libraries. It comes in handy because the saved file names for the daily ranking and so on change automatically with the date.
import re
import time
import datetime
import requests
import pandas as pd
from bs4 import BeautifulSoup
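For example, the date-stamped name used for the daily ranking later on can be built like this (a minimal sketch; the '_daily' suffix is just the one used in the list at the end):

# datetime.date.today() renders as 'YYYY-MM-DD', so this gives e.g. '2021-01-01_daily'
file_name = '{}_daily'.format(datetime.date.today())
print(file_name)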
I removed the deletion of alphanumeric characters and symbols from the parse function. Since it also erases the half-width spaces between English words, I decided that symbols are better handled later, during natural language processing.
# Get the website and output it in text format
def load(url):
    res = requests.get(url)
    # Raise HTTPError if the HTTP request returned a failed status code
    res.raise_for_status()
    # Get the response body in text format
    return res.text
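For a quick check, load can be pointed at any page, for example the daily ranking url used later (this needs network access):

# Fetch the daily ranking page and peek at the first characters
html = load('https://www.uta-net.com/user/ranking/daily.html')
print(html[:100])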
# Get html tags
def get_tag(html, find_tag):
    soup = BeautifulSoup(str(html), 'html.parser')
    tag = soup.find_all(find_tag)
    return tag
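Since get_tag casts its argument to str before parsing, it can be nested, e.g. pulling a elements out of each td. A minimal sketch with made-up markup:

# Find the link inside a td element (the markup is illustrative only)
sample_html = '<table><tr><td><a href="/song/12345/">test</a></td></tr></table>'
for td in get_tag(sample_html, 'td'):
    for a in get_tag(td, 'a'):
        print(a.get('href'))  # -> /song/12345/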
# Convert to a data structure that the program can handle
def parse(html):
    soup = BeautifulSoup(str(html), 'html.parser')
    # Remove html tags
    simple_row = soup.getText()
    #simple_row = simple_row.replace('\u3000', ' ')
    #simple_row = simple_row.replace('\n', ' ')
    # Delete alphanumeric characters (if needed)
    #simple_row = re.sub(r'[a-zA-Z0-9]', '', simple_row)
    # Delete symbols (if needed) *The catch is that the spaces between English words disappear when you use this
    #simple_row = re.sub(r'[<>♪`‘’“”・…_!?!-/:-@[-`{-~]', '', simple_row)
    # Delete the notice
    simple_row = re.sub(r'Note:.+', '', simple_row)
    # Full-width space to half-width (to keep the breaks in the lyrics)
    simple_row = simple_row.replace('\u3000', ' ')
    simple_row = simple_row.replace('\n', '')
    return simple_row
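A quick way to check parse is to feed it a small inline fragment; the markup below is made up for illustration (the full-width space is written as \u3000):

# The html tags and the 'Note:' part are stripped, and the full-width space becomes half-width
sample = '<div itemprop="text">Hello\u3000world<br/>Note: this part is removed</div>'
print(parse(sample))  # -> 'Hello world'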
I changed the function create_df a little.
# Acquire the information for each song
def get_info(url):
    base_url = 'https://www.uta-net.com/'
    html = load(url)
    # Store the url of each song
    song_url = []
    # Store one song
    song_info = []
    songs_info = []
    # Get the song urls
    # Store the urls found in td elements
    for td in get_tag(html, 'td'):
        # Get the a elements
        for a in get_tag(td, 'a'):
            # Whether the href attribute contains 'song'
            if 'song' in a.get('href'):
                # Add the url to the array
                song_url.append(base_url + a.get('href'))
    # Get the song information
    for i, page in enumerate(song_url):
        print('Song {}: {}'.format(i + 1, page))
        html = load(page)
        song_info = []
        #Song_Title
        for h2 in get_tag(html, 'h2'):
            # Cast to str once so the id search works
            h2 = str(h2)
            # Whether it is the class element that stores the lyrics
            if r'class="prev_pad"' in h2:
                # Remove unnecessary data
                simple_row = parse(h2)
                song_info.append(simple_row)
            else:
                for h2 in get_tag(html, 'h2'):
                    h2 = str(h2)
                    simple_row = parse(h2)
                    song_info.append(simple_row)
        #Artist
        for h3 in get_tag(html, 'h3'):
            h3 = str(h3)
            if r'itemprop="byArtist"' in h3:
                simple_row = parse(h3)
                song_info.append(simple_row)
        #Lyricist
        for h4 in get_tag(html, 'h4'):
            h4 = str(h4)
            if r'itemprop="lyricist"' in h4:
                simple_row = parse(h4)
                song_info.append(simple_row)
        #Composer
        for h4 in get_tag(html, 'h4'):
            h4 = str(h4)
            if r'itemprop="composer"' in h4:
                simple_row = parse(h4)
                song_info.append(simple_row)
        #Lyric
        for div in get_tag(html, 'div'):
            div = str(div)
            if r'itemprop="text"' in div:
                simple_row = parse(div)
                song_info.append(simple_row)
                songs_info.append(song_info)
                # Wait 1 second (reduces server load)
                time.sleep(1)
                break
    #print(songs_info)
    return songs_info
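The return value is one list per song, in the same order as the column names used in create_df below. A sketch of calling it directly (needs network access, and assumes the page layout still matches the selectors above):

# Each inner list is [Song_Title, Artist, Lyricist, Composer, Lyric]
songs_info = get_info('https://www.uta-net.com/user/ranking/daily.html')
print(len(songs_info), songs_info[0][0])  # number of songs and the first title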
Output the result to csv.
def create_df(URL):
    file_name = URL[0]
    url = URL[1]
    # Create a data frame
    df = pd.DataFrame(get_info(url))
    df = df.rename(columns={0: 'Song_Title', 1: 'Artist', 2: 'Lyricist', 3: 'Composer', 4: 'Lyric'})
    # Output a CSV file
    csv = df.to_csv("csv/{}.csv".format(file_name))
    return csv
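One caveat: to_csv writes into a csv/ subdirectory, so that folder must exist beforehand. A minimal sketch that creates it first (os is an extra import, and the file name here is made up):

import os
# Make sure the output directory that create_df writes into exists
os.makedirs('csv', exist_ok=True)
create_df(['sample_daily', 'https://www.uta-net.com/user/ranking/daily.html'])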
# Bulk acquisition
def whole(URL):
    for i in range(len(URL)):
        URLS = URL[i]
        create_df(URLS)
    return
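Calling it is just a matter of passing one of the lists defined below, for example:

# Scrape every [file_name, url] pair in the list
whole(URL_setting)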
Here is part of the list I created. The file name goes in the left position (URL_fre[0][0]) and the url goes in the right position (URL_fre[0][1]). With this, the file name can be set to whatever you like while the lyrics etc. are acquired.
# Rankings that are updated regularly
URL_fre = [['{}_daily'.format(datetime.date.today()), 'https://www.uta-net.com/user/ranking/daily.html'],
['{}_weekly'.format(datetime.date.today()), 'https://www.uta-net.com/user/ranking/weekly.html'],
['{}_monthly'.format(datetime.date.today()), 'https://www.uta-net.com/user/ranking/monthly.html']
]
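With today's date, the first entry expands to something like the following (the exact date depends on when you run it):

print(URL_fre[0])
# e.g. ['2021-01-01_daily', 'https://www.uta-net.com/user/ranking/daily.html']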
# Part of the list (artists I like)
URL_setting = [['Yorushika', 'https://www.uta-net.com/artist/22653/'],
['YOASOBI', 'https://www.uta-net.com/artist/28370/'],
['RADWIMPS', 'https://www.uta-net.com/artist/4082/']
]
Rename whichever list you want to use to URL and run the following, and you're done.
whole(URL)
Now I can collect urls in a list and scrape them all at once. Next, I'd like to move on to natural language processing!