First, we need CSV data. I wasn't sure what data to use, so I decided to scrape the lyrics of my favorite band, Yorushika.
First, install the modules required for scraping:
pip install requests
pip install bs4
pip install lxml
pip install pandas
I referred to this article: https://qiita.com/yuuuusuke1997/items/122ca7597c909e73aad5#%E3%81%8A%E3%82%8F%E3%82%8A%E3%81%AB
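Before running the full loop, it is worth confirming that the lyrics-list page actually returns song links. Below is a minimal sanity check; the artist ID 22653 and the 'side td1' selector are the same ones used in the full script, and the page layout is assumed to be the same as when this was written.
# Quick sanity check: fetch one lyrics-list page and count the song links
import requests
from bs4 import BeautifulSoup

resp = requests.get('https://www.uta-net.com/artist/22653/0/1/')
soup = BeautifulSoup(resp.text, 'lxml')
print(resp.status_code, len(soup.find_all('td', class_='side td1')))
If that prints 200 and a non-zero count, the full scraping script below should work.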
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import traceback

# Create a table to hold the scraped lyrics
list_df = pd.DataFrame(columns=['lyrics'])

for page in range(10):
    try:
        # Top page of the song site
        base_url = 'https://www.uta-net.com'
        # Lyrics list page for the artist (22653 = Yorushika)
        artist = "22653"
        url = 'https://www.uta-net.com/artist/' + artist + '/0/' + str(page) + '/'
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'lxml')
        links = soup.find_all('td', class_='side td1')
        for link in links:
            a = base_url + (link.a.get('href'))
            # Lyrics detail page
            response = requests.get(a)
            soup = BeautifulSoup(response.text, 'lxml')
            song_lyrics = soup.find('div', itemprop='lyrics')
            song_lyric = song_lyrics.text
            song_lyric = song_lyric.replace('\n', '')
            # Wait 1 second so as not to overload the server
            time.sleep(1)
            # Append the scraped lyrics to the table
            tmp_se = pd.DataFrame([song_lyric], index=list_df.columns).T
            list_df = pd.concat([list_df, tmp_se])  # pd.concat replaces the removed DataFrame.append
    except Exception:
        print(page)
        traceback.print_exc()

print(list_df)

# Save to CSV
list_df.to_csv('list.csv', mode='a', encoding='utf_8_sig')
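To make sure the scrape actually produced something, the saved file can be read straight back. This is just a quick check; note that mode='a' appends on every run, so repeated runs keep adding rows (and extra header lines) to list.csv.
# Read the saved CSV back and peek at the first few lyrics
import pandas as pd

check_df = pd.read_csv('list.csv')
print(check_df.shape)
print(check_df['lyrics'].head())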
Next, install what you need for the analysis:
pip install "https://github.com/megagonlabs/ginza/releases/download/v1.0.2/ja_ginza_nopn-1.0.2.tgz"
pip install matplotlib
pip install wordcloud
I referred to this article: https://qiita.com/osakasho/items/7408d031ca0b2192422f
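Before processing the whole CSV, it helps to look at what GiNZA returns for a single sentence. This is only a minimal sketch; the sample text is an arbitrary Yorushika song title, and the model name ja_ginza_nopn matches the package installed above.
# Minimal check of GiNZA output: tokens, part-of-speech tags, lemmas and noun chunks
import spacy

nlp = spacy.load('ja_ginza_nopn')
doc = nlp('ただ君に晴れ')  # arbitrary sample text
for token in doc:
    print(token.text, token.pos_, token.lemma_)
print([chunk.text for chunk in doc.noun_chunks])
The full analysis script: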
# coding: utf-8
import collections

import matplotlib.pyplot as plt
import pandas as pd
import spacy
from wordcloud import WordCloud

nlp = spacy.load('ja_ginza_nopn')


def ginza(word):
    doc = nlp(word)
    # Collect noun chunks and verb lemmas
    total_ls = []
    noun_ls = [chunk.text for chunk in doc.noun_chunks]
    verb_ls = [token.lemma_ for token in doc if token.pos_ == "VERB"]
    for n in noun_ls:
        total_ls.append(n)
    for v in verb_ls:
        total_ls.append(v)
    return total_ls, noun_ls, verb_ls


"""---------------CSV read and pre-set--------------"""
csv_read_path = "list.csv"
df = pd.read_csv(csv_read_path)
target_categories = ["lyrics"]
black_list = ["test"]
"""-------------------------------------------------------------"""

"""---------------Morphological analysis------------------------"""
for target in target_categories:
    total_voc = []  # Box to collect the extracted words
    for data in df[target]:
        try:
            word_ls, noun_ls, verb_ls = ginza(data)
        except Exception:  # If the text cannot be parsed, treat it as a single word
            word_ls = [data]
        for w in word_ls:
            if w not in black_list:  # Skip words on the blacklist
                total_voc.append(w)

    print("Number of words:", len(total_voc))

    # Rank words by frequency
    c = collections.Counter(total_voc)

    # Write the frequency table to CSV
    c_data = c.most_common()
    csvdf = pd.DataFrame(c_data)
    filename = target + ".csv"
    csvdf.to_csv(filename, encoding='utf_8_sig')
    print("----------------------------")

    # Quick bar chart of the top words
    # Specify a Japanese font so the labels render correctly
    plt.rcParams["font.family"] = "IPAexGothic"
    plt.title(target)
    plt.grid(True)
    graph_x_list = []
    graph_y_list = []
    top_num = 0
    for key, value in c.most_common():
        graph_x_list.append(key)
        graph_y_list.append(value)
        if top_num >= 10:
            break
        top_num += 1
    try:
        plt.bar(graph_x_list, graph_y_list)
        # Show the graph
        plt.show()
    except Exception:
        print(target, ": could not draw the data.")

    # Draw a word cloud (the font path below is for Windows)
    font = 'C:/Windows/Fonts/YuGothM.ttc'
    wordcloud = WordCloud(background_color="white", width=1000, height=600, font_path=font)
    wordcloud.generate(" ".join(total_voc))
    wordcloud.to_file(target + '.png')
"""-------------------------------------------------------------"""
That's it for now. Thanks for reading!