First, we need CSV data. I wasn't sure what data to use, so I decided to scrape the lyrics of my favorite band, Yorushika.
First, install the modules required for scraping:
pip install requests
pip install bs4
pip install lxml
pip install pandas
I referred to this article: https://qiita.com/yuuuusuke1997/items/122ca7597c909e73aad5#%E3%81%8A%E3%82%8F%E3%82%8A%E3%81%AB
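Before running the full loop, it is worth confirming that the lyrics-list page actually returns song links. Below is a minimal sanity check; the artist ID 22653 and the 'side td1' selector are the same ones used in the full script, and the page layout is assumed to be the same as when this was written.
# Quick sanity check: fetch one lyrics-list page and count the song links
import requests
from bs4 import BeautifulSoup

resp = requests.get('https://www.uta-net.com/artist/22653/0/1/')
soup = BeautifulSoup(resp.text, 'lxml')
print(resp.status_code, len(soup.find_all('td', class_='side td1')))
If that prints 200 and a non-zero count, the full scraping script below should work.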
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import traceback

# Create a table to hold the scraped lyrics
list_df = pd.DataFrame(columns=['lyrics'])

for page in range(10):
    try:
        # Top page of the song site
        base_url = 'https://www.uta-net.com'
        # Lyrics list page for the artist (22653 = Yorushika)
        artist = "22653"
        url = 'https://www.uta-net.com/artist/' + artist + '/0/' + str(page) + '/'
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'lxml')
        links = soup.find_all('td', class_='side td1')
        for link in links:
            a = base_url + (link.a.get('href'))
            # Lyrics detail page
            response = requests.get(a)
            soup = BeautifulSoup(response.text, 'lxml')
            song_lyrics = soup.find('div', itemprop='lyrics')
            song_lyric = song_lyrics.text
            song_lyric = song_lyric.replace('\n', '')
            # Wait 1 second so as not to overload the server
            time.sleep(1)
            # Append the scraped lyrics to the table
            tmp_se = pd.DataFrame([song_lyric], index=list_df.columns).T
            list_df = pd.concat([list_df, tmp_se])  # pd.concat replaces the removed DataFrame.append
    except Exception:
        print(page)
        traceback.print_exc()

print(list_df)

# Save to CSV
list_df.to_csv('list.csv', mode='a', encoding='utf_8_sig')
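To make sure the scrape actually produced something, the saved file can be read straight back. This is just a quick check; note that mode='a' appends on every run, so repeated runs keep adding rows (and extra header lines) to list.csv.
# Read the saved CSV back and peek at the first few lyrics
import pandas as pd

check_df = pd.read_csv('list.csv')
print(check_df.shape)
print(check_df['lyrics'].head())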
Next, install what you need for the analysis:
pip install "https://github.com/megagonlabs/ginza/releases/download/v1.0.2/ja_ginza_nopn-1.0.2.tgz"
pip install matplotlib
pip install wordcloud
I referred to this article: https://qiita.com/osakasho/items/7408d031ca0b2192422f
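Before processing the whole CSV, it helps to look at what GiNZA returns for a single sentence. This is only a minimal sketch; the sample text is an arbitrary Yorushika song title, and the model name ja_ginza_nopn matches the package installed above.
# Minimal check of GiNZA output: tokens, part-of-speech tags, lemmas and noun chunks
import spacy

nlp = spacy.load('ja_ginza_nopn')
doc = nlp('ただ君に晴れ')  # arbitrary sample text
for token in doc:
    print(token.text, token.pos_, token.lemma_)
print([chunk.text for chunk in doc.noun_chunks])
The full analysis script: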
# coding: utf-8
import collections

import matplotlib.pyplot as plt
import pandas as pd
import spacy
from wordcloud import WordCloud

nlp = spacy.load('ja_ginza_nopn')


def ginza(word):
    doc = nlp(word)
    # Collect noun chunks and verb lemmas
    total_ls = []
    noun_ls = [chunk.text for chunk in doc.noun_chunks]
    verb_ls = [token.lemma_ for token in doc if token.pos_ == "VERB"]
    for n in noun_ls:
        total_ls.append(n)
    for v in verb_ls:
        total_ls.append(v)
    return total_ls, noun_ls, verb_ls


"""---------------CSV read and pre-set--------------"""
csv_read_path = "list.csv"
df = pd.read_csv(csv_read_path)
target_categories = ["lyrics"]
black_list = ["test"]
"""-------------------------------------------------------------"""

"""---------------Morphological analysis------------------------"""
for target in target_categories:
    total_voc = []  # Box to collect the extracted words
    for data in df[target]:
        try:
            word_ls, noun_ls, verb_ls = ginza(data)
        except Exception:  # If the text cannot be parsed, treat it as a single word
            word_ls = [data]
        for w in word_ls:
            if w not in black_list:  # Skip words on the blacklist
                total_voc.append(w)

    print("Number of words:", len(total_voc))

    # Rank words by frequency
    c = collections.Counter(total_voc)

    # Write the frequency table to CSV
    c_data = c.most_common()
    csvdf = pd.DataFrame(c_data)
    filename = target + ".csv"
    csvdf.to_csv(filename, encoding='utf_8_sig')
    print("----------------------------")

    # Quick bar chart of the top words
    # Specify a Japanese font so the labels render correctly
    plt.rcParams["font.family"] = "IPAexGothic"
    plt.title(target)
    plt.grid(True)
    graph_x_list = []
    graph_y_list = []
    top_num = 0
    for key, value in c.most_common():
        graph_x_list.append(key)
        graph_y_list.append(value)
        if top_num >= 10:
            break
        top_num += 1
    try:
        plt.bar(graph_x_list, graph_y_list)
        # Show the graph
        plt.show()
    except Exception:
        print(target, ": could not draw the data.")

    # Draw a word cloud (the font path below is for Windows)
    font = 'C:/Windows/Fonts/YuGothM.ttc'
    wordcloud = WordCloud(background_color="white", width=1000, height=600, font_path=font)
    wordcloud.generate(" ".join(total_voc))
    wordcloud.to_file(target + '.png')
"""-------------------------------------------------------------"""
That's it for now. Thanks for reading!