Let's get book information from the O'Reilly Japan website and classify the books by non-hierarchical clustering. The procedure is as follows:
・Follow the links from the top page of the site to each book's detail page, and collect the introduction text of each book into a list.
・For each book, break the introduction text down into words and weight each word.
・Classify the books by clustering based on that information.
The language used is Python.
clustering.py
#coding:utf-8
import numpy as np
import mechanize
import MeCab
import util
import re
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.cluster import AffinityPropagation
# get O'Reilly new books from Top page
page = mechanize.Browser()
page.open('http://www.oreilly.co.jp/index.shtml')
response = page.response()
soup = BeautifulSoup(response.read(), "html.parser")
allBookLinks = []
bibloLinks = soup.find_all("p", class_="biblio_link")
for bibloLink in bibloLinks:
    books = bibloLink.find_all("a", href=re.compile("http://www.oreilly.co.jp/books/"))
    for book in books:
        allBookLinks.append(book.get("href"))
clustering.py
def get_detail_sentence_list(detailPageLink):
    page.open(detailPageLink)
    detailResponse = page.response()
    detailSoup = BeautifulSoup(detailResponse.read(), "html.parser")
    # get title
    titleTag = detailSoup.find("h3", class_="title")
    title = titleTag.get_text().encode('utf-8')
    # get detail
    detailDiv = detailSoup.find("div", id="detail")
    detail = detailDiv.find("p").get_text().encode('utf-8')
    # get related book links
    relationLinks = detailDiv.find_all("a")
    relationLinkList = []
    for relationLink in relationLinks:
        href = relationLink.get("href")
        # skip anchors without an href, keep only links under /books/
        if href and href.find('/books/') > 0:
            relationLinkList.append(href[href.find('/books/') + len('/books/'):])
    return [title, detail, relationLinkList]
# crawl books info
titleList = []
inputDatas = []
for bookLink in allBookLinks:
    title, detail, relationLinkList = get_detail_sentence_list(bookLink)
    # save
    if not (title in titleList):
        titleList.append(title)
        inputDatas.append(detail)
    # follow the related book links as well
    # (use new names so we don't clobber relationLinkList while iterating over it)
    for relationLink in relationLinkList:
        relTitle, relDetail, _ = get_detail_sentence_list('http://www.oreilly.co.jp/books/' + relationLink)
        # save
        if not (relTitle in titleList):
            titleList.append(relTitle)
            inputDatas.append(relDetail)
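Before turning the text into features, a quick sanity check on what the crawl collected doesn't hurt. A minimal sketch (the actual counts depend on whatever the live site serves at crawl time):

# sanity check on the crawl results
print len(titleList)     # number of unique books collected
print len(inputDatas)    # should match len(titleList)
print titleList[0]       # spot-check the first title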
Using TfidfVectorizer, X ends up as a matrix where:
・the number of rows = the number of books crawled
・the number of columns = the number of distinct words across the introduction texts
・X[0, 0] = the TF-IDF value of the 0th word (the word stored in terms[0]) for the 0th book
You could compute TF-IDF by writing the logic yourself, but it's easier to use this library.
clustering.py
def get_word_list(targetText):
    tagger = MeCab.Tagger()
    wordList = []
    if len(targetText) > 0:
        node = tagger.parseToNode(targetText)
        while node:
            if len(util.mytrim(node.surface)) > 0:
                wordList.append(node.surface)
            node = node.next
    return wordList

tfidfVectorizer = TfidfVectorizer(analyzer=get_word_list, min_df=1, max_df=50)
X = tfidfVectorizer.fit_transform(inputDatas)
terms = tfidfVectorizer.get_feature_names()
util.py
#coding:utf-8
def mytrim(target):
    # drop embedded spaces, then strip surrounding whitespace
    target = target.replace(' ', '')
    return target.strip()
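To see the structure described above concretely, you can poke at X directly. One caveat: fit_transform returns a SciPy sparse matrix, so take the dimensions from X.shape rather than len(). A minimal sketch, assuming the X and terms built above:

# X is a sparse (books x vocabulary) TF-IDF matrix
print X.shape      # (number of books, number of distinct words)
print terms[0]     # the 0th word in the vocabulary
print X[0, 0]      # TF-IDF value of terms[0] in the 0th book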
I tried both K-means and Affinity Propagation. K-means is the choice when you have already decided how many clusters to split into; if you haven't, Affinity Propagation works quite well, since it determines the number of clusters itself. In this case, I think Affinity Propagation was the better fit.
clustering.py
# clustering by KMeans
k_means = KMeans(n_clusters=5, init='k-means++', n_init=5, verbose=True)
k_means.fit(X)
label = k_means.labels_

clusterList = {}
for i in range(len(titleList)):
    clusterList.setdefault(label[i], '')
    clusterList[label[i]] = clusterList[label[i]] + ',' + titleList[i]

print 'By KMeans'
for key, value in clusterList.items():
    print key
    print value

print 'By AffinityPropagation'
# clustering by AffinityPropagation
af = AffinityPropagation().fit(X)
afLabel = af.labels_

afClusterList = {}
for i in range(len(titleList)):
    afClusterList.setdefault(afLabel[i], '')
    afClusterList[afLabel[i]] = afClusterList[afLabel[i]] + ',' + titleList[i]

for key, value in afClusterList.items():
    print key
    print value
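The n_clusters=5 above is hand-picked. If you do want a data-driven number of clusters for K-means, one common heuristic is the elbow method: fit K-means for a range of k and look for the bend in the inertia curve. A minimal sketch using the X built above (reading the "elbow" off the curve is a judgment call, not an exact rule):

# elbow method: inertia_ is the within-cluster sum of squared distances
for k in range(2, 10):
    elbow_model = KMeans(n_clusters=k, init='k-means++', n_init=5)
    elbow_model.fit(X)
    print k, elbow_model.inertia_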
And that's the result!