This is the third article in the Spark series. This time, let's cluster Yahoo News articles with a topic model, using MLlib's LDA (Latent Dirichlet Allocation).
First article: [Machine learning] Start Spark with iPython Notebook and try MLlib http://qiita.com/kenmatsu4/items/00ad151e857d546a97c3
Second article: [Machine learning] Try running Spark MLlib with Python and make recommendations http://qiita.com/kenmatsu4/items/42fa2f17865f7914688d
Please note that this article describes what was done in the environment built in the articles above, so the settings may differ in other environments.
Python is run in iPython Notebook (Jupyter).
Yahoo News publishes its RSS feeds at http://headlines.yahoo.co.jp/rss/list, so collect the article links and bodies from there.
# Various imports
from bs4 import BeautifulSoup
import requests, json, time
from requests_oauthlib import OAuth1Session
from requests.exceptions import ConnectionError, ReadTimeout, SSLError
import numpy as np
import numpy.random as rd
import MeCab as mc
from collections import defaultdict
import cPickle as pickle
import traceback
from datetime import datetime as dt

IPADIC_NEOLOGD_PATH = '/usr/local/lib/mecab/dic/mecab-ipadic-neologd/'

def unpickle(filename):
    with open(filename, 'rb') as fo:
        p = pickle.load(fo)
    return p

def to_pickle(filename, obj):
    with open(filename, 'wb') as f:
        pickle.dump(obj, f, -1)
The following classes download articles from Yahoo News (http://headlines.yahoo.co.jp/rss/list). This takes a lot of time and puts a fair number of requests on Yahoo News, so I have already downloaded the data and run it through MeCab morphological analysis; if you want to reproduce this article, please use that data instead (explained in the next section).
For MeCab I use @overlast's mecab-ipadic-neologd. How to install this MeCab setup was covered in [this article](http://qiita.com/kenmatsu4/items/02034e5688cc186f224b#1-1mecab introduction), so please refer to it when installing.
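As a quick sanity check that the NEologd dictionary is found, something like the following should work (a minimal sketch; the dictionary path is from my environment, so adjust it to yours):

```python
# -*- coding: utf-8 -*-
import MeCab as mc

# The path below is an assumption; point it at your mecab-ipadic-neologd install.
t = mc.Tagger('-Ochasen -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd/')
# NEologd should keep recent compound proper nouns together as single tokens.
print t.parse('東京ゲームショウが開催された')
```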
# Classes that pick up articles from Yahoo News
class Category():
    def __init__(self, name="", url=""):
        self.name = name
        self.url = url
        self.article_list = []

    def addArticle(self, article):
        self.article_list.append(article)


class Article():
    def __init__(self, title="", contents=u"Unacquired", url=""):
        self.url = url
        self.title = title
        self.contents = contents
        self.mecabed_contents = {}

    def add_contents(self, contents):
        self.contents = contents

    def exec_mecab(self):
        self.mecabed_contents = Article.mecab_analysis(self.contents)

    @staticmethod
    def mecab_analysis(sentence):
        t = mc.Tagger('-Ochasen -d {}'.format(IPADIC_NEOLOGD_PATH))
        sentence = sentence.replace('\n', ' ')
        text = sentence.encode('utf-8')
        node = t.parseToNode(text)
        ret_list = []
        while node.next:
            if node.surface != "":  # skip BOS/EOS nodes, which have an empty surface
                word_type = node.feature.split(",")[0]
                # MeCab emits Japanese POS tags: noun, adjective, verb
                if word_type in ["名詞", "形容詞", "動詞"]:
                    plain_word = node.feature.split(",")[6]  # base form
                    if plain_word != "*":
                        ret_list.append(plain_word.decode('utf-8'))
            node = node.next
        return ret_list
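For example, calling it on a hypothetical sentence returns the base forms of the extracted nouns, adjectives and verbs:

```python
# Usage sketch with a hypothetical input sentence (unicode in Python 2).
words = Article.mecab_analysis(u'新作ゲームの発表イベントが東京で開催された')
print u' / '.join(words)
```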
DEBUG = True

class YahooHeadlines():
    def __init__(self):
        self.url = 'http://headlines.yahoo.co.jp/rss/list'
        self.category_list = []
        self.f = open('log/log_{}.log'.format(dt.now().strftime('%Y%m%d_%H%M%S')), 'a+')

    def close(self):
        self.f.close()

    def logging(self, log):
        self.f.write(log.encode('utf-8'))

    def unpickle(self, filename):
        with open(filename, 'rb') as fo:
            p = pickle.load(fo)
        self.category_list = p

    def pickle(self, filename):
        with open(filename, 'wb') as f:
            pickle.dump(self.category_list, f, -1)

    def download_contents(self):
        self.get_category_url_list()
        self.get_article_title_list()
        self.get_all_article()

    def get_url_list(self):
        return self._url_list

    def set_category_list(self, category_list):
        self.category_list = category_list

    def get_category_list(self):
        return self.category_list

    def get_category_url_list(self):
        res = requests.get(self.url)
        news_all = BeautifulSoup(res.text, "xml")
        for link in news_all.find_all('a'):
            url = link.get('href')
            if 'xml' in url and 'my.yahoo.co.jp' not in url:
                self.category_list.append(Category(name=link.parent.text.replace('\n', ''), url=url))
        if DEBUG:
            print "len(self.category_list)", len(self.category_list)

    def get_article_title_list(self):
        for category in self.category_list:
            res = requests.get(category.url)
            soup = BeautifulSoup(res.text, "xml")
            for item in soup.find_all('item'):
                category.addArticle(Article(title=item.title.getText(), url=item.link.getText()))

    def count(self):
        print "len(self.category_list)", len(self.category_list)
        for cat in self.category_list:
            print "len(cat.article_list)", len(cat.article_list)

    def get_all_article(self, start=0, end=None):
        end = len(self.category_list) if end is None else end
        for cat in self.category_list[start:end]:
            print cat.name
            for article in cat.article_list:
                try:
                    print article.title
                    time.sleep(0.5)  # interval time for reducing server load
                    res = requests.get(article.url)
                    soup = BeautifulSoup(res.text, "xml")
                    t = soup.find("p", "ynDetailText")
                    if len(t.getText()) > 0:
                        temp = []
                        for line in t.prettify().split('\n'):
                            if '<!-- /.paragraph -->' in line:
                                break
                            temp.append(line)
                        article.add_contents(BeautifulSoup("\n".join(temp), "xml").get_text().replace(' ', '').replace('\n', ''))
                        article.exec_mecab()
                except Exception as e:
                    print "error."
                    self.logging(u"{},{}".format(article.url, article.title))
                    self.logging(traceback.format_exc())

    def export(self):
        news_list = []
        for c in self.category_list:
            for a in c.article_list:
                if u'Unacquired' != a.contents:
                    news_list.append(a.mecabed_contents)
        return news_list
Download the news articles and save them with pickle.
yh = YahooHeadlines()
print "YahooHeadlines() created."
yh.get_category_url_list()
print "get_category_url_list() finished."
yh.get_article_title_list()
print "get_article_title_list() finished."
yh.get_all_article(start=9, end=30)
dat = yh.export()
to_pickle('mecabed_contents.npy', dat)
The morphologically analyzed mecabed_contents.npy is stored at the following location on GitHub; please download it and use it. https://github.com/matsuken92/Qiita_Contents/tree/master/LDA_with_Spark
Load the data below.
dat = unpickle('mecabed_contents.npy')
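Each element of `dat` is the list of base-form words extracted from one article, so you can take a quick look like this (the output depends on the crawl):

```python
# dat: a list of word lists, one entry per article
print len(dat)                # number of articles
print u' '.join(dat[0][:10])  # first ten extracted words of the first article
```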
In the LDA calculation a Java OutOfMemoryError occurred, so increase the heap size. (This depends on the environment you run in, so set it appropriately.)
cd $SPARK_HOME/conf
cp spark-defaults.conf.template spark-defaults.conf
vi spark-defaults.conf
Uncomment the spark.driver.memory line and set its value as shown below.
spark-defaults.conf
# spark.master spark://master:7077
# spark.eventLog.enabled true
# spark.eventLog.dir hdfs://namenode:8021/directory
# spark.serializer org.apache.spark.serializer.KryoSerializer
spark.driver.memory 5g
# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"
Please also refer to the previous article for how to start Spark from iPython Notebook (Jupyter).
Anyway, start Spark.
import os, sys
import pandas as pd
import numpy as np
from datetime import datetime as dt

print "loading PySpark setting..."
spark_home = os.environ.get('SPARK_HOME', None)
print spark_home
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.8.2.1-src.zip'))
execfile(os.path.join(spark_home, 'python/pyspark/shell.py'))
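After `shell.py` runs, the SparkContext is available as `sc`; a trivial sanity check:

```python
# Confirm the SparkContext started by shell.py is usable.
print sc.version
print sc.parallelize(range(10)).sum()  # expect 45
```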
**Note: There is a view that running the topic model on Tf-Idf values improves accuracy, but theoretically LDA is supposed to take word counts as input, while Tf-Idf values are floats, so this approach does not match the theory. I will add the normal word-count version soon. (See the comment section. Thank you for your comment, @ixofog417.)**
First, as preparation for feeding the downloaded articles to LDA, vectorize them with Tf-Idf.
import pandas as pd
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF

hashingTF = HashingTF()
documents = sc.parallelize(dat)

def hashing(x):
    # map a single word to its hash bucket index
    return hashingTF.transform([x]).indices[0]

hashed = documents.flatMap(lambda line: line) \
                  .map(lambda word: (hashing(word), word)).distinct()

hashed_word = pd.DataFrame(hashed.collect(), columns=['hash', 'word']).set_index('hash')
hashed_word
`hashed_word` contains the data below. Since I later want to look up the original word from its hash value, I keep this mapping as a table.
| hash | word |
|---:|---|
| 605 | Invited guests |
| 342707 | Gambler |
| 578741 | Flower Companies |
| 445743 | Dogo Onsen |
| 599361 | BURBERRY |
| 520201 | Tokyo Game Show |
| ... | ... |
| 735678 | Omit |
| 56058 | Security |
| 444490 | Tsukiko |
| 706206 | GENERATIONS |
| 267402 | Coupe |

41261 rows × 1 columns
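Note that `HashingTF` folds the vocabulary into a fixed number of buckets (2^20 = 1,048,576 by default), so distinct words can collide in the same bucket; this is why the `idx_to_word` helper later has to handle the case where one hash maps to several words. You can check the bucket count like this:

```python
# HashingTF's default feature dimension; collisions are possible because
# the whole vocabulary is folded into this fixed number of buckets.
print hashingTF.numFeatures  # 1048576 (= 2**20)
```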
Calculate the Tf-Idf values with Spark and generate the converted RDD in a form that LDA can read.
# Tf-Idf generation
tf = hashingTF.transform(documents)
tf.cache()
idf = IDF().fit(tf)
tf_idf_data = idf.transform(tf)
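As the note above says, LDA theoretically expects integer word counts rather than Tf-Idf floats. A minimal sketch of that variant (my assumption of what the word-count version would look like, not the author's final code) trains on the raw term-frequency vectors from `HashingTF` and skips the IDF step:

```python
from pyspark.mllib.clustering import LDA

# Word-count variant (sketch): HashingTF already produces raw term
# frequencies, so LDA can train on `tf` directly, without the IDF transform.
corpus_counts = tf.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()
ldaModel_counts = LDA.train(corpus_counts, k=30)
```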
Run LDA with Spark MLlib. For the time being, set k = 30. The hard part is how to determine the number of topics; this time I just picked a value by gut feeling, so there is probably a better one. (If you know a good way to choose k, please tell me!)
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors

print dt.now().strftime('%Y/%m/%d %H:%M:%S')

K = 30

# Index documents with unique IDs
corpus = tf_idf_data.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()

# Cluster the documents into K topics using LDA
%time ldaModel = LDA.train(corpus, k=K)

# Output topics. Each is a distribution over words (matching word count vectors)
print("Learned topics (as distributions over vocab of " + str(ldaModel.vocabSize()) + " words):")
%time topics = ldaModel.topicsMatrix()

print dt.now().strftime('%Y/%m/%d %H:%M:%S')
I ran this on a not-very-powerful "new MacBook", and the calculation took about 12 minutes. Quite doable :smile:
out
2015/09/20 17:31:17
CPU times: user 6.34 ms, sys: 2.09 ms, total: 8.44 ms
Wall time: 30.8 s
Learned topics (as distributions over vocab of 1048576 words):
CPU times: user 5min 14s, sys: 6min 12s, total: 11min 26s
Wall time: 11min 53s
2015/09/20 17:43:42
Output the results calculated in the previous section.
def idx_to_word(idx):
    res = hashed_word.ix[idx].word
    # hash collisions: several words can share one bucket, giving a Series
    if type(res) == pd.Series:
        return res.to_dict().values()[0]
    else:
        return res

rep_num = 20
for topic in range(K):
    print("Topic " + str(topic) + ":")
    temp_w = []
    temp_t = []
    for word in range(0, ldaModel.vocabSize()):
        top = topics[word][topic]
        if top != 0:
            #print("{}:{}".format(word, top))
            temp_w.append(word)
            temp_t.append(top)
    temp_w = np.array(temp_w)
    temp_t = np.array(temp_t)
    idx = np.argsort(temp_t)[::-1]
    print ','.join(map(idx_to_word, temp_w[idx[:rep_num]]))
    print temp_t[idx[:rep_num]]
Below are the classification results, showing the top 20 words of each topic. They look roughly plausible, but tuning is still needed; it is not always clear what each topic refers to. Perhaps it comes down to how k is chosen.
out
Topic 0:
3D,1%,JP,Yahoo,co.jp,http://,2Z,FE,TC,WC,JavaScript,SRC,ALT,D2,Minutes,.S,SIG,clear,Mi,GIF
[ 30498.99621439 6067.97495307 5638.31180986 4239.90976107
3839.63866955 3620.87671019 2048.76800459 2035.55013512
2035.55013512 2035.55013512 1903.02711354 1898.96547573
1820.93929181 1763.1621581 1724.74815005 1688.15876657
1613.83355369 1483.59938276 1454.82128817 1338.48860166]
Topic 1:
Deep learning,GPU,Insolvency,Nuclear power plant,Ajin,Operation,Kyocera,Calculation,Hamada,Japan Inter,Recommendation,Mr. Murakami,Woodward,Library,Buying,ABC,DI,Utilization,Preventive medicine,Net worth
[ 230.26782221 222.54019498 109.0725775 86.27167445 86.10057908
84.22603202 67.68409895 66.99081298 60.91536464 57.4006148
57.16789412 50.24346965 50.17063652 45.16572514 43.57092785
43.37177773 43.06492631 41.84250571 40.60449032 39.60700784]
Topic 2:
sound,Professor,Amplifier,Proceedings,speaker,Suzuki,University,Communist Party,A-10,DR,(stock),SUZUKI,Mr.,Capital,HONDA,Itano,Mr,Recording,Internet,band
[ 313.9497848 311.07373468 291.18216703 200.41036663 174.99267573
168.83426043 162.12249119 160.4631899 158.44550237 155.86272676
152.51636208 145.63724853 144.22014499 143.88263097 138.80529834
136.38362019 133.87558279 132.8150622 127.1633457 123.42496755]
Topic 3:
Dolphin,patient,Carry-in,Navy,Membership,米Navy,Strategy,lawyer,Shopkeeper,train,Shunga,Goku ZERO,Military,Fukuoka,Agreement,Group,beauty and the Beast,Rank,emblem,update
[ 285.35384105 125.29445731 122.03394224 117.37108065 114.56787287
107.67685141 107.66792085 107.49265658 104.77371348 104.55689386
103.34343411 101.54959522 99.13195887 97.66056425 87.6906483
83.77795736 82.83739301 82.06384181 81.99063074 79.61260345]
Topic 4:
Ogasawara Islands,19th,rain,NARUTO-Naruto-,Prospect,tomorrow,Place,Chocobo,Today,fields,Typhoon No. 20,Sediment disaster,Ofuji,Luna,weapon,Very,Station,is there,Hmm,Tohoku
[ 230.41298471 206.73983243 201.38377462 162.53955457 156.01089213
152.26626716 147.20327527 143.56116858 138.58499586 136.35519513
134.63602579 131.89025362 122.02553338 114.84698842 114.73039984
112.58882552 111.19144156 109.29280382 108.74278871 108.06638723]
Topic 5:
shop,LGBT,Rural,Autumn leaves,Hanyu,Everest,Parties,USJ,MM,welding,NorAh,Kushiro,player,baseball,Man,Abe,Toi,loss,Muraki,Fukamachi
[ 534.02134183 233.21159627 161.734613 149.27499135 148.04072853
139.83024817 128.12607155 127.16365004 121.55663036 116.93175677
115.10536063 111.9230136 108.32928292 101.01309412 99.57305727
97.8645909 93.31870841 90.55202246 88.16103482 85.11086582]
Topic 6:
Self-Defense Force,The relevant,Activities,Item,Implementation,rescue,etc,support,Regulations,Goods,Article,Cooperation,Measure,Unit,Services,search,situation,Offer,two,army
[ 425.27200701 410.14147759 340.63660257 335.99268066 301.03835559
277.69844718 262.99789699 244.04626438 241.86903535 233.56945124
226.29603529 213.94031937 208.31405209 198.09771261 191.92479361
173.18290576 171.56092092 164.69617574 147.1031081 144.02472698]
Topic 7:
Hmm,think,さHmm,Man,dance,Go,Venue,Yo,do,player,word,Song,stage,Become,create,come,Appearance,member,thing,Female
[ 400.87252109 311.02748052 250.83203469 243.87087686 241.62681685
235.1485944 219.71001515 212.56170962 206.76164473 198.28774766
190.64751854 190.09850913 187.53964957 178.53456693 173.1581961
170.93611348 167.90595764 166.71680877 163.85967037 160.64966047]
Topic 8:
Lee Seung Gi,Cheap smartphone,Endo,Police box,Architecture,design,SHEENA,Carla,team Dempa.inc,height,Joe,Construction,Chidori,Eve,Christian,Kura,Inoculation,Case study,Special treatment,competition
[ 122.01843494 100.42493188 96.7819965 90.82173926 84.67996554
84.04629268 81.2426219 81.22826354 79.28066538 77.10645017
75.3958751 70.7937157 67.79664672 67.62432926 62.02688985
61.12174747 60.911537 60.671785 60.6691196 59.22618216]
Topic 9:
Ishihara,Yamashita,Month 9,Nuclear power plant,Total,MAX,Mr.,Alibaba,won,Kawajima,love,Monk,Ten thousand,Successful bid,,,Takamine,Role,starring,坊Mr.,Ministry of Employment and Labor
[ 251.21702545 246.98644992 188.33673645 180.99682139 170.83125774
161.27898596 150.18861226 148.37545664 145.26656891 116.99233982
115.97102397 111.61849001 108.61185273 108.09102905 104.38739566
103.32743846 96.51126917 95.40721995 95.33654317 94.80918496]
Topic 10:
Right of collective self-defense,exercise,security,Japan,remains,bill,North Korea,GO,system,Prime Minister,Rice,Pokemon,Korea,peace,war,Thinking,Meeting,Established,Kobayashi,Telomeres
[ 325.4190462 294.03767623 253.6549491 215.81603062 212.85361125
212.4241334 203.16256149 145.41407277 145.35949337 144.77378143
140.99962347 135.45572385 131.0855378 121.75771794 118.79648391
117.21162034 115.63520103 115.03735685 115.02058923 114.84203109]
Topic 11:
1,0,2,3,5,4,9,8,bill,6,Day,7,Committee member,Year,Opposition,Diet,%,Draft,Congressman,Ruling party
[ 2365.0615964 1843.50976149 1580.14166211 977.45697796 972.93295992
900.33510929 811.76679401 734.30895952 708.8845634 687.91169097
666.9871633 638.37414039 480.65198962 403.9740617 397.36591408
389.03843978 378.11150102 372.94260471 366.06518175 348.52658829]
Topic 12:
%,week,Man,tea,Labor,Ogiso,Yamaguchi-gumi,Kuwahara,cork,Dispatch,Investigation,Examination,Visit to Japan,Saenuri Party,fan,Answer,Extension,period,Ten thousand,Mr.
[ 422.50645087 213.35496472 190.18553723 185.25693 172.87477417
169.32178049 168.65380074 168.60270933 165.10107941 163.39675225
158.10205955 157.84657732 156.61876499 150.94891424 144.86004174
142.60856342 141.41821081 139.14405814 136.07482269 129.11386079]
Topic 13:
Pets,tire,inspection,Fujifilm,shop,dog,accident,Owner,Bread,Quantitative easing,bubble,February,Archer,organ,animal,business,ELEMENT,Closed,Petsシッター,Take care
[ 144.38505303 139.38328152 138.65459253 120.09515611 117.32842831
111.2811561 97.34985563 90.9386823 88.76830528 86.09862267
86.03676176 81.16131412 73.04842045 71.94537875 71.76221994
69.36573458 67.72482177 67.56636611 64.59788527 63.72988257]
Topic 14:
debris,space,satellite,Okano,S.M.A.R.T,Yasu,Amber Heard,photon,Kinoshita Hanta,beam,Removal,pasta,Lovely Doll ★ DOLL,Ninomiya,rocket,player,collision,construction,Plating,Shinohara
[ 200.98746196 109.11393056 102.69563054 71.64443048 70.61628478
70.21806077 69.47009154 67.71824577 64.58911369 63.98653636
61.75894589 57.1558711 54.17379175 50.53475054 50.08003639
49.38497398 49.1474643 48.05613337 47.37467689 47.21593097]
Topic 15:
train,shop,Railroad,station,transport,Spouse deduction,Ten thousand,Circle,prize,Larva,defendant,gene,Flood damage,medicine,Debris,Disposal,education,JR Freight,Practice,system
[ 209.45549658 172.75201923 164.79055902 147.02460723 146.34946295
122.11417714 116.53446688 113.36476153 110.00093014 101.51355757
101.49522834 93.61766945 90.44254789 90.21005366 86.14087176
85.94118974 85.87426669 83.81989617 81.4445114 81.32144707]
Topic 16:
Song,Written,Mr.,Written品,Event,stage,Release,Performance,live,Hmm,release,Appearance,album,fan,Show off,Release,Held,think,Recording,Venue
[ 717.72208948 701.88650132 675.57536789 653.80324063 630.25795307
623.56413175 593.77778162 570.85401227 542.29065168 530.72760902
527.34422729 504.12104195 477.59137972 477.00323092 449.362484
433.71529537 424.21385561 415.6621296 413.39032883 408.44365814]
Topic 17:
Immigrants,Mr,589 Croatia,action,Prime Minister,Byakkotai,Send,Stoker regulation law,Kwon Sang Woo,Germany,Tsukimito,turn,Bull,Border,Abbott,Leaders,Hungary,Et al.,In the jurisdiction,e-mail
[ 164.44142649 157.91328715 138.76814858 132.5043004 125.07620334
114.82154418 112.98085344 108.36476034 100.36013718 99.44524733
95.72254509 91.79868319 89.07727008 83.49107233 81.37738585
78.16457362 77.45463275 77.03517754 75.47489877 74.73847572]
Topic 18:
%,beer,Billion,Ten thousand,Raleigh,Liquor tax,Increase,Previous year,Rank,Decrease,Circle,Investigation,For,Company,ratio,service,market,Books,POSCO,Trillion
[ 580.21824689 434.53747304 337.23060498 322.90011084 275.51253012
255.35439791 202.94575502 195.40863404 193.2023368 188.88153369
188.32713027 185.3074174 182.46872612 180.38548978 168.37490369
159.71109053 159.65702647 155.00164055 150.38902564 149.40071569]
Topic 19:
Okamura,Give,Ishibashi,Positive,Polarization,Okamura隆史,Break dance,Criminal,sunglasses,Touch panel,you,refugees,lead,Home party,receiving,Father,pharmacist,Basilica,pharmacy,三菱lead筆
[ 77.67608384 65.66168235 62.59137271 61.50991922 50.18323397
44.41180978 43.50803013 41.09367176 40.73945738 38.9101876
37.57614659 36.56843092 35.85623378 35.81638016 34.10640826
33.81327369 32.32619825 31.22516758 31.12976321 30.34057197]
Topic 20:
Briquettes,Negotiation,Akiko Wada,Okinawa,sweet sake,Human resources,of,Avigan,Labor regulations,Obon ball,Meeting,Theme park,Key,serendipity,New Year's present,USJ,PIN,cell,Minister,convenience store
[ 200.98686962 154.40963453 106.75322346 102.73754422 100.48163455
98.9612829 94.85889131 93.31730072 93.30796905 93.27433467
92.84230214 89.15912225 87.60003563 86.13875558 86.09579478
81.48415665 81.37494046 81.10648568 75.53083854 74.76190319]
Topic 21:
18th,Coast,Japan Meteorological Agency,Shin-Hakodate Hokuto,Shinkansen,round trip,Opening,island,Rubber,2015,Hokkaido,Hawaii,First visit,父island,SAKANAMON,VAMPS,Appearance,Presentation,3M,Observation
[ 326.61966201 176.18179227 162.70899568 137.89819305 135.61061726
131.91446936 127.87583916 123.18162869 119.46292987 114.89846676
113.33026617 108.85661384 96.44435409 94.0825422 93.31173974
92.48630364 90.34013265 89.33794268 89.00557891 88.60743728]
Topic 22:
Kiritani,RC,Sakaguchi,MT,Bird,Greece,Heroine disqualification,Star Wars,Yamazaki,musics,Freeze,Hiromitsu,AWA,Nebuta,Original,OB,T cells,Mr. M,Evacuation,Park Sol Mi
[ 242.08396669 233.61062923 172.28879872 158.02400752 156.16092615
149.65020403 145.38706775 143.01353797 123.89388685 107.61948489
105.20201675 104.23176854 103.93186096 101.57317097 101.33211206
98.35838535 93.31294228 81.26331036 78.87903503 77.78473071]
Topic 23:
Tax accountant,Circle,Ten thousand,Declaration,export,amount,tax,Rank,My number,Office,income,system,%,Philippine,If,Billion,Electric power,Company,Home electronics mass retailer,thing
[ 670.6061898 559.30722115 395.94196364 369.03793975 352.9802148
350.59584008 348.81817142 345.42194256 281.01115977 270.7837518
268.64882097 263.68902183 256.54739477 233.11666127 228.29591629
224.91966604 208.54269702 206.95435942 201.05969014 199.71772628]
Topic 24:
of,Constitution,Company,thing,Japan,Person,Nation,Yuichi Kimura,lawyer,it can,business,design,development of,Yo,think,is there,Say,power,sex,Think
[ 371.66961434 337.03124549 319.99104269 319.594891 309.51245673
287.52866308 271.19087899 267.75333312 261.60521555 256.02307667
251.18894465 239.58136963 238.33242359 238.07787656 233.68552111
231.93864718 213.6720825 207.06572415 206.83553817 206.39025416]
Topic 25:
damage,Billion,Store,operation,Circle,Fee,passenger,Helicopter,0,Increase,Agriculture,Decrease,Previous year,AKB48,Miyagi,Opening a store,Prefecture,shop,Ten thousand,while
[ 322.28929388 284.37384142 264.46206604 248.44913769 226.60800063
226.41660568 212.16654388 205.88384117 189.18011081 173.35857685
170.73582962 170.16262181 167.13947269 166.91143061 165.98762565
164.64467713 157.49179255 153.26181924 149.68685887 145.6529475 ]
Topic 26:
Rate hike,China,market,Economy,USA,Dollar,km,%,Ahn,Rise,Machine,interest rate,Business,stock,Outlook,Fall,Circle,investment,rate,Korea
[ 711.44316161 691.81953214 624.21582824 603.1447681 464.88853934
444.72254696 425.1654548 400.24353915 398.08670081 384.38514657
378.64702088 364.08566045 354.84095879 354.60928052 346.69708409
337.14563576 335.09073391 331.251988 328.37760334 316.68760744]
Topic 27:
movies,directed by,play,Hmm,I,Actor,Role,age,Written,Antman,さHmm,Appearance,jobs,stage,Drama,photograph,Book,actress,think,thing
[ 886.18859913 521.81885818 517.66295551 341.28837968 323.889684
320.54609403 318.78269341 305.49616021 292.69106111 291.83105713
283.59914761 271.24734272 271.03094368 266.13209765 257.9348965
252.86535054 245.73361042 241.71909116 225.00245517 222.13685278]
Topic 28:
game,it can,powered by,Recruitment,development of,of,thing,Yuichi Kimura,for,Stand,Mr,Refund,利for,To,China,sex,Yo,Product,Become,smartphone
[ 453.00001367 302.95432162 283.96542019 280.46414245 257.18675974
254.89400232 246.43778386 219.71661031 217.78910865 214.12011552
212.66757085 211.03349157 205.35032129 203.34111497 197.81430578
193.73396761 193.32616187 190.05730112 189.02413711 187.26200727]
Topic 29:
Asakusa,Others,comedy,Hikari Club,Lychee,Canon,Nakajo,Etc.,Taylor,Film festival,Takeshi,Town,Tsugaru shamisen,I,Performance,Taitung,Joe Hisaishi,charging,Takarazuka,JR Kyushu
[ 170.36663986 156.09380245 132.93872491 127.17520086 127.13453875
112.71315236 110.24371137 107.89145147 106.67342349 102.47261177
99.54801093 93.6074624 90.90080501 85.36814206 79.75410095
79.31855725 78.95649479 76.60922126 74.76350455 74.69475118]
Some issues remain, such as how to choose the number of topics k and how to train on plain word counts instead of Tf-Idf. If you have any knowledge about these, I would be grateful for your advice.
References:

- Spark 1.5.0 Machine Learning Library (MLlib) Guide http://spark.apache.org/docs/latest/mllib-guide.html
- MLlib - Clustering: Latent Dirichlet allocation (LDA) http://spark.apache.org/docs/latest/mllib-clustering.html#latent-dirichlet-allocation-lda