I scraped text from some URLs in Ruby and extracted topics from it with LDA in Python.
Install the gem
$ bundle init
$ vim Gemfile
gem 'mechanize'
$ bundle install
Setup is OK if the following sample file runs.
sample.rb
require 'mechanize'
agent = Mechanize.new
search_page = agent.get('Appropriate URL')
search_page.search('body p').each do |y|
  p y.text
end
$ brew search mecab
mecab mecab-ipadic
$ brew install mecab mecab-ipadic
$ mecab
If the mecab prompt starts, the installation is OK.
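As a quick sanity check, you can also pipe a sentence straight into mecab; each morpheme comes back as the surface form, a tab, and comma-separated features (the exact features depend on the installed dictionary):

$ echo 'すもももももももものうち' | mecab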
natto is a gem that wraps mecab installed on your system.
Install the gem
$ bundle init
$ vim Gemfile
gem 'natto'
$ bundle install
In order to use natto, you need to specify an environment variable called MECAB_PATH.
$ find /usr/ -name "*mecab*" | grep dylib
$ export MECAB_PATH=/usr/local/Cellar/mecab/0.996/lib/libmecab.dylib
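Alternatively, the variable can be set from inside Ruby before natto is required; a minimal sketch, assuming the path printed by the find command above (adjust the mecab version to your install):

ENV['MECAB_PATH'] = '/usr/local/Cellar/mecab/0.996/lib/libmecab.dylib'
require 'natto'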
http://yatta47.hateblo.jp/entry/2015/12/13/150525 https://github.com/buruzaemon/natto
Setup is OK if the following sample file runs.
sample.rb
require 'natto'
text = 'すもももももももものうち' # the classic mecab sample sentence
nm = Natto::MeCab.new
nm.parse(text) do |n|
  puts "#{n.surface}\t#{n.feature}"
end
http://qiita.com/shizuma/items/d04facaa732f606f00ff http://d.hatena.ne.jp/otn/20090509
A proper stopword list should really be built, but that is omitted this time. Instead, we keep only general nouns, excluding pronouns and non-independent nouns. Note that the feature names mecab returns are Japanese:
cond1 = features.include?('名詞')    # noun
cond2 = features.include?('一般')    # general
cond3 = !features.include?('代名詞')  # pronoun
cond4 = !features.include?('非自立')  # non-independent
if cond1 && cond2 && cond3 && cond4
  # required processing
end
Data is exchanged between Python and Ruby using JSON. Specifically, prepare a csv that lists the URLs of the target pages as shown below, scrape each of them, and convert the result into the data structure LDA requires.
| url |
|---|
| URL1 |
| URL2 |
| ... |
| URLN |
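Concretely, the csv is just a header row followed by one URL per line; the URLs below are placeholders:

url
https://example.com/page1
https://example.com/page2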
Finally, the following array, holding the words of each document, is generated and output as JSON.
[
['human', 'interface', 'computer'],
['survey', 'user', 'computer', 'system', 'response', 'time'],
['eps', 'user', 'interface', 'system'],
['system', 'human', 'system', 'eps'],
['user', 'response', 'time'],
['trees'],
['graph', 'trees'],
['graph', 'minors', 'trees'],
['graph', 'minors', 'survey']
]
http://tohka383.hatenablog.jp/entry/20111205/1323071336 http://peaceandhilightandpython.hatenablog.com/entry/2013/12/06/082106
Gemfile
gem 'mechanize'
gem 'natto'
# A class that generates an array of URLs from csv
class UrlGetService
  require 'csv'

  def initialize(csv_path)
    @csv_path = csv_path
  end

  def web_urls
    @web_urls ||= -> do
      rows = []
      csv_file.each_with_index do |row, index|
        unless index == 0
          rows << row[0]
        end
      end
      rows
    end.call
  end

  private

  attr_reader :csv_path

  def csv_file
    @csv_file ||= -> do
      csv_text = File.read(csv_path)
      CSV.parse(csv_text)
    end.call
  end
end
# A class that scrapes a given URL
class WebScrapingService
  require 'mechanize'

  def initialize(url)
    @url = url
  end

  def texts
    @texts ||= -> do
      texts = ''
      page_contents.each do |content|
        texts += content.text
      end
      texts
    end.call
  end

  private

  attr_reader :url

  def page_contents
    @page_contents ||= scraping_agent.get(url).search('body p')
  end

  def scraping_agent
    @scraping_agent ||= Mechanize.new
  end
end
# A class that morphologically parses scraping results and creates an array of words
class MorphologicalAnalysisService
  # Backticks would only set the variable in a throw-away subshell,
  # so set MECAB_PATH in this process, before natto is required
  # (adjust the path to your mecab install)
  ENV['MECAB_PATH'] = '/usr/local/Cellar/mecab/0.996/lib/libmecab.dylib'
  require 'natto'

  def initialize(texts)
    @texts = texts
  end

  def words
    words = []
    morphological_analysis_agent.parse(texts) do |word|
      features = word.feature.split(/,/)
      cond1 = features.include?('名詞')    # noun
      cond2 = features.include?('一般')    # general
      cond3 = !features.include?('代名詞')  # pronoun
      cond4 = !features.include?('非自立')  # non-independent
      if cond1 && cond2 && cond3 && cond4
        words << word.surface
      end
    end
    words
  end

  private

  attr_reader :texts

  def morphological_analysis_agent
    @morphological_analysis_agent ||= Natto::MeCab.new
  end
end
# A class that dumps JSON using the three classes above
class DictionaryOutputService
  require 'json'

  def initialize(csv_path)
    @csv_path = csv_path
  end

  def output_json
    open('sample.json', 'w') do |f|
      JSON.dump(words_array, f)
    end
  end

  private

  attr_reader :csv_path

  def words_array
    @words_array ||= -> do
      web_urls.each_with_object([]) do |url, arr|
        texts = WebScrapingService.new(url).texts
        words = MorphologicalAnalysisService.new(texts).words
        # keep only words that appear more than once in the document
        white_lists = words.inject(Hash.new(0)) { |h, a| h[a] += 1; h }.select { |_, c| c > 1 }.map { |w, _| w }
        arr << words.select { |w| white_lists.include?(w) }
      end
    end.call
  end

  def web_urls
    UrlGetService.new(csv_path).web_urls
  end
end
# Execute as follows
csv_path = "YOUR_CSV_PATH/file_name.csv"
DictionaryOutputService.new(csv_path).output_json
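If the script above is saved as sample.rb (the file name is only an assumption), it can be run through bundler, and sample.json appears in the working directory:

$ bundle exec ruby sample.rb

The file then holds the array-of-word-arrays shown earlier, which the Python side reads.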
Instead of using the system python as is, install and use a version-managed python via pyenv.
git clone https://github.com/yyuu/pyenv.git ~/.pyenv
~/.bashrc
export PYENV_ROOT=$HOME/.pyenv
export PATH=$PYENV_ROOT/bin:$PATH
eval "$(pyenv init -)"
With the 3.5 series, installing gensim does not fail.
source ~/.bashrc
pyenv install 3.5.0
pyenv shell 3.5.0
http://qiita.com/Kodaira_/items/feadfef9add468e3a85b
To do LDA with python, use a module called gensim. setuptools is required to install gensim.
sudo easy_install -U setuptools
Install gensim. Also update dependent tools such as numpy.
sudo -H pip install gensim -U
lda.py
from gensim import models, corpora

if __name__ == '__main__':
    # Originally, this texts would be read from a JSON file etc.
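    # For example, the sample.json dumped by the Ruby script could be
    # loaded like this (left commented out so the literal below is used):
    # import json
    # with open('sample.json') as f:
    #     texts = json.load(f)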
    texts = [['human', 'interface', 'computer'],
             ['survey', 'user', 'computer', 'system', 'response', 'time'],
             ['eps', 'user', 'interface', 'system'],
             ['system', 'human', 'system', 'eps'],
             ['user', 'response', 'time'],
             ['trees'],
             ['graph', 'trees'],
             ['graph', 'minors', 'trees'],
             ['graph', 'minors', 'survey']]

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lda = models.ldamodel.LdaModel(corpus=corpus, num_topics=20, id2word=dictionary)

    # Topics
    for topic in lda.show_topics(-1):
        print('topic')
        print(topic)

    # Topic of each document
    for topics_per_document in lda[corpus]:
        print('topic of each document')
        print(topics_per_document)
https://radimrehurek.com/gensim/tut1.html#corpus-formats https://openbook4.me/projects/193/sections/1154 http://sucrose.hatenablog.com/entry/2013/10/29/001041
For reference, the same topic extraction can also be done in R with the lda package.
# Required packages
install.packages("lda")
install.packages("ggplot2")
install.packages("reshape2")
library(lda)
# Sample data bundled with the lda package
data(cora.documents)
data(cora.vocab)
# Number of topics
K <- 10
# Run the collapsed Gibbs sampler
result <- lda.collapsed.gibbs.sampler(cora.documents,
                                      K,    # number of topics
                                      cora.vocab,
                                      25,   # number of sampling iterations
                                      0.1,  # hyperparameter α
                                      0.1,  # hyperparameter β
                                      compute.log.likelihood=TRUE)
# Top 5 frequent words per topic
top.words <- top.topic.words(result$topics, 5, by.score=TRUE)
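ggplot2 and reshape2 were installed above but not yet used; one possible use (my sketch, not part of the original) is plotting the log-likelihood trace that compute.log.likelihood=TRUE makes the sampler return:

# result$log.likelihoods is a matrix with 2 rows and one column per iteration
library(ggplot2)
library(reshape2)
ll <- melt(result$log.likelihoods)
names(ll) <- c("series", "iteration", "log.likelihood")
ggplot(ll, aes(x=iteration, y=log.likelihood, colour=factor(series))) + geom_line()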