get_ranker_categories.py
import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait
import time
import csv
# Open Browser
options = Options()
# options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(executable_path="/Users/micksmith/home/work/eBay/Python/chromedriver", chrome_options=options)
dictionary = "/Users/micksmith/home/work/eBay/Python/word_dictionary.csv"
url_exchange = "/Users/micksmith/home/work/eBay/Python/url_exchange.csv"
def get_categories():
    # Collect category names from the Ranker navigation menu.
    categories_entire = []
    categories_entertainment = []
    categories_nerdy = []
    categories_channel = []
    items_entire = driver.find_elements_by_class_name('site__subItem')
    items_entertainment = driver.find_elements_by_class_name('-entertainment')
    items_nerdy = driver.find_elements_by_class_name('-nerdy')
    items_channel = driver.find_elements_by_class_name('-channels')
    #print(len(items))
    #items = set(items_entire) - set(items_channel)
    for item in items_entire:
        categories_entire.append(item.get_attribute("textContent"))
    for item in items_entertainment:
        contents = item.find_elements_by_class_name('site__subItem')
        for content in contents:
            categories_entertainment.append(content.get_attribute("textContent"))
    for item in items_nerdy:
        contents = item.find_elements_by_class_name('site__subItem')
        for content in contents:
            categories_nerdy.append(content.get_attribute("textContent"))
    for item in items_channel:
        #print(item.get_attribute("textContent"))
        contents = item.find_elements_by_class_name('site__subItem')
        for content in contents:
            categories_channel.append(content.get_attribute("textContent"))
    #print(categories_channel)
    #print(len(list(set(categories_entire) - set(categories_channel))))
    # print(categories_entire)
    # print(categories_entertainment)
    # print(categories_nerdy)
    # print(categories_channel)
    return categories_entire, categories_channel
    # return (list(set(categories_entire) - set(categories_channel)))
    # for item in items:
    #     categories = item.find_elements_by_class_name('site__subItem')
    #     for category in categories:
    #         print(category.get_attribute("textContent"))
    #     print(item.get_attribute("textContent"))
    #     categories = item.find_elements_by_class_name('site__subItem')
    #     for category in categories:
    #         print(category.text)
def exchange_words(word_dictionary):
    # Replace each scraped menu word with the word Ranker actually uses in its
    # URLs, based on the conversion table in word_dictionary.csv.
    print("word_dictionary:", word_dictionary)
    word_before = []
    word_after = []
    word_results = []
    with open(dictionary, 'r') as f:
        reader = csv.reader(f)
        for row in reader:
            word_before.append(row[0])
            word_after.append(row[1])
    for word in word_dictionary:
        for num in range(len(word_before)):
            if word == word_before[num]:
                word = word_after[num]
                print(word)
        word_results.append(word)
    return word_results
if __name__ == "__main__":
url = "https://www.ranker.com/"
driver.get(url)
entire, channel = get_categories()
items = []
list_of = []
lists = []
tags = []
# list_of = entertainment + nerdy
# tags = list(set(item) - set(list_of))
# list_of = exchange_words(list_of)
# tags = exchange_words(tags)
items = exchange_words(list(set(entire) - set(channel)))
word_list_of = []
word_lists = []
with open(url_exchange, 'r') as f:
reader = csv.reader(f)
for row in reader:
word_list_of.append(row[0])
word_lists.append(row[1])
for item in items:
for i in range(len(word_list_of)):
if(item == word_list_of[i]):
list_of.append(word_list_of[i])
elif(item == word_lists[i]):
lists.append(word_lists[i])
tags = set(items) - set((list_of + lists))
# print("list_of:", list_of)
# print("lists:", lists)
# print("tags:", tags)
# # exchange
for item in list_of:
item = item.replace(" ","-")
url = "https://www.ranker.com/list-of/" + item + "?ref=mainnav"
driver.get(url)
print("URL:", url)
time.sleep(5)
for item in lists:
item = item.replace(" ","-")
url = "https://www.ranker.com/lists/" + item + "?ref=mainnav"
driver.get(url)
print("URL:", url)
time.sleep(5)
for item in tags:
item = item.replace(" ","-")
url = "https://www.ranker.com/tags/" + item + "?ref=mainnav"
driver.get(url)
print("URL:", url)
time.sleep(5)
# print(item)
# print(list_of)
# print(tags)
# for category in categories:
# category = category.replace(" ","-")
# url = "https://www.ranker.com/list-of/" + category + "?ref=mainnav"
# #print("URL:", url)
# #driver.get(url)
# time.sleep(3)
# df.columns = ["Title_Eng","Page_Num","MIN_Price","MAX_Price"]
# df.to_csv(Source_file, index=False)
driver.quit()
url_exchange.csv
"film","albums"
"tv","beverages"
"comics",""
"tech",""
"science",""
"cars",""
"arts",""
"books",""
word_dictionary.csv
"movies","film"
"celebrity","celebrities"
"watchworthy","what to watch"
"anime","anime underground"
"cartoons","animated"
"athletes","best athletes"
"family","parenting"
"career","jobs"
"automotive","cars"
"art","arts"
"deep thoughts","thought provoking"
"libations","alcohol"
"healthy eating","dieting"
Demo
Command
URL: https://www.ranker.com/list-of/arts?ref=mainnav
URL: https://www.ranker.com/list-of/tech?ref=mainnav
URL: https://www.ranker.com/list-of/tv?ref=mainnav
…
URL: https://www.ranker.com/tags/college-sports?ref=mainnav
Since the "hogehoge" of each URL is not uniform, it is classified and corresponded independently (url_exchange.csv).
https://www.ranker.com/hogehoge/category?ref=mainnav
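For illustration only, here is a minimal sketch of that lookup, assuming a hypothetical helper name build_category_url and the url_exchange.csv layout shown above (column 1 holds the /list-of/ categories, column 2 the /lists/ categories, and anything missing from the table falls back to /tags/):

import csv

def build_category_url(category, exchange_csv="url_exchange.csv"):
    # Hypothetical helper: choose the URL path segment (the "hogehoge" part)
    # for one category, using url_exchange.csv as in the main script above.
    list_of_words, lists_words = [], []
    with open(exchange_csv, 'r') as f:
        for row in csv.reader(f):
            list_of_words.append(row[0])
            lists_words.append(row[1])
    if category in list_of_words:
        segment = "list-of"
    elif category in lists_words:
        segment = "lists"
    else:
        segment = "tags"
    return "https://www.ranker.com/" + segment + "/" + category.replace(" ", "-") + "?ref=mainnav"

# build_category_url("film")   -> https://www.ranker.com/list-of/film?ref=mainnav
# build_category_url("albums") -> https://www.ranker.com/lists/albums?ref=mainnav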
Since the "category" of each URL may differ from the acquired word, create your own conversion table (word_dictionary.csv) to handle it.
https://www.ranker.com/hogehoge/category?ref=mainnav
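As a minimal sketch of that conversion, assuming a hypothetical helper name convert_word and the word_dictionary.csv layout shown above (column 1 is the scraped word, column 2 is the word used in the URL):

import csv

def convert_word(word, dictionary_csv="word_dictionary.csv"):
    # Hypothetical helper: translate a word scraped from the navigation menu
    # into the word Ranker uses in its URLs (e.g. "movies" -> "film").
    with open(dictionary_csv, 'r') as f:
        for before, after in csv.reader(f):
            if word == before:
                return after
    return word  # words not listed in the table are used unchanged

# convert_word("movies")     -> "film"
# convert_word("automotive") -> "cars"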