On teratail (https://teratail.com), a Q&A site I often rely on, some questions have been left unanswered for years. I wondered which categories (tags) these unanswered questions concentrate in, so I decided to find out with a bit of scraping.
One thing I noticed is that the pagination always stops at page 500: even if you click the "Next" button, the same 500th page is displayed again — an endless loop. So I will scrape in a way that avoids it.
This time I will use selenium with Python 3.7.
--Access the unanswered questions URL "https://teratail.com/feed/not-answered/" (the code below actually uses the equivalent search URL).
No_answered_Tags.py
def main():
    """Scrape teratail's "not answered" search pages and print a tag-frequency table.

    Walks every results page (following the "Following page" link), counts how
    often each tag link appears, then prints a pandas DataFrame sorted by count.
    """
    import pandas as pd
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.common.exceptions import TimeoutException

    options = Options()
    options.add_argument('--headless')  # run Chrome without opening a window
    # NOTE(review): `chrome_options` is deprecated in newer Selenium (use `options=`);
    # kept as-is for compatibility with the Selenium version used here.
    browser = webdriver.Chrome(executable_path='/Users/anatanonamae/Desktop/Tool/chromedriver', chrome_options=options)
    browser.implicitly_wait(3)  # implicit wait (seconds) for element lookups

    try:
        # Access the first page
        PAGE = 1
        InitURL = "https://teratail.com/search?tab=active&page=" + str(PAGE) + "&q=is%3Anot-answered"
        browser.get(InitURL)
        print("I accessed the first page")

        # Information gathering on each page
        TAG_DIC = {}
        while True:
            A_TAG = browser.find_elements_by_tag_name("a")  # collect <a> elements
            taglist = []
            for TAG in A_TAG:
                HREF = TAG.get_attribute('href')  # collect href
                if "tags" in str(HREF):  # keep only links whose href contains "tags"
                    if not TAG.text:  # skip blank link texts
                        continue
                    else:
                        taglist.append(TAG.text)
            for tag in taglist:
                if tag in TAG_DIC:
                    TAG_DIC[tag] += 1
                else:
                    TAG_DIC[tag] = 1
            NEXT_XPATH = browser.find_elements_by_xpath("//*[@id=\"mainContainer\"]/div[4]/div/p/a/span[contains(text(),'Following page')]")
            if NEXT_XPATH:  # a next page exists
                PAGE += 1
            else:
                print("Got tags at last page.")  # no next link: we are done
                break
            # BUG FIX: URL was never assigned before browser.get(URL), which
            # raised NameError on the first loop iteration.
            URL = "https://teratail.com/search?tab=active&page=" + str(PAGE) + "&q=is%3Anot-answered"
            browser.get(URL)  # go to the next page
            WebDriverWait(browser, 2).until(EC.presence_of_all_elements_located)
            print(browser.current_url)
            if browser.title == "Page Not Found":
                print("Got tags at last page.")  # next page 404s: stop scraping
                break

        # Post-processing: build the frequency table
        df = pd.DataFrame([TAG_DIC.keys(), TAG_DIC.values()], index=None).T  # two rows -> transpose into two columns
        df.rename(columns={0: "Tag", 1: "Count"}, inplace=True)  # name the columns
        df.sort_values(by=['Count'], ascending=False, inplace=True)  # most frequent first
        df.reset_index(drop=True, inplace=True)  # clean 0..n-1 index
        print(df)
    finally:
        # Always release the browser/driver process, even on errors.
        browser.quit()


if __name__ == "__main__":
    main()
selenium.py
# Selenium setup: launch a headless Chrome instance.
options = Options()#Selenium option settings
options.add_argument('--headless')#Don't open a visible browser window
browser = webdriver.Chrome(executable_path='/Users/anatanonamae/Desktop/Tool/chromedriver', chrome_options=options)#Start the driver with these options; NOTE(review): chrome_options is deprecated in newer Selenium (use options=)
browser.implicitly_wait(3)#Implicit wait (seconds) applied to every element lookup
access.py
#Access the first page of the not-answered search results
PAGE = 1
InitURL= "https://teratail.com/search?tab=active&page=" + str(PAGE) + "&q=is%3Anot-answered"
browser.get(InitURL)#Navigate with an HTTP GET
print("I accessed the first page")#browser.current_url also shows which page you are on
--Collect every anchor with find_elements_by_tag_name("a") and keep the ones whose href contains "tags".
WebDriverWait(browser, 2).until(EC.presence_of_all_elements_located)
works like sleep but is more powerful: it tells Selenium to wait until the page's elements have actually been loaded.
For more information: https://qiita.com/uguisuheiankyo/items/cec03891a86dfda12c9a
loop.py
# Tally tag occurrences across every search-result page.
TAG_DIC = {}
while True:
    # Every tag link on a results page is an <a> whose href contains "tags";
    # keep only those with non-empty link text.
    page_tags = []
    for anchor in browser.find_elements_by_tag_name("a"):
        href = anchor.get_attribute('href')
        if "tags" in str(href) and anchor.text:
            page_tags.append(anchor.text)
    for name in page_tags:
        TAG_DIC[name] = TAG_DIC.get(name, 0) + 1  # count every sighting of the tag
    # Search for an element containing the text "Following page" (the next-page link).
    next_links = browser.find_elements_by_xpath("//*[@id=\"mainContainer\"]/div[4]/div/p/a/span[contains(text(),'Following page')]")
    if not next_links:
        print("Got tags at last page.")  # no next link: we are done
        break
    PAGE += 1  # advance to the next page number
    URL = "https://teratail.com/search?tab=active&page=" + str(PAGE) + "&q=is%3Anot-answered"
    browser.get(URL)  # go to the next page
    WebDriverWait(browser, 2).until(EC.presence_of_all_elements_located)  # wait until the page's elements are present
    print(browser.current_url)
break.py
# NOTE: this fragment lives inside the `while True:` scraping loop above,
# right after navigating to the next page.
if browser.title == "Page Not Found":
    print("Got tags at last page.")#If the next page 404s, stop scraping
    break
The `.T` in `pd.DataFrame([TAG_DIC.keys(), TAG_DIC.values()], index=None).T` transposes the DataFrame, turning the two rows (keys, values) into two columns. Convenient.
pandas.py
# Post-processing: shape the tag tally into a sorted frequency table.
table = pd.DataFrame([TAG_DIC.keys(), TAG_DIC.values()], index=None).T  # two rows -> transpose into two columns
table = table.rename(columns={0: "Tag", 1: "Count"})       # name the transposed columns
table = table.sort_values(by=['Count'], ascending=False)   # most frequent tags first
df = table.reset_index(drop=True)                          # clean 0..n-1 index
print(df)
With that feeling, the result is ...
result.py
Got tags at last page.
Tag Count
0 PHP 3139
1 Python 2623
2 JavaScript 2428
3 Ruby 1974
4 Python 3.x 1762
5 WordPress 1563
・
・
[1369 rows x 2 columns]
・
・
A whopping 1369 rows came out. The cause was that the scraper picked up a lot of junk, such as "Tag list 501" entries and tags with a count of 1. Simply deleting rows with a count of 100 or less in post-processing was good enough for me. If you want a cleaner result, you could add a conditional branch so that excluded words are never registered in the dictionary in the first place.
That's all for this time.
Recommended Posts