Once you have implemented a decision tree (classification tree), how can you check that the model classifies a dataset correctly? One way is to generate a CSV dataset yourself and load it into the model. Because you can build arbitrary features (tendencies) into the CSV when you create it, if the model recovers those features you can conclude that it is working properly.
Occupation codes: 1: Office worker, 2: Company officer, 3: Self-employed, 4: Freelance, 5: Student, 6: Housewife, 7: Unemployed, 8: Other
For example, I want to generate the following data set.
In this example, the customer on the first line is a 55-year-old married man living in Hyogo Prefecture, an office worker whose hobby is motorcycles, and who bought the product.
The customer on the second line is a 31-year-old unmarried woman living in Wakayama Prefecture, an office worker whose hobby is automobiles, and who has not purchased any products.
The left and right tables are the same data set. The table on the left shows each value in Japanese. The table on the right shows each value numerically.
Suppose you have such a customer list and want to know the characteristics (trends) of the people who bought the product. Which gender buys more? Is there a difference depending on the area where the customer lives? Are age or occupation relevant? Knowing the tendency of "what kind of person bought it?" is useful for marketing measures. For example, when sending a new direct mail, in order to maximize the response rate, should you limit the destination area? Should you narrow the targets down by age or gender? Such decisions become easier to make.
import random
import csv
##################################################
#Define master data.
##################################################
def getMstHobby():
    """Return the hobby master list.

    Index 0 holds the column label ('hobby'); indices 1-7 hold the hobby
    names, so a hobby code can be used directly as a list index.

    Returns:
        A new list of 8 strings on every call.
    """
    return [
        'hobby',
        'reading',
        'movies',
        'musics',
        'Outdoor',
        'Automobile',
        'motorcycle',
        'Video game',
    ]
def getMstJob():
    """Return the occupation master list.

    Index 0 holds the column label ('Profession'); indices 1-8 hold the
    occupation names, so an occupation code can be used directly as a list
    index.

    Returns:
        A new list of 9 strings on every call.
    """
    return [
        'Profession',
        'employee',
        'Company executive',
        'self employed',
        'Freelance',
        'student',
        'housewife',
        'Unemployed',
        'Other',
    ]
def getMstGender():
    """Return the gender master list.

    The list index is the gender code: 0 is 'Female', 1 is 'male'.

    Returns:
        A new list of 2 strings on every call.
    """
    return ['Female', 'male']
def getMstMarriage():
    """Return the marital-status master list.

    The list index is the marriage code: 0 is 'Unmarried', 1 is 'married'.

    Returns:
        A new list of 2 strings on every call.
    """
    return ['Unmarried', 'married']
def getMstPref():
    """Return the prefecture master list.

    Index 0 holds the column label ('Prefectures'); indices 1-47 hold the
    prefecture names, so a prefecture number can be used directly as a list
    index (1 is Hokkaido, 13 is Tokyo, 47 is Okinawa).

    The labels are emitted verbatim into the generated CSV, so their exact
    spelling and capitalisation are kept as they are.

    Returns:
        A new list of 48 strings on every call.
    """
    return [
        'Prefectures',
        'Hokkaido',
        'Aomori Prefecture',
        'Iwate Prefecture',
        'Miyagi Prefecture',
        'Akita',
        'Yamagata Prefecture',
        'Fukushima Prefecture',
        'Ibaraki Prefecture',
        'Tochigi Prefecture',
        'Gunma Prefecture',
        'Saitama',
        'Chiba',
        'Tokyo',
        'Kanagawa Prefecture',
        'Niigata Prefecture',
        'Toyama Prefecture',
        'Ishikawa Prefecture',
        'Fukui prefecture',
        'Yamanashi Prefecture',
        'Nagano Prefecture',
        'Gifu Prefecture',
        'Shizuoka Prefecture',
        'Aichi prefecture',
        'Mie Prefecture',
        'Shiga Prefecture',
        'Kyoto',
        'Osaka',
        'Hyogo prefecture',
        'Nara Prefecture',
        'Wakayama Prefecture',
        'Tottori prefecture',
        'Shimane Prefecture',
        'Okayama Prefecture',
        'Hiroshima Prefecture',
        'Yamaguchi Prefecture',
        'Tokushima Prefecture',
        'Kagawa Prefecture',
        'Ehime Prefecture',
        'Kochi Prefecture',
        'Fukuoka Prefecture',
        'Saga Prefecture',
        'Nagasaki Prefecture',
        'Kumamoto Prefecture',
        'Oita Prefecture',
        'Miyazaki prefecture',
        'Kagoshima prefecture',
        'Okinawa Prefecture',
    ]
##################################################
#Receive the list and output it to a csv file.
##################################################
def outputCsv(fileName, listData):
    """Write a list of rows to a CSV file.

    Args:
        fileName: Path of the file to create (an existing file is
            overwritten).
        listData: Iterable of rows; each row is an iterable of field values.

    Rows are terminated with '\\n'.
    """
    # newline='' is what the csv module expects of its file object: it stops
    # the text layer from translating the writer's line terminator.
    # The with-block guarantees the file is closed even if writing fails.
    with open(fileName, 'w', newline='') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerows(listData)
##################################################
#Set the characteristics of the dataset.
##################################################
def setFeatures(gender, age, marriage, pref, job, hobby):
    """Decide the objective variable y for one generated customer.

    The rules below are the "answer" deliberately built into the dataset;
    edit them to give the dataset a different tendency.

    Args:
        gender: Gender label ('Female' or 'male').
        age: Age as a number.
        marriage: Marital-status label ('Unmarried' or 'married').
        pref: Prefecture label (not used by the current rules).
        job: Occupation label.
        hobby: Hobby label (not used by the current rules).

    Returns:
        'yes' if any rule matches, otherwise 'no'.
    """
    # Rule 1: male employees aged 40 or over buy.
    if gender == 'male' and age >= 40 and job == 'employee':
        return 'yes'
    # Rule 2: women aged 29 or under buy.
    if gender == 'Female' and age <= 29:
        return 'yes'
    # Rule 3: married employees buy, regardless of gender and age.
    if marriage == 'married' and job == 'employee':
        return 'yes'
    return 'no'
##################################################
#Start processing.
##################################################
# Load the master lists once; codes generated below index into them.
mst_gender = getMstGender()
mst_pref = getMstPref()
mst_job = getMstJob()
mst_hobby = getMstHobby()
mst_marriage = getMstMarriage()

# Two parallel datasets with the same records: users_int keeps the numeric
# codes, users_label keeps the corresponding label strings.
users_label = []
users_int = []

# CSV file header (first row of each dataset).
csv_header_en = ['gender', 'age', 'marriage', 'pref', 'job', 'hobby', 'y']
csv_header_jp = ['sex', 'age', 'marriage', 'Prefectures', 'Profession', 'hobby', 'y']
users_int.append(csv_header_en)
users_label.append(csv_header_jp)

# The number of CSV records to create.
recordNum = 50000

# Generate the CSV records.
for _ in range(recordNum):
    ageInt = random.randint(20, 80)  # Minimum age and maximum age (inclusive)

    prefInt = random.randint(1, 47)  # Prefecture number: 1 is Hokkaido, 13 is Tokyo
    prefLabel = mst_pref[prefInt]

    genderInt = random.randint(0, 1)  # 0: Female, 1: male
    genderLabel = mst_gender[genderInt]

    marriageInt = random.randint(0, 1)  # 0: Unmarried, 1: married
    marriageLabel = mst_marriage[marriageInt]

    jobInt = random.randint(1, 7)  # Occupations other than "Other" (index 8)
    jobLabel = mst_job[jobInt]

    hobbyInt = random.randint(1, 7)
    hobbyLabel = mst_hobby[hobbyInt]

    # The objective variable is derived from the labels, not drawn at random.
    y = setFeatures(genderLabel, ageInt, marriageLabel, prefLabel, jobLabel, hobbyLabel)

    dataInt = [genderInt, ageInt, marriageInt, prefInt, jobInt, hobbyInt, y]
    dataLabel = [genderLabel, ageInt, marriageLabel, prefLabel, jobLabel, hobbyLabel, y]
    users_int.append(dataInt)
    users_label.append(dataLabel)

# Output to CSV files (written to the current working directory).
outputCsv('out_int.csv', users_int)
outputCsv('out_label.csv', users_label)
First, customers are generated at random; then the objective variable y (bought or did not buy) is determined from the values of each generated column. By branching with conditional statements, the feature of "what kind of customer bought it?" is set.
The features are set in the function setFeatures. You can freely modify this function to give the dataset any tendency.
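In this implementation example, the dataset has the following features: a customer buys the product if they are (1) a male employee aged 40 or over, (2) a woman aged 29 or under, or (3) a married employee.
Real-life datasets don't fall into such a neat classification. If you add a step that returns yes at random even when the above conditions are not met, you can also mix in "customers who did not meet the conditions but bought".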
out_int.csv
gender,age,marriage,pref,job,hobby,y
0,22,1,45,7,7,yes
1,20,1,4,3,6,no
0,64,1,18,6,7,no
0,44,0,29,1,4,no
0,69,0,18,2,5,no
0,49,1,20,7,1,no
0,40,1,41,7,4,no
out_label.csv
sex,age,marriage,Prefectures,Profession,hobby,y
Female,22,married,Miyazaki prefecture,Unemployed,Video game,yes
male,20,married,Miyagi Prefecture,self employed,motorcycle,no
Female,64,married,Fukui prefecture,housewife,Video game,no
Female,44,Unmarried,Nara Prefecture,employee,Outdoor,no
Female,69,Unmarried,Fukui prefecture,Company executive,Automobile,no
Female,49,married,Nagano Prefecture,Unemployed,reading,no
Female,40,married,Saga Prefecture,Unemployed,Outdoor,no
import pandas as pd
import numpy as np
import pydotplus
from sklearn.tree import export_graphviz
from sklearn.tree import DecisionTreeClassifier as DT
from IPython.display import Image
# Load the labelled dataset.
# NOTE(review): the generator script writes 'out_label.csv' to the current
# directory, while this reads from 'csv/' — place the file there (or adjust
# the path) before running.
train = pd.read_csv("csv/out_label.csv", delimiter=",")

# Objective variable and the six explanatory columns.
y = train["y"]
trainx = train.iloc[:, 0:6]

# One-hot encode the string columns so the tree can consume them.
trainxd = pd.get_dummies(trainx)

clf3 = DT(max_depth=20, min_samples_leaf=500)
clf3.fit(trainxd, y)

# Take the class names from the fitted model instead of hard-coding
# ["0", "1"], so the rendered nodes show the real labels in the order the
# classifier uses; pass feature names as a plain list of strings.
export_graphviz(
    clf3,
    out_file="tree_clf3.dot",
    feature_names=list(trainxd.columns),
    class_names=[str(c) for c in clf3.classes_],
    filled=True,
    rounded=True,
)

# Render the .dot file to PNG and display it (notebook cell output).
graph = pydotplus.graphviz.graph_from_dot_file('tree_clf3.dot')
Image(graph.create_png())
When checking the operation of a model, it is easier if you know in advance the characteristics (the answer) of the dataset you load into it, so that you can check "did it classify according to the answer?" If the model can classify the data, you can judge that the implementation is correct; if it cannot, you investigate the cause, such as whether the implementation is wrong or the parameters are not appropriate.
Recommended Posts