transform.py
import re
import pandas as pd
import numpy as np
import codecs
import matplotlib.pyplot as plt
from sklearn import linear_model
# Load the two Shift-JIS encoded CSV files and turn their formatted number
# columns into real numeric columns.
#
# codecs.open(..., "ignore") skips bytes that cannot be decoded instead of
# raising. See http://qiita.com/niwaringo/items/d2a30e04e08da8eaa643
# NOTE: '<Car data>' is a placeholder -- replace it with the path of the car
# data CSV before running.
with codecs.open('<Car data>', "r", "Shift-JIS", "ignore") as fh:
    df = pd.read_csv(fh)

dfx = df.copy()
# Columns from position 5 onward: remove every ',' and '-' and convert to
# numbers; cells that still cannot be parsed become NaN.
# - DataFrame.convert_objects() no longer exists in pandas, so use
#   pd.to_numeric(errors='coerce'), which coerces the same way.
# - replace(regex=True) only touches string cells, so columns that were
#   already parsed as numbers (or contain NaN) no longer raise TypeError as
#   re.sub() did.
# - Assigning by column label replaces the columns, so they really take the
#   numeric dtype (assigning through .iloc may keep the old object dtype).
car_numeric_cols = dfx.columns[5:]
dfx[car_numeric_cols] = (
    dfx[car_numeric_cols]
    .replace(r'[,-]', '', regex=True)
    .apply(pd.to_numeric, errors='coerce')
)

with codecs.open('Resident area.csv', "r", "Shift-JIS", "ignore") as fh:
    df2 = pd.read_csv(fh)

# Same clean-up for the area table, whose number columns start at position 3.
area_numeric_cols = df2.columns[3:]
df2[area_numeric_cols] = (
    df2[area_numeric_cols]
    .replace(r'[,-]', '', regex=True)
    .apply(pd.to_numeric, errors='coerce')
)
# Empty results table that the per-prefecture scores are appended to.
# A single all-NaN row is used to declare the columns and is then dropped
# again, which leaves a frame with the column names but no rows.
nan = float('nan')
_RESULT_COLUMNS = (
    'population_signal',
    'area_signal',
    'Prefectures',
    'Automobile_signal',
    'Resident area_signal',
    'Composite function_population_area',
    'Composite function_Automobile_area',
    'Composite function_population_Resident area',
    'Composite function_Automobile_Resident area',
)
static_data = pd.DataFrame(
    {column: [nan] for column in _RESULT_COLUMNS}
).dropna()
# For every prefecture code 1..47: join the area table with the car table on
# the municipality name, derive candidate explanatory variables, fit one
# no-intercept linear regression per variable against the estimated signal
# count, and record each model's score (R^2). All rows are written to
# static_data.csv at the end.
#
# NOTE(review): the column labels must match the CSV headers exactly. The
# original snippet spelled the total-area column two ways ('総area' and
# 'Total area') and the population column as 'population総数'; they are
# unified here -- confirm both labels against the real files.
TOTAL_AREA_COL = 'Total area'
POPULATION_COL = 'Total population'
# Municipality names ending in city / Total / town / village.
MUNICIPALITY_SUFFIX = r'(?:city|Total|town|village)$'

prefecture_rows = []
for iteritem in range(1, 48):
    iter_shape = df2[df2['Prefecture code'] == iteritem]
    # na=False: rows with a missing name or business type never match.
    iter_data = dfx[
        (dfx['Prefecture code'] == iteritem)
        & dfx['Municipality'].str.contains(MUNICIPALITY_SUFFIX, na=False)
        & dfx['Business type'].str.contains('^Total$', na=False)
    ]

    # Rewrite the names before merging on 'Municipality':
    # a trailing 'City total' becomes 'city', and everything up to and
    # including 'county' is removed.
    iter_data2 = iter_data.copy()
    iter_data2['Municipality'] = (
        iter_data2['Municipality']
        .str.replace(r'City total$', 'city', regex=True)
        .str.replace(r'^.*county', '', regex=True)
    )

    merged = pd.merge(iter_shape, iter_data2, on='Municipality')
    if merged.empty:
        # No municipality matched for this code: there is nothing to fit and
        # no prefecture name to report, so skip instead of crashing.
        continue

    cars = merged['Total total']
    population = merged[POPULATION_COL]
    total_area = merged[TOTAL_AREA_COL]
    resident_area = merged['Resident area']

    # The new column names contain spaces, so they cannot be written as
    # keyword arguments; pass them to assign() as an unpacked dict instead.
    merged = merged.assign(**{
        'Composite function_population': np.nan,
        'Composite function_Automobile_area':
            (cars ** (2 / 3)) * (total_area ** (1 / 3)),
        'Composite function_population_area':
            (population ** (2 / 3)) * (total_area ** (1 / 3)),
        'Composite function_Automobile_Resident area':
            (cars ** (2 / 3)) * (resident_area ** (1 / 3)),
        'Composite function_population_Resident area':
            (population ** (2 / 3)) * (resident_area ** (1 / 3)),
        'Area square root': np.sqrt(total_area),
        'Resident area square root': np.sqrt(resident_area),
        # np.float was removed from NumPy; the builtin float is equivalent.
        'Signal estimation': np.around(0.0027 * cars.astype(float), 0),
    })

    # One estimator per explanatory variable; fit_intercept=False forces each
    # regression line through the origin.
    people_signal = linear_model.LinearRegression(fit_intercept=False)
    car_signal = linear_model.LinearRegression(fit_intercept=False)
    shape_signal = linear_model.LinearRegression(fit_intercept=False)
    liveshape_signal = linear_model.LinearRegression(fit_intercept=False)
    people_shape = linear_model.LinearRegression(fit_intercept=False)
    car_shape = linear_model.LinearRegression(fit_intercept=False)
    people_liveshape = linear_model.LinearRegression(fit_intercept=False)
    car_liveshape = linear_model.LinearRegression(fit_intercept=False)

    # reshape(-1, 1) turns each 1-D column into the (n, 1) layout that
    # LinearRegression.fit() is given below.
    # The population feature now reads the population column; the original
    # read the total-area column here, so 'population_signal' was really
    # scoring area.
    people = np.array(merged[POPULATION_COL]).reshape(-1, 1)
    car = np.array(merged['Total total']).reshape(-1, 1)
    shape = np.array(merged['Area square root']).reshape(-1, 1)
    liveshape = np.array(merged['Resident area square root']).reshape(-1, 1)
    peopleShape = np.array(
        merged['Composite function_population_area']).reshape(-1, 1)
    carShape = np.array(
        merged['Composite function_Automobile_area']).reshape(-1, 1)
    peopleLiveShape = np.array(
        merged['Composite function_population_Resident area']).reshape(-1, 1)
    carLiveShape = np.array(
        merged['Composite function_Automobile_Resident area']).reshape(-1, 1)
    y_data = np.array(merged['Signal estimation']).reshape(-1, 1)

    people_signal.fit(people, y_data)
    car_signal.fit(car, y_data)
    shape_signal.fit(shape, y_data)
    liveshape_signal.fit(liveshape, y_data)
    people_shape.fit(peopleShape, y_data)
    car_shape.fit(carShape, y_data)
    people_liveshape.fit(peopleLiveShape, y_data)
    car_liveshape.fit(carLiveShape, y_data)

    df_result = pd.DataFrame({
        'population_signal': [people_signal.score(people, y_data)],
        'area_signal': [shape_signal.score(shape, y_data)],
        # .iloc[0] takes the first row by position.
        'Prefectures': [merged['Prefectures'].iloc[0]],
        'Automobile_signal': [car_signal.score(car, y_data)],
        'Resident area_signal': [liveshape_signal.score(liveshape, y_data)],
        'Composite function_population_area':
            [people_shape.score(peopleShape, y_data)],
        'Composite function_Automobile_area':
            [car_shape.score(carShape, y_data)],
        'Composite function_population_Resident area':
            [people_liveshape.score(peopleLiveShape, y_data)],
        'Composite function_Automobile_Resident area':
            [car_liveshape.score(carLiveShape, y_data)],
    })
    prefecture_rows.append(df_result)

# DataFrame.append() was removed in pandas 2.0; concatenate once after the
# loop instead of growing the frame on every iteration.
static_data = pd.concat([static_data] + prefecture_rows)
static_data.to_csv('static_data.csv', encoding='shift_jis')
# Reload the car data (Shift-JIS; undecodable bytes are ignored).
# See http://qiita.com/niwaringo/items/d2a30e04e08da8eaa643
# NOTE: '<Car data>' is a placeholder -- replace it with the path of the car
# data CSV before running.
with codecs.open('<Car data>', "r", "Shift-JIS", "ignore") as fh:
    df = pd.read_csv(fh)

# Keep the rows where all three conditions hold:
#   - the Regional Transport Bureau contains 'North Sea',
#   - the municipality name ends with city, Total, town or village
#     ('$' anchors the match to the end of the name),
#   - the business type contains 'Total'.
# na=False makes rows with a missing value fail the test.
hokkaido_data = df[
    df['Regional Transport Bureau'].str.contains('North Sea', na=False)
    & df['Municipality'].str.contains(r'(?:city|Total|town|village)$', na=False)
    & df['Business type'].str.contains('Total', na=False)
]
# Condition: the Regional Transport Bureau contains 'North Sea' AND (the municipality name ends with 'city', OR it includes a county and therefore ends with 'town' or 'village') AND the business type is 'Total'.
# '$' matches at the end of the string (end-of-line match).
Source: http://sinhrks.hatenablog.com/entry/2014/12/06/233032
Adding the `.str` accessor to a column makes it possible to apply a string operation to all of its values at once.
# Take the 'Regional Transport Bureau' values that start with 'North Sea' and
# strip the leading 'North Sea道' from each of them.
# - The column label contains spaces, so it has to be selected by label
#   rather than by attribute access.
# - regex=True is passed explicitly: since pandas 2.0 str.replace() treats
#   the pattern as a literal string by default, so '^' would not anchor.
df.loc[
    df['Regional Transport Bureau'].str.contains('^North Sea', na=False),
    'Regional Transport Bureau',
].str.replace(r'^North Sea道', '', regex=True)
# Replaces every "Hokkaido District Transport Bureau" with "Transport Bureau".
Converting strings that have a comma every three digits (thousands separators) into numbers:
# Columns from position 4 onward: remove every ',' and '-' and convert the
# result to numbers; cells that still cannot be parsed become NaN.
# - The right-hand side is wrapped in parentheses so the method chain can
#   span several lines.
# - DataFrame.convert_objects() no longer exists in pandas, so use
#   pd.to_numeric(errors='coerce'), which coerces the same way.
# - replace(regex=True) only touches string cells, so non-string cells no
#   longer raise TypeError as re.sub() did.
# - Assigning by column label replaces the columns, so they really take the
#   numeric dtype (assigning through .iloc may keep the old object dtype).
formatted_cols = dfx.columns[4:]
dfx[formatted_cols] = (
    dfx[formatted_cols]
    .replace(r'[,-]', '', regex=True)
    .apply(pd.to_numeric, errors='coerce')
)
・ `applymap` returns a new copy, so assign its result back.
・ `applymap` applies the function passed as its argument to every element.
・ `re.sub(pattern, repl, string)` => replaces every match of the first argument (`pattern`) inside `string` with the second argument (`repl`).
・ `lambda x: x ** 2` => an anonymous function
from sklearn import linear_model

# Regress total population on resident area; fit_intercept=False fixes the
# intercept at 0.
lm = linear_model.LinearRegression(fit_intercept=False)

# sklearn works on numpy arrays laid out as columns, so convert each pandas
# column to numpy and add a second axis: shape (n,) -> (n, 1).
x_data = np.array(merged['Resident area'])[:, np.newaxis]
y_data = np.array(merged['Total population'])[:, np.newaxis]

lm.fit(x_data, y_data)

print(lm.coef_)  # partial regression coefficient
print(lm.intercept_)  # => 0.0, because the intercept was fixed at 0
print(lm.score(x_data, y_data))  # coefficient of determination
References:
http://qiita.com/irs/items/4ebbfd8bef63db1892fb
http://qiita.com/Attsun/items/5af3efdc241aa2fd3959
http://sinhrks.hatenablog.com/entry/2015/01/28/073327
About the anonymous function lambda: http://www.lifewithpython.com/2013/01/python-anonymous-function-lambda.html
Recommended Posts