from __future__ import division, unicode_literals
#Count occurrences of each element of an array
import numpy as np
from collections import defaultdict

cnt_dict = defaultdict(int)
data = np.random.randint(low=0, high=5, size=500)
for d in data:
    cnt_dict[d] += 1
print cnt_dict
out
defaultdict(<type 'int'>, {0: 90, 1: 113, 2: 94, 3: 96, 4: 107})
#Library version check
from distutils.version import LooseVersion
#Example of use (here checking the TensorFlow version)
import tensorflow as tf
assert LooseVersion(tf.__version__) >= LooseVersion("1.3")
#More convenient counting method
import numpy as np
from collections import Counter
data1 = np.random.randint(low=0, high=5, size=300)
cnt1 = Counter(data1)
print cnt1
data2 = np.random.randint(low=0, high=10, size=500)
cnt2 = Counter(data2)
print cnt2
print cnt1 + cnt2
out
Counter({3: 65, 0: 64, 1: 60, 4: 60, 2: 51})
Counter({4: 58, 8: 58, 1: 55, 6: 54, 0: 53, 2: 49, 3: 47, 5: 46, 7: 40, 9: 40})
Counter({4: 118, 0: 117, 1: 115, 3: 112, 2: 100, 8: 58, 6: 54, 5: 46, 7: 40, 9: 40})
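Counter also has a most_common method for a quick frequency ranking; a small follow-up sketch:
print(cnt2.most_common(3))  #the three most frequent values and their counts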
#Pickle
import cPickle as pickle
def unpickle(filename):
    with open(filename, 'rb') as fo:
        _dict = pickle.load(fo)
    return _dict

def to_pickle(filename, obj):
    with open(filename, 'wb') as f:
        pickle.dump(obj, f, -1)  # -1 uses the highest available pickle protocol
        # pickle.Pickler(f, 2).dump(obj)
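A minimal round-trip sketch using the helpers above (tmp_obj.pkl is just an illustrative filename):
to_pickle("tmp_obj.pkl", {"a": 1, "b": [1, 2, 3]})  #write an object to disk
print(unpickle("tmp_obj.pkl"))                      #read it back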
#Extracting links from web pages
from bs4 import BeautifulSoup
import requests
from requests_oauthlib import OAuth1Session
url = 'http://headlines.yahoo.co.jp/rss/list'
url_list = []
res = requests.get(url)
news_all = BeautifulSoup(res.text, "xml")
for link in news_all.find_all('a'):
    href = link.get('href')
    url_list.append(href)
    print href
#Save and load with HDF5 (h5)
import deepdish as dd
dd.io.save("../data/df_test.h5", df_test)
df_test = dd.io.load("../data/df_test.h5")
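As an alternative sketch (assuming df_test is a pandas DataFrame and PyTables is installed), pandas can read and write HDF5 directly:
df_test.to_hdf("../data/df_test.h5", "df_test")      #the key name "df_test" is arbitrary
df_test = pd.read_hdf("../data/df_test.h5", "df_test")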
#How many decimal places to display
%precision 4
np.pi
out
3.1416
#Standard set of import statements
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from datetime import datetime as dt
import sys
plt.style.use('ggplot')
#When using TeX
plt.rc('text', usetex=True)
plt.rc('font', family='serif')
Pandas DataFrame
Convert a string that looks like a number to a numeric type
# http://stackoverflow.com/questions/21197774/assign-pandas-dataframe-column-dtypes
In [11]: df
Out[11]:
x y
0 a 1
1 b 2
In [12]: df.dtypes
Out[12]:
x object
y object
dtype: object
In [13]: df.convert_objects(convert_numeric=True)
Out[13]:
x y
0 a 1
1 b 2
In [14]: df.convert_objects(convert_numeric=True).dtypes
Out[14]:
x object
y int64
dtype: object
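convert_objects was later deprecated and removed; with newer pandas (0.17+, a hedged note) the equivalent is pd.to_numeric applied per column:
df["y"] = pd.to_numeric(df["y"])                #single column
df = df.apply(pd.to_numeric, errors='coerce')   #whole frame; note non-numbers become NaN instead of staying as strings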
#Handling categorical variables (factor type in R)
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
sns.set()
df = sns.load_dataset("tips")
for c in ['sex', 'smoker', 'day', 'time']:
    df["c{}".format(c)] = pd.Categorical.from_array(df[c]).codes
df.head()
out
total_bill tip sex smoker day time size csex csmoker cday ctime
0 16.99 1.01 Female No Sun Dinner 2 0 0 2 0
1 10.34 1.66 Male No Sun Dinner 3 1 0 2 0
2 21.01 3.50 Male No Sun Dinner 3 1 0 2 0
3 23.68 3.31 Male No Sun Dinner 2 1 0 2 0
4 24.59 3.61 Female No Sun Dinner 4 0 0 2 0
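Categorical.from_array is deprecated in newer pandas; a hedged equivalent uses the category dtype and its cat accessor:
for c in ['sex', 'smoker', 'day', 'time']:
    df["c{}".format(c)] = df[c].astype('category').cat.codes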
from datetime import datetime
now = datetime.now()
now.strftime("%Y-%m-%d %a %H:%M:%S")
out
'2015-08-13 Thu 16:41:25'
from dateutil.parser import parse
parse("2015-3-25 21:43:15")
out
datetime.datetime(2015, 3, 25, 21, 43, 15)
datestrs = ['2011/7/6 12:00:00', None, '2011/8/6 21:00:00']
pd.to_datetime(datestrs)
out
DatetimeIndex(['2011-07-06 12:00:00', 'NaT', '2011-08-06 21:00:00'], dtype='datetime64[ns]', freq=None, tz=None)
#Date duplication check
dates = pd.DatetimeIndex(['2000/1/1', '2000/1/2', '2000/1/2', '2000/1/2','2000/1/3'])
dup_ts = pd.Series(np.arange(5), index=dates)
dup_ts.index.is_unique
out
False
dup_ts.groupby(level=0).count()
out
2000-01-01 1
2000-01-02 3
2000-01-03 1
#Date data generation by specifying a range
dft = pd.date_range(start='2000-1-1', end='2001-1-1', freq='H')
dft
out
DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 01:00:00',
'2000-01-01 02:00:00', '2000-01-01 03:00:00',
'2000-01-01 04:00:00', '2000-01-01 05:00:00',
'2000-01-01 06:00:00', '2000-01-01 07:00:00',
'2000-01-01 08:00:00', '2000-01-01 09:00:00',
...
'2000-12-31 15:00:00', '2000-12-31 16:00:00',
'2000-12-31 17:00:00', '2000-12-31 18:00:00',
'2000-12-31 19:00:00', '2000-12-31 20:00:00',
'2000-12-31 21:00:00', '2000-12-31 22:00:00',
'2000-12-31 23:00:00', '2001-01-01 00:00:00'],
dtype='datetime64[ns]', length=8785, freq='H', tz=None)
#Fill in the missing dates (resample)
dates = pd.DatetimeIndex(['2000/1/1', '2000/1/5', '2000/1/8', '2000/1/9'])
df = pd.DataFrame(np.random.normal(0,1,size=len(dates)), columns=["num"], index=dates)
print "[Before]"
print df
df = df.resample('D')
print "[After]"
print df
out
[Before]
num
2000-01-01 1.201939
2000-01-05 0.522156
2000-01-08 1.800669
2000-01-09 -0.834700
[After]
num
2000-01-01 1.201939
2000-01-02 NaN
2000-01-03 NaN
2000-01-04 NaN
2000-01-05 0.522156
2000-01-06 NaN
2000-01-07 NaN
2000-01-08 1.800669
2000-01-09 -0.834700
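In pandas 0.18 and later (a hedged note), resample returns a Resampler object, so the same gap-filling needs an explicit asfreq (or an aggregation):
df = df.resample('D').asfreq()  #inserts NaN rows for the missing days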
#Localization
dates = pd.DatetimeIndex(['2000/1/1', '2000/1/5', '2000/1/8', '2000/1/9'])
print repr(dates.tz)  #tz is None before localization
print dates
#Localize to Japan time (e.g. 00:00:00 is now interpreted as midnight JST)
dates = dates.tz_localize("Japan")
print dates
#Convert to US East Coast time (the underlying instant does not change)
print dates.tz_convert('US/Eastern')
out
None
DatetimeIndex(['2000-01-01', '2000-01-05', '2000-01-08', '2000-01-09'], dtype='datetime64[ns]', freq=None, tz=None)
DatetimeIndex(['2000-01-01 00:00:00+09:00', '2000-01-05 00:00:00+09:00',
'2000-01-08 00:00:00+09:00', '2000-01-09 00:00:00+09:00'],
dtype='datetime64[ns]', freq=None, tz='Japan')
DatetimeIndex(['1999-12-31 10:00:00-05:00', '2000-01-04 10:00:00-05:00',
'2000-01-07 10:00:00-05:00', '2000-01-08 10:00:00-05:00'],
dtype='datetime64[ns]', freq=None, tz='US/Eastern')
rng = pd.period_range('2014/1/1', '2015/3/31', freq='M')
print rng
ser = pd.Series(np.random.randn(rng.size), index=rng)
print ser
values = ['2014Q3','2014Q4','2015Q1', '2015Q2']
index = pd.PeriodIndex(values, freq='Q-DEC')
df = pd.DataFrame(np.random.randn(index.size), index=index)
print df
out
PeriodIndex(['2014-01', '2014-02', '2014-03', '2014-04', '2014-05', '2014-06',
'2014-07', '2014-08', '2014-09', '2014-10', '2014-11', '2014-12',
'2015-01', '2015-02', '2015-03'],
dtype='int64', freq='M')
2014-01 0.273280
2014-02 -0.231141
2014-03 0.251094
2014-04 -1.217927
2014-05 0.341373
2014-06 -0.931357
2014-07 -0.414243
2014-08 -1.876341
2014-09 1.152908
2014-10 -0.473921
2014-11 0.527473
2014-12 -0.529911
2015-01 -0.656616
2015-02 0.742319
2015-03 -0.268112
Freq: M, dtype: float64
0
2014Q3 0.011621
2014Q4 -0.029027
2015Q1 -0.222156
2015Q2 -0.749983
#Calendar-year (CY) quarters
values = ['2014Q3','2014Q4','2015Q1', '2015Q2']
index = pd.PeriodIndex(values, freq='Q-DEC')
print index
print index.asfreq('M',how='start')
print index.asfreq('M',how='end')
print index.asfreq('D',how='start')
print index.asfreq('D',how='end')
out
PeriodIndex(['2014Q3', '2014Q4', '2015Q1', '2015Q2'], dtype='int64', freq='Q-DEC')
PeriodIndex(['2014-07', '2014-10', '2015-01', '2015-04'], dtype='int64', freq='M')
PeriodIndex(['2014-09', '2014-12', '2015-03', '2015-06'], dtype='int64', freq='M')
PeriodIndex(['2014-07-01', '2014-10-01', '2015-01-01', '2015-04-01'], dtype='int64', freq='D')
PeriodIndex(['2014-09-30', '2014-12-31', '2015-03-31', '2015-06-30'], dtype='int64', freq='D')
#Fiscal-year (FY) quarters (year ending in March)
values = ['2014Q3','2014Q4','2015Q1', '2015Q2']
index = pd.PeriodIndex(values, freq='Q-MAR')
print index
print index.asfreq('M',how='start')
print index.asfreq('M',how='end')
print index.asfreq('D',how='start')
print index.asfreq('D',how='end')
out
PeriodIndex(['2014Q3', '2014Q4', '2015Q1', '2015Q2'], dtype='int64', freq='Q-MAR')
PeriodIndex(['2013-10', '2014-01', '2014-04', '2014-07'], dtype='int64', freq='M')
PeriodIndex(['2013-12', '2014-03', '2014-06', '2014-09'], dtype='int64', freq='M')
PeriodIndex(['2013-10-01', '2014-01-01', '2014-04-01', '2014-07-01'], dtype='int64', freq='D')
PeriodIndex(['2013-12-31', '2014-03-31', '2014-06-30', '2014-09-30'], dtype='int64', freq='D')
# Time zone conversion from UTC to JST (for a naive datetime with no timezone info)
import datetime, pytz
utc = pytz.timezone('UTC')
jst = pytz.timezone('Asia/Tokyo')
now = datetime.datetime.now()
updated = now.replace(tzinfo=utc).astimezone(jst)
print "time:{}".format(updated)
out
time:2015-08-22 02:46:23.844806+09:00
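If the naive value is meant to hold the current UTC time, a safer sketch starts from utcnow and uses pytz's localize (recommended over replace for zones other than UTC):
now_utc = pytz.utc.localize(datetime.datetime.utcnow())  #attach UTC to a naive UTC timestamp
print("time:{}".format(now_utc.astimezone(jst)))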
ts = pd.Series(np.random.randn(1000), index=pd.date_range('2010/1/1', periods=1000))
ts = ts.cumsum()
ts.plot(color="b", alpha=0.5, figsize=(10,6))
#Simple moving average
pd.rolling_mean(ts, 40, center=True).plot(style='-', c='r', alpha=0.8,)
pd.rolling_mean(ts, 180, center=True).plot(style='-', c='blue', alpha=0.9,zorder=100)
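pd.rolling_mean was removed in later pandas; a hedged equivalent uses the rolling accessor:
ts.rolling(window=40, center=True).mean().plot(style='-', c='r', alpha=0.8)
ts.rolling(window=180, center=True).mean().plot(style='-', c='blue', alpha=0.9, zorder=100)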
#You can slice by date!
ts['2010/12/31':]
#Correlogram drawing
import statsmodels.tsa.stattools as stt
plt.figure(figsize=(10,5))
acf = stt.acf(np.array(ts), nlags=60) #ACF calculation (pass nlags by keyword; the second positional argument of acf is not nlags)
plt.bar(range(len(acf)), acf, width = 0.3) #display
plt.show()
pcf = stt.pacf(np.array(ts), 50)
plt.figure(figsize=(10,5))
plt.bar(range(len(pcf)), pcf, width = 0.3)
plt.show()
# ARMA(3,0) process sample generation
from statsmodels.tsa.arima_process import arma_generate_sample
ar_params = np.array([0.30, 0.50, -0.10])
ma_params = np.array([0.00])
ar_params = np.r_[1, -ar_params]
ma_params = np.r_[1, -ma_params]
nobs = 250
y = arma_generate_sample(ar_params, ma_params, nobs)
ts = pd.Series(y, index=pd.date_range('2010/1/1', periods=nobs))
ts.plot(color="b", alpha=0.5, figsize=(10,6))
plt.figure(figsize=(10,5))
acf = stt.acf(np.array(ts), nlags=60) #ACF calculation (nlags passed by keyword)
ts_acf = pd.Series(acf, index=pd.date_range('2010/1/1', periods=len(acf)))
ts_acf.plot(kind='bar', figsize=(10,5), color="b", alpha=0.5)
plt.show()
pacf = stt.pacf(np.array(ts), 50)
ts_pacf = pd.Series(pacf, index=pd.date_range('2010/1/1', periods=len(pacf)))
ts_pacf.plot(kind='bar', figsize=(10,5), color="g", alpha=0.5)
plt.show()
import statsmodels.graphics.tsaplots as tsaplots
fig = plt.figure(figsize=(12,5))
ax = fig.add_subplot(111)
tsaplots.plot_acf(ts, ax=ax, color="g")
plt.show()
#ARMA test
from statsmodels.tsa import arima_model
arma = arima_model.ARMA(y, order = [3,0]).fit()
print arma.summary()
out
ARMA Model Results
==============================================================================
Dep. Variable: y No. Observations: 250
Model: ARMA(3, 0) Log Likelihood -357.274
Method: css-mle S.D. of innovations 1.009
Date: Thu, 13 Aug 2015 AIC 724.548
Time: 17:57:45 BIC 742.155
Sample: 0 HQIC 731.634
==============================================================================
coef std err z P>|z| [95.0% Conf. Int.]
------------------------------------------------------------------------------
const 0.0262 0.187 0.140 0.889 -0.341 0.393
ar.L1.y 0.2256 0.063 3.586 0.000 0.102 0.349
ar.L2.y 0.4945 0.057 8.699 0.000 0.383 0.606
ar.L3.y -0.0569 0.064 -0.895 0.371 -0.181 0.068
Roots
=============================================================================
Real Imaginary Modulus Frequency
-----------------------------------------------------------------------------
AR.1 1.2968 +0.0000j 1.2968 0.0000
AR.2 -1.5205 +0.0000j 1.5205 0.5000
AR.3 8.9145 +0.0000j 8.9145 0.0000
-----------------------------------------------------------------------------
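To actually use the fitted model, ARMAResults.forecast returns point forecasts, standard errors, and confidence intervals; a minimal sketch:
fcst, stderr, conf_int = arma.forecast(steps=10)  #10-step-ahead out-of-sample forecast
print(fcst)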
#Confirmation of ARMA residuals
resid = arma.resid
plt.figure(figsize=(15,5))
plt.bar(range(len(resid)), resid, width=0.5)
plt.show()
plt.figure(figsize=(15,5))
acf = stt.acf(resid, nlags=len(resid))
plt.bar(range(len(acf)), acf, width=0.5, color="g")
plt.show()
fig = plt.figure(figsize=(15,5))
ax = fig.add_subplot(111)
tsaplots.plot_acf(resid, ax=ax, color="pink")
plt.show()
# Ljung-Box Q-statistic for autocorrelation parameters
lbs = stt.q_stat(acf, len(ts)) #statsmodels' q_stat takes the autocorrelation values as input
plt.figure(figsize=(12,6))
plt.bar(range(len(lbs[1])), lbs[1])
#Fill in missing values with 0
df_data.fillna(0)
Spark
import os, sys
from datetime import datetime as dt
print "loading PySpark setting..."
spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.8.2.1-src.zip'))
execfile(os.path.join(spark_home, 'python/pyspark/shell.py'))
out
loading PySpark setting...
Welcome to
____ __
/ __/__ ___ _____/ /__
_\ \/ _ \/ _ `/ __/ '_/
/__ / .__/\_,_/_/ /_/\_\ version 1.5.0
/_/
Using Python version 2.7.10 (default, May 28 2015 17:04:42)
SparkContext available as sc, HiveContext available as sqlContext.
#Split data for Cross Validation
from pyspark.mllib.regression import LabeledPoint
def parsePoint(vec):
    return LabeledPoint(vec[0], vec[1:])

from sklearn.datasets import load_iris
iris = load_iris()
dat = np.column_stack([iris.target[:], iris.data[:, 0], iris.data[:, 2]])  #label first, then two features
data = sc.parallelize(dat)  #RDD conversion
parsedData = data.map(parsePoint)  #Convert each row to a LabeledPoint
#Split into training data and test data
(trainingData, testData) = parsedData.randomSplit([0.7, 0.3])
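A quick sanity check on the split (a minimal sketch; the exact counts vary because randomSplit is random):
print(trainingData.count(), testData.count())
print(trainingData.first())  #inspect one LabeledPoint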