pandas is a library that provides functions to support data analysis in the programming language Python. In particular, it provides data structures and operations for manipulating mathematical tables and time series data. Developer Wes McKinney wants a powerful and flexible tool for quantitative analysis of financial data, and started developing pandas at AQR Capital Management in 2008. Before leaving AQR, he persuaded his boss to open the library to the public.
--One-dimensional Series --Two-dimensional DataFrame --Row name (index) and column name (column) in DataFrame
import numpy as np
import pandas as pd
d = np.array([[1,2,3],[4,5,6],[7,8,9]])
df = pd.DataFrame(d,columns=['a','b','c'])
>>> df
a b c
0 1 2 3
1 4 5 6
2 7 8 9
The line name is df.index The column name is df.columns Can be examined at
df = pd.DataFrame({'a':[1,4,7],'b':[2,5,8]},'c':[3,6,9])
>>> df
a b c
0 1 2 3
1 4 5 6
2 7 8 9
#The first row is treated as a column name
# index_cols=0 :Specify the column number of the column you want to use as an index starting from 0
import pandas as pd
df = pd.read_csv('data/src/sample_pandas_normal.csv', index_col=0)
print(df)
# age state point
# name
# Alice 24 NY 64
# Bob 42 CA 92
# Charlie 18 CA 70
# Dave 68 TX 70
# Ellen 24 CA 88
# Frank 30 NY 57
print(df.index.values)
# ['Alice' 'Bob' 'Charlie' 'Dave' 'Ellen' 'Frank']
print(df.columns.values)
# ['age' 'state' 'point']
Without ** index_cols = 0 **, the index column will not be recognized.
df_header_index = pd.read_csv('data/src/sample_header_index.csv')
print(df_header_index)
# Unnamed: 0 a b c d
# 0 ONE 11 12 13 14
# 1 TWO 21 22 23 24
# 2 THREE 31 32 33 34
# ========================================================== #
df_header_index_col = pd.read_csv('data/src/sample_header_index.csv', index_col=0)
print(df_header_index_col)
# a b c d
# ONE 11 12 13 14
# TWO 21 22 23 24
# THREE 31 32 33 34
print(df_header_index_col.index)
# Index(['ONE', 'TWO', 'THREE'], dtype='object')
In case of csv without header, if pd.read_csv is used as it is, the first line will be columns. If ** header = None **, serial numbers will be assigned to the column name columns. Alternatively, you can set the column names as ** names = ('A','B','C','D') **.
11,12,13,14
21,22,23,24
31,32,33,34
import pandas as pd
df = pd.read_csv('data/src/sample.csv')
print(df)
# 11 12 13 14
# 0 21 22 23 24
# 1 31 32 33 34
print(df.columns)
# Index(['11', '12', '13', '14'], dtype='object')
# ========================================================== #
df_none = pd.read_csv('data/src/sample.csv', header=None)
print(df_none)
# 0 1 2 3
# 0 11 12 13 14
# 1 21 22 23 24
# 2 31 32 33 34
# ========================================================== #
df_names = pd.read_csv('data/src/sample.csv', names=('A', 'B', 'C', 'D'))
print(df_names)
# A B C D
# 0 11 12 13 14
# 1 21 22 23 24
# 2 31 32 33 34
df_none_usecols = pd.read_csv('data/src/sample.csv', header=None, usecols=[1, 3])
print(df_none_usecols)
# 1 3
# 0 12 14
# 1 22 24
# 2 32 34
# ========================================================== #
df_header_usecols = pd.read_csv('data/src/sample_header.csv',
usecols=lambda x: x is not 'b')
print(df_header_usecols)
# a c d
# 0 11 13 14
# 1 21 23 24
# 2 31 33 34
# ========================================================== #
df_index_usecols = pd.read_csv('data/src/sample_header_index.csv',
index_col=0, usecols=[0, 1, 3])
print(df_index_usecols)
# a c
# ONE 11 13
# TWO 21 23
# THREE 31 33
If you pass an integer to skiprows, ** skip the beginning of the file by the number of lines ** and read it.
df_none = pd.read_csv('data/src/sample.csv', header=None)
print(df_none)
# 0 1 2 3
# 0 11 12 13 14
# 1 21 22 23 24
# 2 31 32 33 34
# ========================================================== #
df_none = pd.read_csv('data/src/sample.csv', header=None, skiprows=2)
print(df_none)
# 0 1 2 3
# 0 31 32 33 34
# ========================================================== #
df_none_skiprows = pd.read_csv('data/src/sample.csv', header=None, skiprows=[1])
print(df_none_skiprows)
# 0 1 2 3
# 0 11 12 13 14
# 1 31 32 33 34
# ========================================================== #
#If you want to skip the last line, click "skip footer"=Set to "1"
# engine='python'Warning may occur if is not specified.
df_none_skipfooter = pd.read_csv('data/src/sample.csv', header=None,
skipfooter=1, engine='python')
print(df_none_skipfooter)
# 0 1 2 3
# 0 11 12 13 14
# 1 21 22 23 24
df_none_nrows = pd.read_csv('data/src/sample.csv', header=None, nrows=2)
print(df_none_nrows)
# 0 1 2 3
# 0 11 12 13 14
# 1 21 22 23 24
df_str_col = pd.read_csv('data/src/sample_header_index_dtype.csv',
index_col=0, dtype={'b': str, 'c': str})
print(df_str_col)
# a b c d
# ONE 1 001 100 x
# TWO 2 020 NaN y
# THREE 3 300 300 z
print(df_str_col.dtypes)
# a int64
# b object
# c object
# d object
# dtype: object
# ========================================================== #
#DataFrame columns can be type converted with "as type"
print(df['s_i'].astype(int))
# 0 0
# 1 10
# 2 200
# Name: s_i, dtype: int64
By default the following values are interpreted as NaN: ‘’, ‘#N/A’, ‘#N/A N/A’, ‘#NA’, ‘-1.#IND’, ‘-1.#QNAN’, ‘-NaN’, ‘-nan’, ‘1.#IND’, ‘1.#QNAN’, ‘N/A’, ‘NA’, ‘NULL’, ‘NaN’, ‘n/a’, ‘nan’, ‘null’. pandas.read_csv — pandas 0.23.0 documentation
df_nan = pd.read_csv('data/src/sample_header_index_nan.csv', index_col=0)
print(df_nan)
# a b
# ONE NaN NaN
# TWO - NaN
# THREE NaN NaN
# ========================================================== #
df_nan_set_na = pd.read_csv('data/src/sample_header_index_nan.csv',
index_col=0, na_values='-')
print(df_nan_set_na)
# a b
# ONE NaN NaN
# TWO NaN NaN
# THREE NaN NaN
,a,b
ONE,,NaN
TWO,-,nan
THREE,null,N/A
# ========================================================== #
df_nan_set_na_no_keep = pd.read_csv('data/src/sample_header_index_nan.csv',
index_col=0, na_values=['-', 'NaN', 'null'],
keep_default_na=False)
print(df_nan_set_na_no_keep)
# a b
# ONE NaN
# TWO NaN nan
# THREE NaN N/A
,a,b
ONE,,NaN
TWO,-,nan
THREE,null,N/A
# ========================================================== #
df_nan_no_filter = pd.read_csv('data/src/sample_header_index_nan.csv',
index_col=0, na_filter=False)
print(df_nan_no_filter)
# a b
# ONE NaN
# TWO - nan
# THREE null N/A
df_sjis = pd.read_csv('data/src/sample_header_shift_jis.csv',
encoding='shift_jis')
print(df_sjis)
# a b c d
#0 A 12 13 14
#1 22 23 24
#2 32 33 34
If the extension is .gz, .bz2, .zip, .xz, it will be automatically detected and expanded. If the extensions are different, explicitly specify the strings'gz','bz2','zip', and'xz' in the argument compression. In addition, it corresponds only when the csv file alone is compressed. An error will occur if multiple files are compressed. zip and xz are supported from version 0.18.1.
df_zip = pd.read_csv('data/src/sample_header.csv.zip')
df_web = pd.read_csv('http://www.post.japanpost.jp/zipcode/dl/oogaki/zip/13tokyo.zip',
header=None, encoding='shift_jis')
#horizontal direction(axis = 1)And make a column called Total
df['Total'] = df.sum(axis=1)
at, iat: Select, get / change the value of a single element loc, iloc: Select, get / change the value of a single element or multiple elements
count: number of elements unique: Number of unique (unique) value elements top: Mode freq: Frequency of mode (number of occurrences) mean: arithmetic mean std: standard deviation min: minimum value max: maximum value 50%: Median 25%, 75%: 1/4 quartile, 3/4 quartile
import pandas as pd
df = pd.DataFrame({'a': [1, 2, 1, 3],
'b': [0.4, 1.1, 0.1, 0.8],
'c': ['X', 'Y', 'X', 'Z'],
'd': ['3', '5', '2', '1'],
'e': [True, True, False, True]})
print(df)
# a b c d e
# 0 1 0.4 X 3 True
# 1 2 1.1 Y 5 True
# 2 1 0.1 X 2 False
# 3 3 0.8 Z 1 True
print(df.dtypes)
# a int64
# b float64
# c object
# d object
# e bool
# dtype: object
# ========================================================== #
##View summary statistics for data frame df
#Mean, standard deviation, maximum, minimum, mode, etc. for each column
print(df.describe())
# a b
# count 4.000000 4.000000
# mean 1.750000 0.600000
# std 0.957427 0.439697
# min 1.000000 0.100000
# 25% 1.000000 0.325000
# 50% 1.500000 0.600000
# 75% 2.250000 0.875000
# max 3.000000 1.100000
print(type(df.describe()))
# <class 'pandas.core.frame.DataFrame'>
print(df.describe().loc['std']) #std is the standard deviation
# a 0.957427
# b 0.439697
# Name: std, dtype: float64
print(df.describe().at['std', 'b'])
# 0.439696865275764
** If include ='all', all types of columns will be included. ** **
print(df.describe(exclude='number'))
# c d e
# count 4 4 4
# unique 3 4 2
# top X 3 True
# freq 2 1 3
df.plot() #Line graph
df.plot.bar(stacked=True) #Stacked bar graph
df.plot.scatter(‘Japanese’,’English’) #Scatter plot by specifying columns
df[‘Japanese’].plot.hist() #Histogram with columns
Recommended Posts