What is Pandas

Pandas -wikipedia

pandas is a library that provides functions to support data analysis in the programming language Python. In particular, it provides data structures and operations for manipulating mathematical tables and time series data. Developer Wes McKinney wants a powerful and flexible tool for quantitative analysis of financial data, and started developing pandas at AQR Capital Management in 2008. Before leaving AQR, he persuaded his boss to open the library to the public.

Pandas-specific data format

--One-dimensional Series --Two-dimensional DataFrame --Row name (index) and column name (column) in DataFrame

Create a DataFrame

Create from numpy array

import numpy as np
import pandas as pd
d = np.array([[1,2,3],[4,5,6],[7,8,9]])
df = pd.DataFrame(d,columns=['a','b','c'])

>>> df
  a b c
0 1 2 3
1 4 5 6
2 7 8 9

The line name is df.index The column name is df.columns Can be examined at

Create a list type from a dictionary with values

df = pd.DataFrame({'a':[1,4,7],'b':[2,5,8]},'c':[3,6,9])

>>> df
  a b c
0 1 2 3
1 4 5 6
2 7 8 9

Read csv file

#The first row is treated as a column name
# index_cols=0 :Specify the column number of the column you want to use as an index starting from 0

import pandas as pd
df = pd.read_csv('data/src/sample_pandas_normal.csv', index_col=0)
print(df)
#          age state  point
# name                     
# Alice     24    NY     64
# Bob       42    CA     92
# Charlie   18    CA     70
# Dave      68    TX     70
# Ellen     24    CA     88
# Frank     30    NY     57

print(df.index.values)
# ['Alice' 'Bob' 'Charlie' 'Dave' 'Ellen' 'Frank']

print(df.columns.values)
# ['age' 'state' 'point']

Add "index_cols = 0" to csv with index

Without ** index_cols = 0 **, the index column will not be recognized.

df_header_index = pd.read_csv('data/src/sample_header_index.csv')
print(df_header_index)
#   Unnamed: 0   a   b   c   d
# 0        ONE  11  12  13  14
# 1        TWO  21  22  23  24
# 2      THREE  31  32  33  34
# ========================================================== #
df_header_index_col = pd.read_csv('data/src/sample_header_index.csv', index_col=0)
print(df_header_index_col)
#         a   b   c   d
# ONE    11  12  13  14
# TWO    21  22  23  24
# THREE  31  32  33  34

print(df_header_index_col.index)
# Index(['ONE', 'TWO', 'THREE'], dtype='object')

Add "header = None" to csv without header

In case of csv without header, if pd.read_csv is used as it is, the first line will be columns. If ** header = None **, serial numbers will be assigned to the column name columns. Alternatively, you can set the column names as ** names = ('A','B','C','D') **.

11,12,13,14
21,22,23,24
31,32,33,34

import pandas as pd

df = pd.read_csv('data/src/sample.csv')
print(df)
#    11  12  13  14
# 0  21  22  23  24
# 1  31  32  33  34

print(df.columns)
# Index(['11', '12', '13', '14'], dtype='object')
# ========================================================== #
df_none = pd.read_csv('data/src/sample.csv', header=None)
print(df_none)
#     0   1   2   3
# 0  11  12  13  14
# 1  21  22  23  24
# 2  31  32  33  34
# ========================================================== #
df_names = pd.read_csv('data/src/sample.csv', names=('A', 'B', 'C', 'D'))
print(df_names)
#     A   B   C   D
# 0  11  12  13  14
# 1  21  22  23  24
# 2  31  32  33  34

Use "usecols" when reading by specifying a column

df_none_usecols = pd.read_csv('data/src/sample.csv', header=None, usecols=[1, 3])
print(df_none_usecols)
#     1   3
# 0  12  14
# 1  22  24
# 2  32  34
# ========================================================== #
df_header_usecols = pd.read_csv('data/src/sample_header.csv',
                                usecols=lambda x: x is not 'b')
print(df_header_usecols)
#     a   c   d
# 0  11  13  14
# 1  21  23  24
# 2  31  33  34
# ========================================================== #
df_index_usecols = pd.read_csv('data/src/sample_header_index.csv',
                               index_col=0, usecols=[0, 1, 3])
print(df_index_usecols)
#         a   c
# ONE    11  13
# TWO    21  23
# THREE  31  33

Use "skiprows" if you want to skip lines

If you pass an integer to skiprows, ** skip the beginning of the file by the number of lines ** and read it.

df_none = pd.read_csv('data/src/sample.csv', header=None)
print(df_none)
#     0   1   2   3
# 0  11  12  13  14
# 1  21  22  23  24
# 2  31  32  33  34
# ========================================================== #
df_none = pd.read_csv('data/src/sample.csv', header=None, skiprows=2)
print(df_none)
#     0   1   2   3
# 0  31  32  33  34
# ========================================================== #
df_none_skiprows = pd.read_csv('data/src/sample.csv', header=None, skiprows=[1])
print(df_none_skiprows)
#     0   1   2   3
# 0  11  12  13  14
# 1  31  32  33  34
# ========================================================== #
#If you want to skip the last line, click "skip footer"=Set to "1"
# engine='python'Warning may occur if is not specified.
df_none_skipfooter = pd.read_csv('data/src/sample.csv', header=None,
                                 skipfooter=1, engine='python')
print(df_none_skipfooter)
#     0   1   2   3
# 0  11  12  13  14
# 1  21  22  23  24

Specify the number of lines you want to read with "nrows"

df_none_nrows = pd.read_csv('data/src/sample.csv', header=None, nrows=2)
print(df_none_nrows)
#     0   1   2   3
# 0  11  12  13  14
# 1  21  22  23  24

Use "dtype" to specify the column type

df_str_col = pd.read_csv('data/src/sample_header_index_dtype.csv',
                         index_col=0, dtype={'b': str, 'c': str})
print(df_str_col)
#        a    b    c  d
# ONE    1  001  100  x
# TWO    2  020  NaN  y
# THREE  3  300  300  z

print(df_str_col.dtypes)
# a     int64
# b    object
# c    object
# d    object
# dtype: object
# ========================================================== #
#DataFrame columns can be type converted with "as type"
print(df['s_i'].astype(int))
# 0      0
# 1     10
# 2    200
# Name: s_i, dtype: int64

Missing value

By default the following values are interpreted as NaN: ‘’, ‘#N/A’, ‘#N/A N/A’, ‘#NA’, ‘-1.#IND’, ‘-1.#QNAN’, ‘-NaN’, ‘-nan’, ‘1.#IND’, ‘1.#QNAN’, ‘N/A’, ‘NA’, ‘NULL’, ‘NaN’, ‘n/a’, ‘nan’, ‘null’. pandas.read_csv — pandas 0.23.0 documentation

"Na_values" when adding values to be treated as missing values

df_nan = pd.read_csv('data/src/sample_header_index_nan.csv', index_col=0)
print(df_nan)
#          a   b
# ONE    NaN NaN
# TWO      - NaN
# THREE  NaN NaN
# ========================================================== #
df_nan_set_na = pd.read_csv('data/src/sample_header_index_nan.csv',
                            index_col=0, na_values='-')
print(df_nan_set_na)
#         a   b
# ONE   NaN NaN
# TWO   NaN NaN
# THREE NaN NaN

If you want to decide the missing values yourself, use "na_values" and "keep_default_na = False" together.

,a,b
ONE,,NaN
TWO,-,nan
THREE,null,N/A
# ========================================================== #
df_nan_set_na_no_keep = pd.read_csv('data/src/sample_header_index_nan.csv',
                                    index_col=0, na_values=['-', 'NaN', 'null'],
                                    keep_default_na=False)
print(df_nan_set_na_no_keep)
#          a    b
# ONE         NaN
# TWO    NaN  nan
# THREE  NaN  N/A

Use "na_filter" to eliminate missing values

,a,b
ONE,,NaN
TWO,-,nan
THREE,null,N/A
# ========================================================== #
df_nan_no_filter = pd.read_csv('data/src/sample_header_index_nan.csv',
                               index_col=0, na_filter=False)
print(df_nan_no_filter)
#           a    b
# ONE          NaN
# TWO       -  nan
# THREE  null  N/A

Use "encoding ='shift_jis'" when the data contains Japanese

df_sjis = pd.read_csv('data/src/sample_header_shift_jis.csv',
                      encoding='shift_jis')
print(df_sjis)
#    a   b   c   d
#0 A 12 13 14
#1 22 23 24
#2 32 33 34

Read_csv, which can read anything, is really excellent ...

If the extension is .gz, .bz2, .zip, .xz, it will be automatically detected and expanded. If the extensions are different, explicitly specify the strings'gz','bz2','zip', and'xz' in the argument compression. In addition, it corresponds only when the csv file alone is compressed. An error will occur if multiple files are compressed. zip and xz are supported from version 0.18.1.

df_zip = pd.read_csv('data/src/sample_header.csv.zip')
df_web = pd.read_csv('http://www.post.japanpost.jp/zipcode/dl/oogaki/zip/13tokyo.zip',
                 header=None, encoding='shift_jis')

Make a new row

#horizontal direction(axis = 1)And make a column called Total
df['Total'] = df.sum(axis=1)

Summary statistics

at, iat: Select, get / change the value of a single element loc, iloc: Select, get / change the value of a single element or multiple elements

count: number of elements unique: Number of unique (unique) value elements top: Mode freq: Frequency of mode (number of occurrences) mean: arithmetic mean std: standard deviation min: minimum value max: maximum value 50%: Median 25%, 75%: 1/4 quartile, 3/4 quartile

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 1, 3],
                   'b': [0.4, 1.1, 0.1, 0.8],
                   'c': ['X', 'Y', 'X', 'Z'],
                   'd': ['3', '5', '2', '1'],
                   'e': [True, True, False, True]})

print(df)
#    a    b  c  d      e
# 0  1  0.4  X  3   True
# 1  2  1.1  Y  5   True
# 2  1  0.1  X  2  False
# 3  3  0.8  Z  1   True

print(df.dtypes)
# a      int64
# b    float64
# c     object
# d     object
# e       bool
# dtype: object
# ========================================================== #
##View summary statistics for data frame df
#Mean, standard deviation, maximum, minimum, mode, etc. for each column

print(df.describe())
#               a         b
# count  4.000000  4.000000
# mean   1.750000  0.600000
# std    0.957427  0.439697
# min    1.000000  0.100000
# 25%    1.000000  0.325000
# 50%    1.500000  0.600000
# 75%    2.250000  0.875000
# max    3.000000  1.100000

print(type(df.describe()))
# <class 'pandas.core.frame.DataFrame'>

print(df.describe().loc['std']) #std is the standard deviation
# a    0.957427
# b    0.439697
# Name: std, dtype: float64

print(df.describe().at['std', 'b'])
# 0.439696865275764

Use "exclude ='number'" to display the results of non-numeric columns such as strings

** If include ='all', all types of columns will be included. ** **

print(df.describe(exclude='number'))
#         c  d     e
# count   4  4     4
# unique  3  4     2
# top     X  3  True
# freq    2  1     3

Data plot

df.plot() #Line graph
df.plot.bar(stacked=True) #Stacked bar graph
df.plot.scatter(‘Japanese’,’English’) #Scatter plot by specifying columns
df[‘Japanese’].plot.hist() #Histogram with columns