"""Get data from MySQL with pandas library."""
import MySQLdb
import pandas.io.sql as psql
# Open the connection to the local `work` database.
# NOTE(review): credentials are hard-coded (root, empty password) -- fine for
# a local demo, confirm before using anywhere else.
con = MySQLdb.connect(db='work', user='root', passwd='')
sql = """SELECT product_id, product_nm, product_features FROM electronics"""
try:
    # Extract the query result in the form of a pandas DataFrame.
    df = psql.read_sql(sql, con)
finally:
    # Release the connection even when the query raises.
    con.close()
When building feature vectors (for clustering, etc.) from large-scale data, iterate over the rows while deleting the data that has already been processed, in order to reduce memory consumption.
"""Delete rows while creating dataset."""
# Collected feature vectors: one [col1, col2, col3] list per DataFrame row.
X = []
for index, row in df.iterrows():  # Iterate row by row
    Xi = [row.col1, row.col2, row.col3]
    # Append the row vector itself (appending X would nest the list in itself).
    X.append(Xi)
    # Rebind df to the rows from the current label onward so earlier rows are
    # no longer referenced through this name. `.loc` is the label-based
    # replacement for `.ix`, which was removed in pandas 1.0.
    # NOTE(review): the iterrows() generator still references the original
    # frame -- verify that this actually lowers peak memory.
    df = df.loc[index:]
The first method keeps the code clean, but has the drawback that iteration is slow. It is many times faster to copy the columns into lists once and iterate over those.
"""High speed row iteration in pandas DataFrame"""
# Copy the index and each column into plain Python lists once.
df_index, df_col1, df_col2, df_col3 = \
    list(df.index), list(df.col1), list(df.col2), list(df.col3)
del df  # Drop this reference to the DataFrame; the values now live in the lists

# list.pop() removes from the END of a list, so reverse each column in place
# first; the vectors are then produced in the original row order.
for _column in (df_col1, df_col2, df_col3):
    _column.reverse()

# Collected feature vectors: one [col1, col2, col3] list per row.
X = []
for _ in df_index:
    # Iterate while deleting data: each pop() shrinks the column lists.
    col1, col2, col3 = df_col1.pop(), df_col2.pop(), df_col3.pop()
    Xi = [col1, col2, col3]
    X.append(Xi)
Recommended Posts