Google Analytics is a Wed access analysis tool provided by google. You can get various information such as the access area, time, device used, browser, etc. of the visitor to the Wed site. Since the GUI is also substantial, it is enough to say that it is enough, but since the API is also provided for free, I tried to make an API connection and create a database.
Enable the API from the Google API console.
sudo pip install --upgrade google-api-python-client
Enter the date and the information you want to get in the main function and return the DataFrame.
'''fetch_analytics_data.py'''
from googleapiclient.discovery import build
from oauth2client.service_account import ServiceAccountCredentials
import pandas as pd
SCOPES = ['https://www.googleapis.com/auth/analytics.readonly']
KEY_FILE_LOCATION = '<REPLACE_WITH_JSON_FILE>'
VIEW_ID = '<ID_NUMBER>'
def initialize_analyticsreporting():
"""Initializes an Analytics Reporting API V4 service object.
Returns:
An authorized Analytics Reporting API V4 service object.
"""
credentials = ServiceAccountCredentials.from_json_keyfile_name(
KEY_FILE_LOCATION, SCOPES)
# Build the service object.
analytics = build('analyticsreporting', 'v4', credentials=credentials)
return analytics
def get_report(analytics, start_day, end_day, dimension_list):
"""Queries the Analytics Reporting API V4.
Args:
analytics: An authorized Analytics Reporting API V4 service object.
Returns:
The Analytics Reporting API V4 response.
"""
return analytics.reports().batchGet(
body={
'reportRequests': [
{
'viewId': VIEW_ID,
'dateRanges': [{'startDate': start_day, 'endDate': end_day}],
# 'metrics': [{'expression': 'ga:sessions'}],
'dimensions': dimension_list
# [{'name': 'ga:country'},
# {'name': 'ga:browser'},
# {'name': 'ga:city'},
# {'name': 'ga:sex'}
# ]
}]
}
).execute()
def fetch_columns_name(response):
col_name = [i.lstrip('ga:') for i in list(list(list(response.values())[0][0].values())[0].values())[0]]
# col_name.append('visitor')
return col_name
def split_dimensions(response, col_name):
df = pd.json_normalize(response.get('reports'), sep=' ')["data rows"]
df = pd.json_normalize(df[0])
# split dimensions
for i in range(len(df['dimensions'][0])):
new_feature = [df['dimensions'][j][i] for j in range(df.shape[0])]
feature_name = col_name[i]
df[feature_name] = new_feature
df.drop(["dimensions"], axis=1, inplace=True)
# fetch visitor (not use now)
# df["visitor"] = [int(list(df['metrics'][i][0].values())[0][0]) for i in range(df.shape[0])]
df.drop(['metrics'], axis=1, inplace=True)
return df
def main(start_day, end_day, dim_list):
analytics = initialize_analyticsreporting()
response = get_report(analytics, start_day, end_day, dim_list)
col_name = fetch_columns_name(response)
df = split_dimensions(response, col_name)
return df
if __name__ == "main":
main()
Since there is a limit to the amount of information that can be drawn at one time in the free version, it is acquired in four parts.
import pandas as pd
import fetch_analytics_data as fa
from database_connect import database_engine
import sys
from sqlalchemy import create_engine
# -----------------------------------------------------------------
dim_list1 = [
# Geo Network
{'name': 'ga:country'},
{'name': 'ga:city'},
{'name': "ga:continentId"},
## Platform of Device
{'name': "ga:browser"},
## Users
{'name': 'ga:userType'},
{'name': 'ga:sessionCount'},
{'name': 'ga:daysSinceLastSession'},
# {'name': 'ga:userDefinedValue'},
# {'name': 'ga:userBucket'},
## session
{'name': 'ga:sessionDurationBucket'},
## Time
{'name': 'ga:dateHourMinute'},
]
# ----------------------------------------------------------------
dim_list2 = [
## Geo Network
{'name': 'ga:city'},
## Platform of Device
# {'name':"ga:operatingSystem"},
# {'name':'ga:mobileDeviceBranding'},
# {'name':'ga:mobileDeviceModel'},
# {'name':"ga:mobileInputSelector"},
# {'name':"ga:mobileDeviceInfo"},
# {'name':'ga:mobileDeviceMarketingName'},
{'name': 'ga:deviceCategory'},
## Page Tracking
{'name': 'ga:pagePath'},
{'name': 'ga:exitPagePath'},
{'name': 'ga:pageDepth'},
# {'name':'ga:pageTitle'},
## Time
{'name': 'ga:dateHourMinute'},
]
# ---------------------------------------------------------------
dim_list3 = [
# Geo Network
{'name': 'ga:city'},
## Traffic Sources
{'name': "ga:referralPath"},
# {'name': "ga:campaign"}, # all not set
{'name': "ga:source"},
{'name': "ga:medium"},
{'name': "ga:sourceMedium"},
{'name': "ga:hasSocialSourceReferral"},
## Time
{'name': 'ga:dateHourMinute'},
]
# -----------------------------------------------------------------
dim_list4 = [
## Geo Network
{'name': 'ga:city'},
## Platform of Device
{'name': "ga:operatingSystem"},
{'name': 'ga:mobileDeviceBranding'},
{'name': 'ga:mobileDeviceModel'},
{'name': "ga:mobileInputSelector"},
{'name': "ga:mobileDeviceInfo"},
## Time
{'name': 'ga:dateHourMinute'},
]
# -----------------------------------------------------------
def database_engine():
db_url = '<Database_URL>'
engine = create_engine(db_url)
return engine
def main():
start_day = sys.argv[1]
end_day = sys.argv[1]
df1 = fa.main(start_day, end_day, dim_list1)
df2 = fa.main(start_day, end_day, dim_list2)
df3 = fa.main(start_day, end_day, dim_list3)
df4 = fa.main(start_day, end_day, dim_list4)
# merge
df = pd.merge(df1, df2, on=['city','dateHourMinute'], how='outer').merge(df3, on=['city','dateHourMinute'], how='outer').merge(df4, on=['city','dateHourMinute'], how='outer')
en = database_engine()
df.to_sql('analytics_table', con=en, if_exists='append', index=False)
if __name__ == '__main__':
main()
Recommended Posts