Google Analytics is a free access-analysis tool provided by Google. It gives you a wide range of information about the visitors to your website, such as their region, access time, device, and browser. The web interface is already quite complete and is usually enough on its own, but since an API is also provided for free, I tried connecting to the API and loading the data into a database.
Enable the Analytics Reporting API from the Google API Console.
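If you prefer the command line, the same API can presumably be enabled with the gcloud CLI; this assumes the CLI is installed and authenticated, and the project ID below is a placeholder:
gcloud services enable analyticsreporting.googleapis.com --project=<YOUR_PROJECT_ID>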
sudo pip install --upgrade google-api-python-client
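The scripts below also rely on oauth2client (for the service-account credentials), pandas, and sqlalchemy, so, assuming they are not already present, install them as well:
sudo pip install --upgrade oauth2client pandas sqlalchemy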
Pass the date range and the dimensions you want to retrieve to the main function, and it returns a DataFrame.
fetch_analytics_data.py
from googleapiclient.discovery import build
from oauth2client.service_account import ServiceAccountCredentials
import pandas as pd
SCOPES = ['https://www.googleapis.com/auth/analytics.readonly']
KEY_FILE_LOCATION = '<REPLACE_WITH_JSON_FILE>'
VIEW_ID = '<ID_NUMBER>'
def initialize_analyticsreporting():
    """Initializes an Analytics Reporting API V4 service object.

    Returns:
        An authorized Analytics Reporting API V4 service object.
    """
    credentials = ServiceAccountCredentials.from_json_keyfile_name(
        KEY_FILE_LOCATION, SCOPES)
    # Build the service object.
    analytics = build('analyticsreporting', 'v4', credentials=credentials)
    return analytics
def get_report(analytics, start_day, end_day, dimension_list):
    """Queries the Analytics Reporting API V4.

    Args:
        analytics: An authorized Analytics Reporting API V4 service object.
        start_day: Report start date, e.g. '2021-01-01' or '7daysAgo'.
        end_day: Report end date, e.g. '2021-01-07' or 'today'.
        dimension_list: List of dimension dicts, e.g. [{'name': 'ga:country'}].
    Returns:
        The Analytics Reporting API V4 response.
    """
    return analytics.reports().batchGet(
        body={
            'reportRequests': [
                {
                    'viewId': VIEW_ID,
                    'dateRanges': [{'startDate': start_day, 'endDate': end_day}],
                    # 'metrics': [{'expression': 'ga:sessions'}],
                    'dimensions': dimension_list
                    # [{'name': 'ga:country'},
                    #  {'name': 'ga:browser'},
                    #  {'name': 'ga:city'},
                    #  {'name': 'ga:sex'}
                    # ]
                }]
        }
    ).execute()
def fetch_columns_name(response):
    # Dimension names come from the report's column header; drop the 'ga:' prefix.
    # (str.lstrip strips a set of characters rather than a prefix, so slice instead.)
    dimensions = response['reports'][0]['columnHeader']['dimensions']
    col_name = [d[len('ga:'):] if d.startswith('ga:') else d for d in dimensions]
    # col_name.append('visitor')
    return col_name
def split_dimensions(response, col_name):
    # Flatten the nested response into a DataFrame of report rows.
    df = pd.json_normalize(response.get('reports'), sep=' ')["data rows"]
    df = pd.json_normalize(df[0])
    # Split the 'dimensions' list into one column per dimension.
    for i in range(len(df['dimensions'][0])):
        new_feature = [df['dimensions'][j][i] for j in range(df.shape[0])]
        feature_name = col_name[i]
        df[feature_name] = new_feature
    df.drop(["dimensions"], axis=1, inplace=True)
    # fetch visitor count (not used for now)
    # df["visitor"] = [int(list(df['metrics'][i][0].values())[0][0]) for i in range(df.shape[0])]
    df.drop(['metrics'], axis=1, inplace=True)
    return df
def main(start_day, end_day, dim_list):
    analytics = initialize_analyticsreporting()
    response = get_report(analytics, start_day, end_day, dim_list)
    col_name = fetch_columns_name(response)
    df = split_dimensions(response, col_name)
    return df
if __name__ == "__main__":
    # Placeholder example; in practice this module is imported and main() is
    # called with a date range and a dimension list (see the second script).
    print(main('7daysAgo', 'today', [{'name': 'ga:country'}, {'name': 'ga:city'}]))
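For reference, fetch_columns_name and split_dimensions walk a batchGet response shaped roughly like the abridged sketch below; the field names follow the Reporting API V4, but the concrete values are made up for illustration.

# Abridged, illustrative response structure (values are invented):
response = {
    'reports': [{
        'columnHeader': {'dimensions': ['ga:country', 'ga:city', 'ga:dateHourMinute']},
        'data': {'rows': [
            {'dimensions': ['Japan', 'Tokyo', '202101011230'], 'metrics': [{'values': ['1']}]},
            {'dimensions': ['France', 'Paris', '202101011445'], 'metrics': [{'values': ['2']}]},
        ]},
    }],
}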
Since there is a limit on how much information can be pulled in a single request, the data is fetched in four parts and then merged.
import sys

import pandas as pd
from sqlalchemy import create_engine

import fetch_analytics_data as fa
# from database_connect import database_engine  # database_engine is defined locally below
# -----------------------------------------------------------------
dim_list1 = [
    ## Geo Network
    {'name': 'ga:country'},
    {'name': 'ga:city'},
    {'name': "ga:continentId"},
    ## Platform of Device
    {'name': "ga:browser"},
    ## Users
    {'name': 'ga:userType'},
    {'name': 'ga:sessionCount'},
    {'name': 'ga:daysSinceLastSession'},
    # {'name': 'ga:userDefinedValue'},
    # {'name': 'ga:userBucket'},
    ## Session
    {'name': 'ga:sessionDurationBucket'},
    ## Time
    {'name': 'ga:dateHourMinute'},
]
# ----------------------------------------------------------------
dim_list2 = [
    ## Geo Network
    {'name': 'ga:city'},
    ## Platform of Device
    # {'name': "ga:operatingSystem"},
    # {'name': 'ga:mobileDeviceBranding'},
    # {'name': 'ga:mobileDeviceModel'},
    # {'name': "ga:mobileInputSelector"},
    # {'name': "ga:mobileDeviceInfo"},
    # {'name': 'ga:mobileDeviceMarketingName'},
    {'name': 'ga:deviceCategory'},
    ## Page Tracking
    {'name': 'ga:pagePath'},
    {'name': 'ga:exitPagePath'},
    {'name': 'ga:pageDepth'},
    # {'name': 'ga:pageTitle'},
    ## Time
    {'name': 'ga:dateHourMinute'},
]
# ---------------------------------------------------------------
dim_list3 = [
    ## Geo Network
    {'name': 'ga:city'},
    ## Traffic Sources
    {'name': "ga:referralPath"},
    # {'name': "ga:campaign"},  # all not set
    {'name': "ga:source"},
    {'name': "ga:medium"},
    {'name': "ga:sourceMedium"},
    {'name': "ga:hasSocialSourceReferral"},
    ## Time
    {'name': 'ga:dateHourMinute'},
]
# -----------------------------------------------------------------
dim_list4 = [
    ## Geo Network
    {'name': 'ga:city'},
    ## Platform of Device
    {'name': "ga:operatingSystem"},
    {'name': 'ga:mobileDeviceBranding'},
    {'name': 'ga:mobileDeviceModel'},
    {'name': "ga:mobileInputSelector"},
    {'name': "ga:mobileDeviceInfo"},
    ## Time
    {'name': 'ga:dateHourMinute'},
]
# -----------------------------------------------------------
def database_engine():
    db_url = '<Database_URL>'
    engine = create_engine(db_url)
    return engine
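# '<Database_URL>' is a placeholder; a hypothetical SQLAlchemy URL for a local
# PostgreSQL database would look like 'postgresql://user:password@localhost:5432/analytics'.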
def main():
    # The same date is used for both ends of the range, so each run fetches a single day.
    start_day = sys.argv[1]
    end_day = sys.argv[1]
    df1 = fa.main(start_day, end_day, dim_list1)
    df2 = fa.main(start_day, end_day, dim_list2)
    df3 = fa.main(start_day, end_day, dim_list3)
    df4 = fa.main(start_day, end_day, dim_list4)
    # Merge the four partial reports on city and timestamp.
    df = (pd.merge(df1, df2, on=['city', 'dateHourMinute'], how='outer')
            .merge(df3, on=['city', 'dateHourMinute'], how='outer')
            .merge(df4, on=['city', 'dateHourMinute'], how='outer'))
    en = database_engine()
    df.to_sql('analytics_table', con=en, if_exists='append', index=False)
if __name__ == '__main__':
    main()
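The script takes the target date as a command-line argument. A hypothetical invocation, assuming the file is saved as store_analytics.py (both the filename and the date are placeholders):
python store_analytics.py 2021-01-01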