Skip to content

Commit

Permalink
feat: add landing pages to analytics package (#4378) (#4384)
Browse files Browse the repository at this point in the history
* feat: added landing page analytics to package, refactored (#4378)

* chore: refactored util functions for sheets to a different file (#4378)

* chore: bumped setup.py (#4378)
  • Loading branch information
jpaten authored Feb 19, 2025
1 parent e92c50d commit 5a7502f
Show file tree
Hide file tree
Showing 5 changed files with 380 additions and 222 deletions.
107 changes: 107 additions & 0 deletions analytics/analytics_package/analytics/_sheets_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
import datetime as dt
from .charts import get_data_df, get_df_over_time
from .entities import ADDITIONAL_DATA_BEHAVIOR
import numpy as np
import pandas as pd

def get_data_df_from_fields(metrics, dimensions, **other_params):
"""
Get a df from the Analytics API with metrics and dimensions as specified in fields.py
:param metrics: the metrics to get
:param dimensions: the dimensions to get
:param other_params: any other parameters to be passed to the get_data_df function, including service params
:return: a DataFrame with the data from the Analytics API.
The DF has an arbitrary RangeIndex,
string columns containing dimensions with names equal to the dimension alias value,
and int columns containing metrics with names equal to the metric alias value.
"""
df = get_data_df(
[metric["id"] for metric in metrics],
[dimension["id"] for dimension in dimensions],
**other_params
)
return df.reset_index().rename(columns=get_rename_dict(dimensions+metrics)).copy()


def get_rename_dict(dimensions):
"""Get a dictionary to rename the columns of a DataFrame."""
return dict(
zip([dimension["id"] for dimension in dimensions], [dimension["alias"] for dimension in dimensions])
)


def get_one_period_change_series(series_current, series_previous, start_current, end_current, start_previous, end_previous):
"""
Get the percent change between two serieses, accounting for different numbers of days in the month.
:param series_current: the series representing the current month
:param series_previous: the series representing the prior month
:param start_current: the start date for the current month in the format "YYYY-MM-DD"
:param end_current: the end date for the current month
:param start_previous: the start date for the prior month
:param end_previous: the end date for the prior month
:return: a Series with the change between the two serieses
"""
# Check that both serieses have the same index names
assert series_current.index.names == series_previous.index.names
# Reindex both serieses to have the same index
combined_index = series_current.index.union(series_previous.index)
current_length = float((dt.datetime.fromisoformat(end_current) - dt.datetime.fromisoformat(start_current)).days + 1)
previous_length = float((dt.datetime.fromisoformat(end_previous) - dt.datetime.fromisoformat(start_previous)).days + 1)
assert current_length != 0 and previous_length != 0
series_current_reindexed = series_current.reindex(combined_index).fillna(0)
# Adjust the values from the prior series to account for the different number of days in the month
series_previous_reindexed = (series_previous.reindex(combined_index) * current_length / previous_length)
change = ((series_current_reindexed / series_previous_reindexed) - 1).replace({np.inf: np.nan})
return change


def get_change_over_time_df(
metrics, time_dimension, include_changes=True, additional_data_path=None, additional_data_behavior=None, strftime_format="%Y-%m", **other_params
):
"""
Get a DataFrame with the change over time for the given metrics, renamed to match metric_titles
:param metrics: the metrics to be displayed
:param time_dimension: the time dimension to be used
:param include_changes: whether to include the percent change columns, defaults to True
:param additional_data_path: the path to a JSON file with additional data to be added to the DataFrame, defaults to None
:param additional_data_behavior: the behavior to use when adding the additional data, defaults to None
:param strftime_format: the format to use for the time dimension, defaults to "%Y-%m". None means a datetime will be returned
:param other_params: any other parameters to be passed to the get_df_over_time function, including service params
:returns: a datetime with the values of the metrics for each time dimension.
Columns are the time dimension alias (as a datetime), metric aliases (as ints), and change metric aliases (as floats)
"""
df_api = get_df_over_time(
[metric["alias"] for metric in metrics],
[metric["id"] for metric in metrics],
time_dimension["id"],
sort_results=[time_dimension["id"]],
df_processor=(lambda df: df.set_index(df.index + "01").sort_index(ascending=False)),
format_table=False,
**other_params
).rename({time_dimension["id"]: time_dimension["alias"]})

df_combined = pd.DataFrame()

if additional_data_path is not None:
assert additional_data_behavior is not None
df_saved = pd.read_json(additional_data_path)
if additional_data_behavior == ADDITIONAL_DATA_BEHAVIOR.ADD:
df_combined = df_api.add(df_saved.astype(int), fill_value=0)[::-1]
elif additional_data_behavior == ADDITIONAL_DATA_BEHAVIOR.REPLACE:
df_combined = pd.concat([df_saved, df_api], ignore_index=False)
df_combined = df_combined.loc[~df_combined.index.duplicated(keep="first")].sort_index(ascending=False)
else:
df_combined = df_api

if include_changes:
df_combined[
[metric["change_alias"] for metric in metrics]
] = df_combined[
[metric["alias"] for metric in metrics]
].pct_change(periods=-1).replace({np.inf: np.nan})

if strftime_format is not None:
df_combined.index = pd.to_datetime(df_combined.index).strftime(strftime_format)

return df_combined.reset_index(names=time_dimension["alias"])
100 changes: 100 additions & 0 deletions analytics/analytics_package/analytics/entities.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# Metric names
# The number of events that occur
from enum import Enum

# The number of events that occur
METRIC_EVENT_COUNT = {
"id": "eventCount",
"alias": "Event Count",
"change_alias": "Event Count Change",
}
# The total number of users that trigger an event
# Includes users who visit very briefly and do not interact with the site
# See https://support.google.com/analytics/answer/12253918?hl=en
METRIC_TOTAL_USERS = {
"id": "totalUsers",
"alias": "Total Users",
"change_alias": "Total Users Change",
}
# The number of active users as defined by GA4
# See https://support.google.com/analytics/answer/12253918?hl=en
METRIC_ACTIVE_USERS = {
"id": "activeUsers",
"alias": "Users",
"change_alias": "Active Users Change",
}
# The number of page views
METRIC_PAGE_VIEWS = {
"id": "screenPageViews",
"alias": "Total Pageviews",
"change_alias": "Total Pageviews Change",
}
# The number of sessions
METRIC_SESSIONS = {
"id": "sessions",
"alias": "Sessions",
"change_alias": "Sessions Change",
}
# The total number of clicks on outbound links. Generated from other metrics, so does not have an id field
SYNTHETIC_METRIC_CLICKS = {
"id": None,
"alias": "Total Clicks",
"change_alias": "Total Clicks Change",
}

# Event Names
# The builtin outbound link click event. Stores the clicked URL in DIMENSION_BUILTIN_URL
# Triggers under some circumstances where custom click does not, but does not include url fragments in any dimensions
EVENT_BUILTIN_CLICK = "click"
# The custom outbound link click event. Stores the clicked URL DIMENSION_CUSTOM_URL
# Includes url fragments, sometimes has a slightly different count to the built in click event
EVENT_CUSTOM_CLICK = "outbound_link_clicked"
# The builtin page view event.
EVENT_PAGE_VIEW = "page_view"

# DIMENSIONS
# The path to the page the user is on when the event occurs. Does not include fragments or parameters
DIMENSION_PAGE_PATH = {
"id": "pagePath",
"alias": "Page Path",
}
# The url of the clicked link, only returned in EVENT_BUILTIN_CLICK. Does not include URL fragments
DIMENSION_BUILTIN_URL = {
"id": "linkUrl",
"alias": "URL",
}
# The name of the event. See GA4 docs for event names
DIMENSION_EVENT_NAME = {
"id": "eventName",
"alias": "Event Name",
}
# The url of the clicked link, only returned in EVENT_CUSTOM_CLICK. Includes URL fragments.
DIMENSION_CUSTOM_URL = {
"id": "customEvent:click_url",
"alias": "Outbound URL",
}
# The landing page for a session
DIMENSION_LANDING_PAGE = {
"id": "landingPage",
"alias": "Landing Page",
}
# The current month in the format YYYYMM
DIMENSION_YEAR_MONTH = {
"id": "yearMonth",
"alias": "Month",
}
# The hostname of the clicked link. Based on DIMENSION_CUSTOM_URL and DIMENSION_BUILTIN_URL
SYNTHETIC_DIMENSION_CLICKED_HOSTNAME = {
"id": None,
"alias": "Clicked Hostname",
}
# The complete clicked link, including hostname, parameters, fragments, and prefix. Based on DIMENSION_CUSTOM_URL and DIMENSION_BUILTIN_URL
SYNTHETIC_DIMENSION_CLICKED_LINK = {
"id": None,
"alias": "Outbound Link",
}

# Used as arguments in get_change_over_time_df
class ADDITIONAL_DATA_BEHAVIOR(Enum):
ADD = "add" # Sum the cached data with the api data
REPLACE = "replace"# Replace the api data with the cached data
27 changes: 0 additions & 27 deletions analytics/analytics_package/analytics/fields.py

This file was deleted.

Loading

0 comments on commit 5a7502f

Please sign in to comment.