From 5a7502f93d4b94ab58674d51f3e6b8adc47fdfb7 Mon Sep 17 00:00:00 2001 From: Jonah Paten Date: Tue, 18 Feb 2025 17:40:13 -0800 Subject: [PATCH] feat: add landing pages to analytics package (#4378) (#4384) * feat: added landing page analytics to package, refactored (#4378) * chore: refactored util functions for sheets to a different file (#4378) * chore: bumped setup.py (#4378) --- .../analytics/_sheets_utils.py | 107 +++++ .../analytics_package/analytics/entities.py | 100 +++++ .../analytics_package/analytics/fields.py | 27 -- .../analytics/sheets_elements.py | 366 ++++++++---------- analytics/analytics_package/setup.py | 2 +- 5 files changed, 380 insertions(+), 222 deletions(-) create mode 100644 analytics/analytics_package/analytics/_sheets_utils.py create mode 100644 analytics/analytics_package/analytics/entities.py delete mode 100644 analytics/analytics_package/analytics/fields.py diff --git a/analytics/analytics_package/analytics/_sheets_utils.py b/analytics/analytics_package/analytics/_sheets_utils.py new file mode 100644 index 000000000..e5c1bf194 --- /dev/null +++ b/analytics/analytics_package/analytics/_sheets_utils.py @@ -0,0 +1,107 @@ +import datetime as dt +from .charts import get_data_df, get_df_over_time +from .entities import ADDITIONAL_DATA_BEHAVIOR +import numpy as np +import pandas as pd + +def get_data_df_from_fields(metrics, dimensions, **other_params): + """ + Get a df from the Analytics API with metrics and dimensions as specified in fields.py + + :param metrics: the metrics to get + :param dimensions: the dimensions to get + :param other_params: any other parameters to be passed to the get_data_df function, including service params + :return: a DataFrame with the data from the Analytics API. + The DF has an arbitrary RangeIndex, + string columns containing dimensions with names equal to the dimension alias value, + and int columns containing metrics with names equal to the metric alias value. + """ + df = get_data_df( + [metric["id"] for metric in metrics], + [dimension["id"] for dimension in dimensions], + **other_params + ) + return df.reset_index().rename(columns=get_rename_dict(dimensions+metrics)).copy() + + +def get_rename_dict(dimensions): + """Get a dictionary to rename the columns of a DataFrame.""" + return dict( + zip([dimension["id"] for dimension in dimensions], [dimension["alias"] for dimension in dimensions]) + ) + + +def get_one_period_change_series(series_current, series_previous, start_current, end_current, start_previous, end_previous): + """ + Get the percent change between two serieses, accounting for different numbers of days in the month. + :param series_current: the series representing the current month + :param series_previous: the series representing the prior month + :param start_current: the start date for the current month in the format "YYYY-MM-DD" + :param end_current: the end date for the current month + :param start_previous: the start date for the prior month + :param end_previous: the end date for the prior month + :return: a Series with the change between the two serieses + """ + # Check that both serieses have the same index names + assert series_current.index.names == series_previous.index.names + # Reindex both serieses to have the same index + combined_index = series_current.index.union(series_previous.index) + current_length = float((dt.datetime.fromisoformat(end_current) - dt.datetime.fromisoformat(start_current)).days + 1) + previous_length = float((dt.datetime.fromisoformat(end_previous) - dt.datetime.fromisoformat(start_previous)).days + 1) + assert current_length != 0 and previous_length != 0 + series_current_reindexed = series_current.reindex(combined_index).fillna(0) + # Adjust the values from the prior series to account for the different number of days in the month + series_previous_reindexed = (series_previous.reindex(combined_index) * current_length / previous_length) + change = ((series_current_reindexed / series_previous_reindexed) - 1).replace({np.inf: np.nan}) + return change + + +def get_change_over_time_df( + metrics, time_dimension, include_changes=True, additional_data_path=None, additional_data_behavior=None, strftime_format="%Y-%m", **other_params +): + """ + Get a DataFrame with the change over time for the given metrics, renamed to match metric_titles + :param metrics: the metrics to be displayed + :param time_dimension: the time dimension to be used + :param include_changes: whether to include the percent change columns, defaults to True + :param additional_data_path: the path to a JSON file with additional data to be added to the DataFrame, defaults to None + :param additional_data_behavior: the behavior to use when adding the additional data, defaults to None + :param strftime_format: the format to use for the time dimension, defaults to "%Y-%m". None means a datetime will be returned + :param other_params: any other parameters to be passed to the get_df_over_time function, including service params + :returns: a datetime with the values of the metrics for each time dimension. + Columns are the time dimension alias (as a datetime), metric aliases (as ints), and change metric aliases (as floats) + """ + df_api = get_df_over_time( + [metric["alias"] for metric in metrics], + [metric["id"] for metric in metrics], + time_dimension["id"], + sort_results=[time_dimension["id"]], + df_processor=(lambda df: df.set_index(df.index + "01").sort_index(ascending=False)), + format_table=False, + **other_params + ).rename({time_dimension["id"]: time_dimension["alias"]}) + + df_combined = pd.DataFrame() + + if additional_data_path is not None: + assert additional_data_behavior is not None + df_saved = pd.read_json(additional_data_path) + if additional_data_behavior == ADDITIONAL_DATA_BEHAVIOR.ADD: + df_combined = df_api.add(df_saved.astype(int), fill_value=0)[::-1] + elif additional_data_behavior == ADDITIONAL_DATA_BEHAVIOR.REPLACE: + df_combined = pd.concat([df_saved, df_api], ignore_index=False) + df_combined = df_combined.loc[~df_combined.index.duplicated(keep="first")].sort_index(ascending=False) + else: + df_combined = df_api + + if include_changes: + df_combined[ + [metric["change_alias"] for metric in metrics] + ] = df_combined[ + [metric["alias"] for metric in metrics] + ].pct_change(periods=-1).replace({np.inf: np.nan}) + + if strftime_format is not None: + df_combined.index = pd.to_datetime(df_combined.index).strftime(strftime_format) + + return df_combined.reset_index(names=time_dimension["alias"]) \ No newline at end of file diff --git a/analytics/analytics_package/analytics/entities.py b/analytics/analytics_package/analytics/entities.py new file mode 100644 index 000000000..851002eec --- /dev/null +++ b/analytics/analytics_package/analytics/entities.py @@ -0,0 +1,100 @@ +# Metric names +# The number of events that occur +from enum import Enum + +# The number of events that occur +METRIC_EVENT_COUNT = { + "id": "eventCount", + "alias": "Event Count", + "change_alias": "Event Count Change", +} +# The total number of users that trigger an event +# Includes users who visit very briefly and do not interact with the site +# See https://support.google.com/analytics/answer/12253918?hl=en +METRIC_TOTAL_USERS = { + "id": "totalUsers", + "alias": "Total Users", + "change_alias": "Total Users Change", +} +# The number of active users as defined by GA4 +# See https://support.google.com/analytics/answer/12253918?hl=en +METRIC_ACTIVE_USERS = { + "id": "activeUsers", + "alias": "Users", + "change_alias": "Active Users Change", +} +# The number of page views +METRIC_PAGE_VIEWS = { + "id": "screenPageViews", + "alias": "Total Pageviews", + "change_alias": "Total Pageviews Change", +} +# The number of sessions +METRIC_SESSIONS = { + "id": "sessions", + "alias": "Sessions", + "change_alias": "Sessions Change", +} +# The total number of clicks on outbound links. Generated from other metrics, so does not have an id field +SYNTHETIC_METRIC_CLICKS = { + "id": None, + "alias": "Total Clicks", + "change_alias": "Total Clicks Change", +} + +# Event Names +# The builtin outbound link click event. Stores the clicked URL in DIMENSION_BUILTIN_URL +# Triggers under some circumstances where custom click does not, but does not include url fragments in any dimensions +EVENT_BUILTIN_CLICK = "click" +# The custom outbound link click event. Stores the clicked URL DIMENSION_CUSTOM_URL +# Includes url fragments, sometimes has a slightly different count to the built in click event +EVENT_CUSTOM_CLICK = "outbound_link_clicked" +# The builtin page view event. +EVENT_PAGE_VIEW = "page_view" + +# DIMENSIONS +# The path to the page the user is on when the event occurs. Does not include fragments or parameters +DIMENSION_PAGE_PATH = { + "id": "pagePath", + "alias": "Page Path", +} +# The url of the clicked link, only returned in EVENT_BUILTIN_CLICK. Does not include URL fragments +DIMENSION_BUILTIN_URL = { + "id": "linkUrl", + "alias": "URL", +} +# The name of the event. See GA4 docs for event names +DIMENSION_EVENT_NAME = { + "id": "eventName", + "alias": "Event Name", +} +# The url of the clicked link, only returned in EVENT_CUSTOM_CLICK. Includes URL fragments. +DIMENSION_CUSTOM_URL = { + "id": "customEvent:click_url", + "alias": "Outbound URL", +} +# The landing page for a session +DIMENSION_LANDING_PAGE = { + "id": "landingPage", + "alias": "Landing Page", +} +# The current month in the format YYYYMM +DIMENSION_YEAR_MONTH = { + "id": "yearMonth", + "alias": "Month", +} +# The hostname of the clicked link. Based on DIMENSION_CUSTOM_URL and DIMENSION_BUILTIN_URL +SYNTHETIC_DIMENSION_CLICKED_HOSTNAME = { + "id": None, + "alias": "Clicked Hostname", +} +# The complete clicked link, including hostname, parameters, fragments, and prefix. Based on DIMENSION_CUSTOM_URL and DIMENSION_BUILTIN_URL +SYNTHETIC_DIMENSION_CLICKED_LINK = { + "id": None, + "alias": "Outbound Link", +} + +# Used as arguments in get_change_over_time_df +class ADDITIONAL_DATA_BEHAVIOR(Enum): + ADD = "add" # Sum the cached data with the api data + REPLACE = "replace"# Replace the api data with the cached data \ No newline at end of file diff --git a/analytics/analytics_package/analytics/fields.py b/analytics/analytics_package/analytics/fields.py deleted file mode 100644 index 3b31a5272..000000000 --- a/analytics/analytics_package/analytics/fields.py +++ /dev/null @@ -1,27 +0,0 @@ -# Metric names -METRIC_EVENT_COUNT = 'eventCount' -METRIC_TOTAL_USERS = 'totalUsers' -METRIC_PAGE_VIEW = 'screenPageViews' - -# Event Names -EVENT_BUILTIN_CLICK = "click" -EVENT_CUSTOM_CLICK = "outbound_link_clicked" -EVENT_PAGE_VIEW = "page_view" - -# DIMENSIONS -DIMENSION_PAGE_PATH = { - 'id': 'pagePath', - 'alias': 'page_path', -} -DIMENSION_BUILTIN_URL = { - 'id': 'linkUrl', - 'alias': 'builtin_url', -} -DIMENSION_EVENT_NAME = { - 'id': 'eventName', - 'alias': 'event_name', -} -DIMENSION_CUSTOM_URL = { - 'id': 'customEvent:click_url', - 'alias': 'outbound_url', -} diff --git a/analytics/analytics_package/analytics/sheets_elements.py b/analytics/analytics_package/analytics/sheets_elements.py index 8a6f0f3e9..0ff992cb0 100644 --- a/analytics/analytics_package/analytics/sheets_elements.py +++ b/analytics/analytics_package/analytics/sheets_elements.py @@ -1,45 +1,24 @@ -from enum import Enum -import numpy as np import pandas as pd -from .charts import get_data_df, get_df_over_time -from .fields import * -from urllib.parse import urlparse -import datetime as dt - -def get_flat_data_df(metrics, dimensions, **other_params): - """ - Get a df from the Analytics API with a flat structure (no multiindex). - :param analytics_params: the parameters for the Analytics API, including authentication and property ids - :param metrics: the metrics to get - :param dimensions: the dimensions to get - :return: a DataFrame with the data from the Analytics API - """ - df = get_data_df( - metrics, - [dimension["id"] for dimension in dimensions], - **other_params, - ) - return df.reset_index().rename(columns=get_rename_dict(dimensions)).copy() - -def get_rename_dict(dimensions): - """Get a dictionary to rename the columns of a DataFrame.""" - return dict( - zip([dimension["id"] for dimension in dimensions], [dimension["alias"] for dimension in dimensions]) - ) +from ._sheets_utils import * +from .entities import * +from urllib.parse import urlparse -def get_outbound_links_df(analytics_params): +def get_outbound_links_df(analytics_params, ignore_index=True): """ - Get a DF with outbound links from the Analytics API. Merges the builtin and custom events for outbound links. + Get a DataFrame with outbound links from the Analytics API. Merges the builtin and custom events for outbound links. analytics_params cannot currently include a dimension_filter :param analytics_params: the parameters for the Analytics API, including authentication and property ids - :return: a DataFrame with the outbound links from the Analytics API + :param ignore_index: If true, the index will be an arbitrary range index. If false, the index will be the dimensions + :return: a DataFrame with the response from the Analytics API. By default, dimensions and metrics both form columns + Dimensions: DIMENSION_PAGE_PATH, SYNTHETIC_DIMENSION_CLICKED_HOSTNAME, SYNTHETIC_DIMENSION_CLICKED_LINK + Metrics: SYNTHETIC_METRIC_CLICKS, METRIC_TOTAL_USERS """ pd.set_option('future.no_silent_downcasting', True) assert "dimension_filter" not in analytics_params # Get the builtin "Click" event - df_builtin_links = get_flat_data_df( + df_builtin_links =get_data_df_from_fields( [METRIC_EVENT_COUNT, METRIC_TOTAL_USERS], [DIMENSION_PAGE_PATH, DIMENSION_BUILTIN_URL, DIMENSION_EVENT_NAME], dimension_filter=f"eventName=={EVENT_BUILTIN_CLICK}", @@ -48,7 +27,7 @@ def get_outbound_links_df(analytics_params): [DIMENSION_PAGE_PATH["alias"], DIMENSION_BUILTIN_URL["alias"]] ).sum().reset_index() # Get the custom "outbound_link_click" event - df_custom_links = get_flat_data_df( + df_custom_links = get_data_df_from_fields( [METRIC_EVENT_COUNT, METRIC_TOTAL_USERS], [DIMENSION_EVENT_NAME, DIMENSION_CUSTOM_URL, DIMENSION_PAGE_PATH], dimension_filter=f"eventName=={EVENT_CUSTOM_CLICK}", @@ -69,105 +48,128 @@ def get_outbound_links_df(analytics_params): # Use the builtin link, unless the link is not in the custom links, in which case use the custom link df_all_links = df_all_links.loc[ ~(df_all_links["truncated_url"].isin(df_outbound_links_fragments["truncated_url"]) & df_all_links["builtin"]) - ].sort_values(METRIC_EVENT_COUNT, ascending=False) + ].sort_values(METRIC_EVENT_COUNT["alias"], ascending=False) df_all_links["is_fragment"] = df_all_links["is_fragment"].fillna(False).astype(bool) # Use the builtin link, unless the link is a fragment, in which case use the custom link - df_all_links["complete_url"] = df_all_links["builtin_url"].where( + df_all_links["complete_url"] = df_all_links[DIMENSION_BUILTIN_URL["alias"]].where( ~df_all_links["is_fragment"], - df_all_links["outbound_url"] + df_all_links[DIMENSION_CUSTOM_URL["alias"]] ) df_all_links["hostname"] = df_all_links["complete_url"].map(lambda x: urlparse(x).hostname) + dimension_aliases_to_keep = [ + DIMENSION_PAGE_PATH["alias"], + SYNTHETIC_DIMENSION_CLICKED_LINK["alias"], + SYNTHETIC_DIMENSION_CLICKED_HOSTNAME["alias"], + ] + metric_aliases_to_keep = [ + SYNTHETIC_METRIC_CLICKS["alias"], + METRIC_TOTAL_USERS["alias"], + ] df_all_links = df_all_links.drop( - columns=["builtin_url", "outbound_url", "builtin", "is_fragment"] + columns=[DIMENSION_BUILTIN_URL["alias"], DIMENSION_CUSTOM_URL["alias"], "builtin", "is_fragment"] ).rename( columns={ - DIMENSION_PAGE_PATH["alias"]: "Page Path", - "complete_url": "Outbound Link", - METRIC_EVENT_COUNT: "Total Clicks", - METRIC_TOTAL_USERS: "Total Users", - "hostname": "Hostname", + "complete_url": SYNTHETIC_DIMENSION_CLICKED_LINK["alias"], + METRIC_EVENT_COUNT["alias"]: SYNTHETIC_METRIC_CLICKS["alias"], + "hostname": SYNTHETIC_DIMENSION_CLICKED_HOSTNAME["alias"], } - )[["Page Path", "Hostname", "Outbound Link", "Total Clicks", "Total Users"]] + )[[ + *dimension_aliases_to_keep, *metric_aliases_to_keep + ]].copy() - return df_all_links.copy().reset_index(drop=True) + if not ignore_index: + return df_all_links.set_index(dimension_aliases_to_keep) + else: + return df_all_links.reset_index(drop=True) def get_outbound_links_change(analytics_params, start_current, end_current, start_previous, end_previous): """ - Get a DF with outbound links from the Analytics API and a comparison for the prior month + Get a DataFrame with outbound links from the Analytics API and a comparison for the prior period + :param analytics_params: the parameters for the Analytics API, including authentication and property ids :param start_current: the start date for the current month in the format "YYYY-MM-DD" :param end_current: the end date for the current month :param start_previous: the start date for the previous month :param end_previous: the end date for the previous month + :return: a DataFrame with the outbound links from the Analytics API. + By default, dimensions and metrics both form columns. + Columns are present for both metric values and metric changes from the prior period + Dimensions: DIMENSION_PAGE_PATH, SYNTHETIC_DIMENSION_CLICKED_HOSTNAME, SYNTHETIC_DIMENSION_CLICKED_LINK + Metrics: SYNTHETIC_METRIC_CLICKS, METRIC_TOTAL_USERS """ - analytics_params_month_1 = { - **analytics_params, - "start_date": start_current, - "end_date": end_current, - } - analytics_params_month_2 = { - **analytics_params, - "start_date": start_previous, - "end_date": end_previous, - } - df_current = get_outbound_links_df(analytics_params_month_1).set_index( - ["Page Path", "Outbound Link", "Hostname"] - ) - df_previous = get_outbound_links_df(analytics_params_month_2).set_index( - ["Page Path", "Outbound Link", "Hostname"] - ) - total_clicks_percent_change = get_change( - df_current["Total Clicks"], - df_previous["Total Clicks"], - start_current, - end_current, - start_previous, - end_previous - ) - total_users_percent_change = get_change( - df_current["Total Users"], - df_previous["Total Users"], - start_current, - end_current, - start_previous, - end_previous + return get_one_period_change_df( + get_outbound_links_df, + [SYNTHETIC_METRIC_CLICKS, METRIC_TOTAL_USERS], + analytics_params, + start_current, + end_current, + start_previous, + end_previous, + sort_results=[SYNTHETIC_METRIC_CLICKS, METRIC_TOTAL_USERS] ) - df_reindexed = df_current.reindex(total_clicks_percent_change.index).fillna(0) - df_reindexed["Total Clicks Percent Change"] = total_clicks_percent_change - df_reindexed["Total Users Percent Change"] = total_users_percent_change - return df_reindexed.sort_values(["Total Clicks", "Total Users"], ascending=False, kind="stable").reset_index() -def get_page_views_df(analytics_params): +def get_page_views_df(analytics_params, ignore_index=False): """ - Get a DF with page views from the Analytics API. + Get a DataFrame with page views from the Analytics API :param analytics_params: the parameters for the Analytics API, including authentication and property ids - :return: a DataFrame with the page views from the Analytics API + :param ignore_index: If true, the index will be an arbitrary range index. If false, the index will be the dimensions + :return: a DataFrame with the response from the Analytics API. By default, dimensions and metrics both form columns + Dimensions: DIMENSION_PAGE_PATH + Metrics: METRIC_PAGE_VIEWS, METRIC_TOTAL_USERS """ assert "dimension_filter" not in analytics_params - df_response = get_flat_data_df( - [METRIC_EVENT_COUNT, METRIC_TOTAL_USERS, METRIC_PAGE_VIEW], + df_response = get_data_df_from_fields( + [METRIC_EVENT_COUNT, METRIC_TOTAL_USERS, METRIC_PAGE_VIEWS], [DIMENSION_PAGE_PATH, DIMENSION_EVENT_NAME], **analytics_params, dimension_filter=f"eventName=={EVENT_PAGE_VIEW}", - ).rename( - columns={ - DIMENSION_PAGE_PATH["alias"]: "Page Path", - METRIC_PAGE_VIEW: "Total Views", - METRIC_TOTAL_USERS: "Total Users", - } - )[["Page Path", "Total Views", "Total Users"]].copy() + )[[DIMENSION_PAGE_PATH["alias"], METRIC_PAGE_VIEWS["alias"], METRIC_TOTAL_USERS["alias"]]].copy() + if not ignore_index: + df_response = df_response.set_index(DIMENSION_PAGE_PATH["alias"]) return df_response def get_page_views_change(analytics_params, start_current, end_current, start_previous, end_previous): """ - Get a DF with page views from the Analytics API and a comparison for the prior month + Get a DataFrame with page views from the Analytics API and a comparison for the prior month + :param analytics_params: the parameters for the Analytics API, including authentication and property ids :param start_current: the start date for the current month in the format "YYYY-MM-DD" :param end_current: the end date for the current month :param start_previous: the start date for the previous month :param end_previous: the end date for the previous month + :return: a DataFrame with the response from the Analytics API. By default, dimensions and metrics both form columns + Columns are present for both metric values and metric changes from the prior period + Dimensions: DIMENSION_PAGE_PATH + Metrics: METRIC_PAGE_VIEWS, METRIC_TOTAL_USERS + """ + return get_one_period_change_df( + get_page_views_df, + [METRIC_PAGE_VIEWS, METRIC_TOTAL_USERS], + analytics_params, + start_current, + end_current, + start_previous, + end_previous, + sort_results=[METRIC_PAGE_VIEWS, METRIC_TOTAL_USERS] + ) + +def get_one_period_change_df(df_function, change_metrics, analytics_params, start_current, end_current, start_previous, end_previous, sort_results=None, ignore_index=False): """ + Get a DataFrame with the change between two periods for the given metrics, renamed to match titles + :param df_function: a function that returns a dataframe, with numerical columns matching the aliases of change_metrics + :param change_metrics: an iterable of the objects representing metrics to be displayed + :param analytics_params: the parameters for the Analytics API, including authentication and property ids + :param start_current: the start date for the current month in the format "YYYY-MM-DD" + :param end_current: the end date for the current month + :param start_previous: the start date for the prior month + :param end_previous: the end date for the prior month + :param sort_results: an iterable containing the metrics to sort the results by, defaults to None + :param ignore_index: if true, the index will be an arbitrary range index. If false, the index will be the dimensions + :return: a DataFrame with the change between two periods for the given metrics, renamed to match titles + Columns are dimension aliases (as strings), metric aliases (as ints), and metric change aliases (as floats) + """ + analytics_params_current = { **analytics_params, "start_date": start_current, @@ -178,124 +180,100 @@ def get_page_views_change(analytics_params, start_current, end_current, start_pr "start_date": start_previous, "end_date": end_previous, } - df_current = get_page_views_df(analytics_params_current).set_index("Page Path") - df_previous = get_page_views_df(analytics_params_previous).set_index("Page Path") - combined_index = df_current.index.union(df_previous.index) - df_current_reindexed = df_current.reindex(combined_index).fillna(0) - df_previous_reindexed = df_previous.reindex(combined_index) - views_percent_change = get_change( - df_current_reindexed["Total Views"], - df_previous_reindexed["Total Views"], - start_current, - end_current, - start_previous, - end_previous, + + df_current = df_function( + analytics_params_current, + ignore_index=False ) - users_percent_change = get_change( - df_current_reindexed["Total Users"], - df_previous_reindexed["Total Users"], - start_current, - end_current, - start_previous, - end_previous, + df_previous = df_function( + analytics_params_previous, + ignore_index=False ) - df_reindexed = df_current.reindex(views_percent_change.index).fillna(0) - df_reindexed["Total Views Percent Change"] = views_percent_change - df_reindexed["Total Users Percent Change"] = users_percent_change - return df_reindexed.sort_values(["Total Views", "Total Users"], ascending=False, kind="stable").reset_index() - -def get_change(series_current, series_previous, start_current, end_current, start_previous, end_previous, combined_index = None): - """ - Get the percent change between two serieses, accounting for different numbers of days in the month. - :param series_current: the series representing the current month - :param series_previous: the series representing the prior month - :param start_current: the start date for the current month in the format "YYYY-MM-DD" - :param end_current: the end date for the current month - :param start_previous: the start date for the prior month - :param end_previous: the end date for the prior month - :return: a Series with the change between the two serieses - """ - # Check that both serieses have the same index names - assert series_current.index.names == series_previous.index.names - # Reindex both serieses to have the same index - combined_index = series_current.index.union(series_previous.index) - current_length = float((dt.datetime.fromisoformat(end_current) - dt.datetime.fromisoformat(start_current)).days + 1) - previous_length = float((dt.datetime.fromisoformat(end_previous) - dt.datetime.fromisoformat(start_previous)).days + 1) - assert current_length != 0 and previous_length != 0 - series_current_reindexed = series_current.reindex(combined_index).fillna(0) - # Adjust the values from the prior series to account for the different number of days in the month - series_previous_reindexed = (series_previous.reindex(combined_index) * current_length / previous_length) - change = ((series_current_reindexed / series_previous_reindexed) - 1).replace({np.inf: np.nan}) - return change - -class ADDITIONAL_DATA_BEHAVIOR(Enum): - ADD = "add" - REPLACE = "replace" + df_changes = pd.concat( + [ + get_one_period_change_series( + df_current[metric["alias"]], df_previous[metric["alias"]], start_current, end_current, start_previous, end_previous + ) for metric in change_metrics + ], + axis=1, + ).rename( + columns={metric["alias"]: metric["change_alias"] for metric in change_metrics} + ) + df_current_with_changes = pd.concat( + [df_current.reindex(df_changes.index).fillna(0), df_changes], + axis=1 + ) + if sort_results: + df_current_with_changes = df_current_with_changes.sort_values( + [metric["alias"] for metric in sort_results], ascending=False, kind="stable" + ) + if ignore_index: + return df_current_with_changes + else: + return df_current_with_changes.reset_index() def get_page_views_over_time_df(analytics_params, additional_data_path=None, additional_data_behavior=None): """ Get a DataFrame with pageviews and total active users over time from the Analytics API. + :param analytics_params: the parameters for the Analytics API, including service params, start dates, and end dates :param additional_data_path: the path to a JSON file with additional data to be added to the DataFrame, defaults to None - :param additional_data_behavior: the behavior to use when adding the additional data, defaults to None + :param additional_data_behavior: the behavior to use when adding the additional data, as an instance of ADDITIONAL_DATA_BEHAVIOR, defaults to None + :return: a DataFrame with the pageviews and total active users over time from the Analytics API. + Columns are the dimension aliases, metrics (as ints), and change metrics (as floats) + Dimensions: DIMENSION_YEAR_MONTH (as a datetime) + Metrics: METRIC_ACTIVE_USERS, METRIC_PAGE_VIEWS """ return get_change_over_time_df( - ["Users", "Total Pageviews"], - ["activeUsers", "screenPageViews"], - ["Month"], - "yearMonth", + [METRIC_ACTIVE_USERS, METRIC_PAGE_VIEWS], + DIMENSION_YEAR_MONTH, additional_data_path=additional_data_path, additional_data_behavior=additional_data_behavior, **analytics_params ) -def get_change_over_time_df( - metric_titles, metrics, time_title, time_dimension, include_changes=True, change_title_suffix = " Change", additional_data_path=None, additional_data_behavior=None, strftime_format="%Y-%m", **other_params -): +def get_landing_page_df(analytics_params, ignore_index=True): """ - Get a DataFrame with the change over time for the given metrics, renamed to match metric_titles - :param metric_titles: the titles of the metrics to be displayed - :param metrics: the metrics to be displayed - :param time_title: the title to be displayed for the time dimension - :param time_dimension: the time dimension to be displayed - :param include_changes: whether to include the percent change columns, defaults to True - :param change_title_suffix: the suffix to be added to the change columns, defaults to " Change" - :param additional_data_path: the path to a JSON file with additional data to be added to the DataFrame, defaults to None - :param additional_data_behavior: the behavior to use when adding the additional data, defaults to None - :param strftime_format: the format to use for the time dimension, defaults to "%Y-%m". None means a datetime will be returned - :param other_params: any other parameters to be passed to the get_df_over_time function, including service params - """ - df_api = get_df_over_time( - metric_titles, - metrics, - time_dimension, - sort_results=[time_dimension], - df_processor=(lambda df: df.set_index(df.index + "01").sort_index(ascending=False)), - format_table=False, - **other_params - ) - - df_combined = pd.DataFrame() - - if additional_data_path is not None: - assert additional_data_behavior is not None - df_saved = pd.read_json(additional_data_path) - if additional_data_behavior == ADDITIONAL_DATA_BEHAVIOR.ADD: - df_combined = df_api.add(df_saved.astype(int), fill_value=0)[::-1] - elif additional_data_behavior == ADDITIONAL_DATA_BEHAVIOR.REPLACE: - df_combined = pd.concat([df_saved, df_api], ignore_index=False) - df_combined = df_combined.loc[~df_combined.index.duplicated(keep="first")].sort_index(ascending=False) - else: - df_combined = df_api - - if include_changes: - assert change_title_suffix is not None - df_combined[ - [f"{title}{change_title_suffix}" for title in metric_titles] - ] = df_combined[metric_titles].pct_change(periods=-1).replace({np.inf: np.nan}) + Get a DataFrame with landing pages from the Analytics API. - if strftime_format is not None: - df_combined.index = pd.to_datetime(df_combined.index).strftime(strftime_format) + :param analytics_params: the parameters for the Analytics API, including authentication and property ids + :param ignore_index: If true, the index will be an arbitrary range index. If false, the index will be the dimensions + :return: a DataFrame with the landing pages from the Analytics API + By default, dimension and metric aliases both form columns + Dimensions: DIMENSION_LANDING_PAGE, + Metrics: METRIC_SESSIONS + """ + df_response = get_data_df_from_fields( + [METRIC_SESSIONS], + [DIMENSION_LANDING_PAGE], + **analytics_params, + )[[DIMENSION_LANDING_PAGE["alias"], METRIC_SESSIONS["alias"]]].copy() + if not ignore_index: + df_response = df_response.set_index(DIMENSION_LANDING_PAGE["alias"]) + return df_response - return df_combined.reset_index(names=time_title) +def get_landing_page_change(analytics_params, start_current, end_current, start_previous, end_previous): + """ + Get a DataFrame with landing pages from the Analytics API and a comparison for the prior month + :param analytics_params: the parameters for the Analytics API, including authentication and property ids + :param start_current: the start date for the current month in the format "YYYY-MM-DD" + :param end_current: the end date for the current month + :param start_previous: the start date for the previous month + :param end_previous: the end date for the previous month + :return: a DataFrame with the landing pages from the Analytics API. + By default, dimensions and metrics both form columns + Columns are present for both metric values and metric changes from the prior period + Dimensions: DIMENSION_LANDING_PAGE + Metrics: METRIC_SESSIONS + """ + return get_one_period_change_df( + get_landing_page_df, + [METRIC_SESSIONS], + analytics_params, + start_current, + end_current, + start_previous, + end_previous, + sort_results=[METRIC_SESSIONS] + ) \ No newline at end of file diff --git a/analytics/analytics_package/setup.py b/analytics/analytics_package/setup.py index 7dbbcc622..be39c427d 100644 --- a/analytics/analytics_package/setup.py +++ b/analytics/analytics_package/setup.py @@ -2,7 +2,7 @@ setup( name="analytics", - version="3.4.1", + version="4.0.0", packages=["analytics"], install_requires=["matplotlib", "pandas", "numpy", "google-auth-oauthlib", "google-api-python-client", "gspread", "gspread-formatting"], ) \ No newline at end of file