Skip to content

Commit

Permalink
[dataviz] feat: add graph Active contributors grouped by age (#1991)
Browse files Browse the repository at this point in the history
Co-authored-by: Gresille&Siffle <39056254+GresilleSiffle@users.noreply.github.com>
  • Loading branch information
NatNgs and GresilleSiffle committed Jul 22, 2024
1 parent f1f68bc commit 9f72b4e
Show file tree
Hide file tree
Showing 2 changed files with 105 additions and 1 deletion.
105 changes: 104 additions & 1 deletion data-visualization/pages/03_Community_evolution.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,16 @@
Comparison:
A public comparison between two videos involves one or more quality criteria.
Active contributor (during a period of time):
An active contributor is a user who makes at least one public comparison during a given
period of time.
"""

import pandas as pd
import plotly.express as px
import streamlit as st
from dateutil.relativedelta import relativedelta
from utils import set_df

st.set_page_config(
Expand Down Expand Up @@ -49,7 +55,7 @@ def add_contributors_evolution():
st.plotly_chart(fig)

with total_tab:
fig = (df.groupby("public_username").first().groupby("week_date").size().cumsum().plot())
fig = df.groupby("public_username").first().groupby("week_date").size().cumsum().plot()
fig.update_xaxes(title="Week")
fig.update_yaxes(title="Total number of public contributors")
fig.update_layout(showlegend=False)
Expand Down Expand Up @@ -82,7 +88,104 @@ def add_comparisons_evolution():
st.plotly_chart(fig)


def add_comparisons_evolution_grouped_by_contributors_age():
    """
    Display the number of active contributors per week, grouped by age.

    The "age" of a contributor is the season (a period of 3 months starting in
    January, April, July or October) of their first public comparison.
    Contributors whose first and last active weeks are the same are gathered
    in a dedicated group instead of a season.

    The data are read from `st.session_state.df`; the columns
    `public_username` and `week_date` are used. `week_date` must be a
    categorical column of strings (assumed to be ISO dates, YYYY-MM-DD, as
    they are compared lexicographically with the season dates).

    The chart is rendered in the Streamlit page; nothing is returned.
    """
    # Sentinel stored in the "season" column, and its label in the chart legend.
    single_week = "single week"
    single_week_label = "= last comparison date"

    st.markdown("#### Active contributors grouped by age")
    st.info(
        "An active contributor is a user who makes at least one public comparison during a given"
        " period of time.",
    )

    df = st.session_state.df

    # Keep only the required data, remove duplicates.
    df = df.drop_duplicates(subset=["public_username", "week_date"])[
        ["public_username", "week_date"]
    ].reset_index(drop=True)

    # Make the week_date categories ordered, so that min() and max() are defined.
    df.week_date = df.week_date.cat.as_ordered()
    overall_first_week = df.week_date.min()
    overall_last_week = df.week_date.max()

    # Categories: one category for each season between min(week_date) and max(week_date).
    # A season is a period of 3 months, starting on January 1st of the first year.
    # "MS" anchors the seasons on the first day of the month (Jan 1, Apr 1, ...). A month-end
    # frequency would produce Jan 31, Apr 30, ... and shift every season by about a month
    # compared to the labels displayed in the legend.
    seasons = pd.date_range(
        start=f"{overall_first_week[:4]}-01-01",
        end=overall_last_week,
        freq="3MS",
    ).to_list()

    # Date-only ISO strings of the seasons, computed once. The time part is dropped on purpose:
    # "2024-04-01T00:00:00" sorts after "2024-04-01", which would push a week starting exactly
    # on a season boundary into the previous season.
    season_starts = [(season.date().isoformat(), season) for season in seasons]

    def first_season(first_week):
        """Return the latest season starting on or before `first_week`."""
        return max(
            (season for start, season in season_starts if start <= first_week),
            default=seasons[0],
        )

    # Generate a new dataframe. For each public_username, get the first and the last week
    # with at least one public comparison.
    users_seasons = (
        df.groupby("public_username", as_index=False)
        .min()
        .rename(columns={"week_date": "first_week"})
    )
    last_user_weeks = (
        df.groupby("public_username", as_index=False)
        .max()
        .rename(columns={"week_date": "last_week"})
    )
    # Both frames are grouped by the same key, so their rows are aligned.
    users_seasons["last_week"] = last_user_weeks["last_week"]

    # Add a new column in users_seasons. The value is the latest season such that the user's
    # first week is greater than or equal to the season date. The column is cast to object
    # because it is about to hold both timestamps and the `single_week` string.
    users_seasons["season"] = users_seasons.first_week.apply(first_season).astype(object)

    # If the user's first week is the same as their last week, replace their season by
    # `single_week` to differentiate them, as they make a large group.
    is_single_week = users_seasons.first_week.eq(users_seasons.last_week)
    users_seasons.loc[is_single_week, "season"] = single_week

    seasons_users = users_seasons.groupby("season")["public_username"].aggregate(list).to_dict()

    # For each season, count the active contributors per week: list of (legend label, series).
    sub_dfs = []
    for season, usernames in seasons_users.items():
        # Keep only the users of this season. `observed=False` keeps every week in the
        # result, including the weeks during which none of these users were active.
        season_df = (
            df.loc[df.public_username.isin(usernames)]
            .groupby("week_date", observed=False)
            .public_username.nunique()
        )

        if season == single_week:
            sub_dfs.append((single_week_label, season_df))
        else:
            last_month = season + relativedelta(months=2)
            category = f"{season.strftime('%Y %b')} to {last_month.strftime('%b')}"
            sub_dfs.append((category, season_df))

    # Merge the previously computed series into one dataframe, by week_date.
    dtf = pd.DataFrame({"week_date": []}).reset_index()
    for name, sub_df in sub_dfs:
        dtf = pd.merge(dtf, sub_df.to_frame(name=name), on="week_date", how="outer").fillna(0)

    fig = px.bar(
        dtf,
        x="week_date",
        y=[name for name, _ in sub_dfs],
        labels={
            "value": "Active contributors",
            "week_date": "Week",
            "variable": "First comparison date",
        },
        # Sampling a colorscale needs at least 2 points; extra colors are simply unused.
        color_discrete_sequence=px.colors.sample_colorscale(
            "turbo", samplepoints=max(len(sub_dfs), 2)
        ),
        color_discrete_map={single_week_label: "grey"},
    )
    fig.update_layout(legend={"traceorder": "reversed"})
    st.plotly_chart(fig)


# Use Plotly as the pandas plotting backend: the sections call `.plot()` and then
# customize the returned figure.
pd.set_option("plotting.backend", "plotly")

# Render the sections of the page, in display order.
for render_section in (
    add_contributors_evolution,
    add_comparisons_evolution,
    add_comparisons_evolution_grouped_by_contributors_age,
):
    render_section()
1 change: 1 addition & 0 deletions data-visualization/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ pillow==10.3.0
plotly==5.5.0
scikit-learn==1.5.0
requests==2.32.0
python-dateutil==2.9.0.post0

0 comments on commit 9f72b4e

Please sign in to comment.