コホート分析は、マーケティングで非常に人気があります。その人気は、アルゴリズムと計算の容易さによる可能性が最も高いです。基本的には深刻な数学的概念はなく、Excelで実行される初等数学です。洞察を得るという観点から、生存の分析ははるかに興味深いものです。
それにもかかわらず、私たちはそのような課題があり、それを解決しなければならないと信じています。パッケージや既製の関数を検索するのは面白くありません。計算は簡単で、多くの設定があります。以下は、実装の可能な例(実行速度に特別な固定を行わない場合)であり、コード全体が数十行になります。
これは、以前の一連の出版物の続きです。
いくつかのコード
テストセットを作成するとき、特にタイムゾーンに焦点を当てない場合があります。それでも、データはランダムです。
テストケースの作成
# 15
set.seed(42)
events_dt <- tibble(user_id = 1000:9000) %>%
mutate(birthday = Sys.Date() + as.integer(rexp(n(), 1/10))) %>%
rowwise() %>%
mutate(timestamp = list(as_datetime(birthday) + 24*60*60 * (
rexp(10^3, rate = 1/runif(1, 2, 25))))) %>%
ungroup() %>%
unnest(timestamp) %>%
#
filter(timestamp >= quantile(timestamp, probs = 0.1),
timestamp <= quantile(timestamp, probs = 0.95)) %>%
mutate(date = as_date(timestamp)) %>%
select(user_id, date) %>%
setDT(key = c("user_id", "date")) %>%
#
unique()
結果の累積分布を見てみましょう
ggplot(events_dt, aes(date)) + geom_histogram()
ステップ1.ユーザーガイドの作成
" ", .. , . data.table
.
users_dict <- events_dt[, .(birthday = head(date, 1)), by = user_id] %>%
#
.[, week_start := floor_date(.BY[[1]], unit = "week"), by = birthday] %>%
#
.[, cohort := stri_c(
lubridate::isoyear(.BY[[1]]),
sprintf("%02d", lubridate::isoweek(.BY[[1]])),
sep = "/"), by = week_start]
# ,
as_tibble(janitor::tabyl(users_dict, birthday))
2.
.
. .
data.frame
cohort_dict <- unique(users_dict[, .(cohort, week_start)])
cohort_tbl <- users_dict[events_dt, on = "user_id"] %>%
#
.[, rel_week := floor(as.numeric(difftime(date, birthday, units = "week")))] %>%
# 10
.[rel_week <= 9] %>%
#
unique(by = c("user_id", "cohort", "rel_week")) %>%
#
.[, .N, by = .(cohort, rel_week)] %>%
.[, rate := N/max(N), by = cohort]
3.
1. ggplot
ggplot
# ggplot
data_tbl <- cohort_tbl %>%
#
left_join(cohort_dict)
data_tbl %>%
mutate(cohort_group = forcats::fct_reorder(cohort, week_start, .desc = TRUE)) %>%
ggplot(mapping = aes(x = rel_week, y = cohort_group, fill = rate)) +
geom_tile() +
geom_text(aes(label = N), colour = "darkgray") +
labs(x = " ",
y = " ",
fill = "\n",
title = "graph_title") +
scale_fill_viridis_c(option = "inferno") +
scale_x_continuous(breaks = scales::breaks_width(1)) +
theme_minimal() +
theme(panel.grid = element_blank())
2. gt
, .
gt
# -
data_tbl <- cohort_tbl %>%
pivot_longer(cols = c(N, rate)) %>%
pivot_wider(names_from = rel_week, values_from = value) %>%
#
left_join(cohort_dict) %>%
arrange(week_start, desc(name))
odd_rows <- seq(1, to = nrow(data_tbl), by = 2)
even_rows <- seq(2, to = nrow(data_tbl), by = 2)
tab <- data_tbl %>%
mutate(cohort = if_else(rep(c(TRUE, FALSE), length.out = nrow(.)),
cohort, "")) %>%
select(-name, -week_start) %>%
gt(rowname_col = "cohort") %>%
fmt_percent(columns = matches("[0-9]+"),
rows = odd_rows,
decimals = 0, pattern = "<big>{x}</big>") %>%
fmt_missing(columns = everything(),
missing_text = "---") %>%
tab_stubhead(label = " ") %>%
tab_spanner(label = " ",
columns = everything()) %>%
tab_header(title = "") %>%
data_color(columns = everything(),
colors = scales::col_numeric(palette = "inferno",
domain = c(0, 1),
alpha = 0.6,
na.color = "lightgray")) %>%
tab_options(
table.font.size = "smaller",
data_row.padding = px(1),
table.width = pct(75)
) %>%
tab_style(
style = list(
cell_fill(color = "white"),
cell_text(style = "italic"),
cell_borders(sides = "bottom")
),
locations = cells_body(
columns = everything(),
rows = even_rows)
) %>%
tab_style(
style = list(
cell_borders(sides = "top")
),
locations = cells_body(
columns = everything(),
rows = odd_rows)
)
tab
, .
以前の出版物- 「Rと時間の作業。舞台裏は何ですか?」..。