How to Count Distinct Values by Group in R (3 Examples)
In this R post you’ll learn how to get the number of distinct values in each group of a data frame.
Example Data
# Example data: 500 rows with 2 grouping variables.
# gr1 is sampled from the letters "a" to "e", gr2 from the 26 upper-case letters.
set.seed(5643289)                 # Fix the RNG seed so the draws are reproducible
my_df <- data.frame(              # Data with 2 grouping variables
  gr1 = sample(letters[1:5], 500, replace = TRUE),
  gr2 = sample(LETTERS, 500, replace = TRUE)
)
head(my_df)                       # Showing head of example data
#   gr1 gr2
# 1   b   U
# 2   b   I
# 3   b   K
# 4   e   F
# 5   d   K
# 6   d   K
Example 1: Using Base R to Count Unique Values in Each Group
# Base R: for every value of gr1, count how many distinct gr2 values occur.
# The result keeps the column name gr2, but it now holds the distinct count.
aggregate(                                  # Use aggregate() function
  gr2 ~ gr1,                                # summarise gr2 within groups of gr1
  data = my_df,
  FUN = function(x) length(unique(x))       # number of distinct values in the group
)
#   gr1 gr2
# 1   a  25
# 2   b  24
# 3   c  26
# 4   d  26
# 5   e  25
Example 2: Using data.table Package to Count Unique Values in Each Group
# Install & load data.table package.
# The install step only runs when the package is not already available,
# so re-running the script does not trigger a fresh download each time.
if (!requireNamespace("data.table", quietly = TRUE)) {
  install.packages("data.table")
}
library("data.table")
# data.table: convert my_df, then count the distinct gr2 values per gr1 group.
# uniqueN(x) is data.table's built-in equivalent of length(unique(x)).
# In the printed result the groups appear in the order they first occur in my_df.
as.data.table(my_df)[, .(count = uniqueN(gr2)), by = gr1]  # Use data.table package
#    gr1 count
# 1:   b    24
# 2:   e    25
# 3:   d    26
# 4:   c    26
# 5:   a    25
Example 3: Using dplyr Package to Count Unique Values in Each Group
# Install dplyr package (only when it is not already available).
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library("dplyr")                  # Load dplyr
# dplyr: group the rows by gr1, then count the distinct gr2 values in each group.
my_df %>%                                 # Use dplyr package
  group_by(gr1) %>%
  summarise(count = n_distinct(gr2))      # one row per gr1 value
# # A tibble: 5 x 2
#   gr1   count
#   <fct> <int>
# 1 a        25
# 2 b        24
# 3 c        26
# 4 d        26
# 5 e        25