dcast Function for data.table in R (2 Examples)
In this R tutorial you’ll learn how to apply the dcast function to a data.table, to calculate different statistics and reshape your data.
Preparing the Examples
Install and load the data.table package.
# Install the data.table package (only needed once per R installation).
install.packages("data.table")

# Attach data.table so that setDT(), dcast() and `:=` are available below.
library("data.table")
For the illustration, we use the iris dataset.
# Build a data.table version of the built-in iris data set.
data(iris)                         # Load iris data set
iris_DT <- data.table::copy(iris)  # Replicate iris data set
setDT(iris_DT)                     # Convert the copy to a data.table in place
head(iris_DT)                      # Print the head of the data
#    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
# 1:          5.1         3.5          1.4         0.2  setosa
# 2:          4.9         3.0          1.4         0.2  setosa
# 3:          4.7         3.2          1.3         0.2  setosa
# 4:          4.6         3.1          1.5         0.2  setosa
# 5:          5.0         3.6          1.4         0.2  setosa
# 6:          5.4         3.9          1.7         0.4  setosa
Next, we create a copy of the data.table so that the original stays unchanged. In this copy, we transform the variables Sepal.Length and Sepal.Width from numerical variables into factor variables by cutting them at their quartiles with the cut() function.
# Work on a copy, because `:=` modifies a data.table by reference.
iris_DT_2 <- data.table::copy(iris_DT)  # Replicate iris data set

# Bin both sepal measurements at their quartiles; include.lowest = TRUE keeps
# the minimum value inside the first interval instead of turning it into NA.
iris_DT_2[, Sepal.Length := cut(Sepal.Length,
                                quantile(Sepal.Length),
                                include.lowest = TRUE)]
iris_DT_2[, Sepal.Width := cut(Sepal.Width,
                               quantile(Sepal.Width),
                               include.lowest = TRUE)]
head(iris_DT_2)
#    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
# 1:    [4.3,5.1]   (3.3,4.4]          1.4         0.2  setosa
# 2:    [4.3,5.1]     (2.8,3]          1.4         0.2  setosa
# 3:    [4.3,5.1]     (3,3.3]          1.3         0.2  setosa
# 4:    [4.3,5.1]     (3,3.3]          1.5         0.2  setosa
# 5:    [4.3,5.1]   (3.3,4.4]          1.4         0.2  setosa
# 6:    (5.1,5.8]   (3.3,4.4]          1.7         0.4  setosa
Example 1: Calculate Group Means and Sums
In this example, we use dcast() to calculate the mean and the sum of variables Petal.Length and Petal.Width for the combinations of variables Sepal.Length, Sepal.Width, and Species.
# Aggregate both petal variables by mean and by sum for every observed
# combination of Sepal.Length, Sepal.Width and Species. The "." on the
# right-hand side of the formula means there is no variable to spread into
# columns, so the result has one row per group.
iris_DT_3 <- dcast(
  iris_DT_2,
  Sepal.Length + Sepal.Width + Species ~ .,
  fun.aggregate = list(mean, sum),
  value.var = c("Petal.Length", "Petal.Width")
)
head(iris_DT_3)
#    Sepal.Length Sepal.Width    Species Petal.Length_mean Petal.Width_mean Petal.Length_sum Petal.Width_sum
# 1:    [4.3,5.1]     [2,2.8]     setosa          1.300000        0.3000000              1.3             0.3
# 2:    [4.3,5.1]     [2,2.8] versicolor          3.275000        1.0250000             13.1             4.1
# 3:    [4.3,5.1]     [2,2.8]  virginica          4.500000        1.7000000              4.5             1.7
# 4:    [4.3,5.1]     (2.8,3]     setosa          1.371429        0.1857143              9.6             1.3
# 5:    [4.3,5.1]     (3,3.3]     setosa          1.454545        0.2181818             16.0             2.4
# 6:    [4.3,5.1]   (3.3,4.4]     setosa          1.500000        0.2764706             25.5             4.7
Example 2: Grouping Structure
Function dcast() can also be used to see the grouping structure of the data. For that, we simply count the number of observations for the combinations of variables Sepal.Length, Sepal.Width, and Species with aggregation function length.
# Count the observations in every observed combination of Sepal.Length,
# Sepal.Width and Species. Because length() only counts rows, both value
# columns carry the same group size.
iris_DT_4 <- dcast(
  iris_DT_2,
  Sepal.Length + Sepal.Width + Species ~ .,
  fun.aggregate = length,
  value.var = c("Petal.Length", "Petal.Width")
)
head(iris_DT_4)
#    Sepal.Length Sepal.Width    Species Petal.Length Petal.Width
# 1:    [4.3,5.1]     [2,2.8]     setosa            1           1
# 2:    [4.3,5.1]     [2,2.8] versicolor            4           4
# 3:    [4.3,5.1]     [2,2.8]  virginica            1           1
# 4:    [4.3,5.1]     (2.8,3]     setosa            7           7
# 5:    [4.3,5.1]     (3,3.3]     setosa           11          11
# 6:    [4.3,5.1]   (3.3,4.4]     setosa           17          17
Note: This article was created in collaboration with Anna-Lena Wölwer. Anna-Lena is a researcher and programmer who creates tutorials on statistical methodology as well as on the R programming language. You may find more info about Anna-Lena and her other articles on her profile page.