Splitting Data into Training & Testing Sets in R (Example Code)
In this article you’ll learn how to divide a data frame into training and testing data sets in the R programming language.
Creating Example Data
# Build a reproducible example data frame with 100 rows: two numeric columns
# of uniform random values rounded to two decimals (x, y) and one column of
# randomly drawn lower-case letters (z).
set.seed(1043857)                             # Fix the RNG state for reproducibility
df_all <- data.frame(x = round(runif(100), 2),
                     y = round(runif(100), 2),
                     z = sample(letters, 100, replace = TRUE))
head(df_all)                                  # Head of example data
#      x    y z
# 1 0.83 0.67 g
# 2 0.17 0.88 x
# 3 0.88 0.21 m
# 4 0.15 0.43 h
# 5 0.74 0.29 r
# 6 0.80 0.11 k
Example: Creating Train & Test Data with the sample() Function
# Create a shuffled vector of "train"/"test" labels, one label per row of
# df_all, to be used as a split indicator (75% train; 25% test data).
#
# The train count is rounded to a whole number and the test count is taken as
# the remainder. This guarantees that the labels add up to nrow(df_all) even
# when 75% of the row count is not an integer; rep() truncates non-integer
# counts, which would leave the indicator too short and cause it to be
# recycled when it is used as a logical row index.
n_all <- nrow(df_all)
n_train <- round(0.75 * n_all)                # Number of train rows
n_test <- n_all - n_train                     # Remaining rows are test rows
ind_split <- sample(c(rep("train", n_train),  # Create dummy for splitting
                      rep("test", n_test)))
# Keep the rows of df_all whose split indicator equals "train".
# drop = FALSE keeps the result a data frame even if df_all has one column.
df_train <- df_all[ind_split == "train", , drop = FALSE]  # Creating train data set
head(df_train)                                # Head of train data
#      x    y z
# 1 0.83 0.67 g
# 2 0.17 0.88 x
# 3 0.88 0.21 m
# 4 0.15 0.43 h
# 5 0.74 0.29 r
# 6 0.80 0.11 k
# Keep the rows of df_all whose split indicator equals "test".
# drop = FALSE keeps the result a data frame even if df_all has one column.
df_test <- df_all[ind_split == "test", , drop = FALSE]  # Creating test data set
head(df_test)                                 # Head of test data
#       x    y z
# 7  0.86 0.58 q
# 12 0.31 0.24 f
# 17 0.43 0.89 i
# 20 0.64 0.03 b
# 26 0.30 0.48 o
# 27 0.83 0.04 c