- 08.00 - 08.30 Recap of exercises from last class
- 08.30 - 09.00 Introduction to modelling, dimension reduction and clustering
- 09.00 - 12.00 Exercises in modelling, dimension reduction and clustering
March 2nd 2020
#' Generate a random DNA sequence
#'
#' Draws `n` nucleotides with replacement from A, C, G and T and glues
#' them together into one string.
#'
#' @param n Number of nucleotides to draw (passed to `sample()` as `size`).
#' @return The drawn nucleotides collapsed with `str_c(collapse = "")`.
mk_dna <- function(n) {
  nucleotides <- c("A", "C", "G", "T")
  drawn <- sample(nucleotides, size = n, replace = TRUE)
  # The last expression is the return value; no explicit return() needed.
  str_c(drawn, collapse = "")
}
# Draw a random sequence of 100 nucleotides, then print it.
# (Assignment and print are separate statements and need separate lines.)
my_dna <- mk_dna(n = 100)
my_dna
## [1] "CAGTAACGAAACTTATATTTTGGCGATGTTTAAACTCTTTAATGGGTGTGGGACCCGACTGGGATATCCTGACAACCTGCGGGAACCCGGCCCGGGTGCA"
# Fraction of the sequence that is "A". Dividing by the actual sequence
# length instead of a hard-coded 100 keeps the result correct when my_dna
# is generated with a different n.
str_count(my_dna, "A") / str_length(my_dna)
## [1] 0.24
# Transcribe DNA to RNA: every "T" becomes "U".
str_replace_all(string = my_dna, pattern = "T", replacement = "U")
## [1] "CAGUAACGAAACUUAUAUUUUGGCGAUGUUUAAACUCUUUAAUGGGUGUGGGACCCGACUGGGAUAUCCUGACAACCUGCGGGAACCCGGCCCGGGUGCA"
# Count the "AUG" matches in the transcribed (T -> U) sequence.
str_count(
  string = str_replace_all(string = my_dna, pattern = "T", replacement = "U"),
  pattern = "AUG"
)
## [1] 2
# Start/end positions of every "AUG" match in the transcribed sequence.
str_locate_all(
  string = str_replace_all(string = my_dna, pattern = "T", replacement = "U"),
  pattern = "AUG"
)
## [[1]] ## start end ## [1,] 26 28 ## [2,] 42 44
# One value per letter; the levels are the letters in alphabetical order.
factor(x = LETTERS)
## [1] A B C D E F G H I J K L M N O P Q R S T U V W X Y Z ## Levels: A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
# 260 values, but still only the 26 distinct letters as levels.
factor(x = rep(LETTERS, times = 10))
## [1] A B C D E F G H I J K L M N O P Q R S T U V W X Y Z A B C D E F G H I ## [36] J K L M N O P Q R S T U V W X Y Z A B C D E F G H I J K L M N O P Q R ## [71] S T U V W X Y Z A B C D E F G H I J K L M N O P Q R S T U V W X Y Z A ## [106] B C D E F G H I J K L M N O P Q R S T U V W X Y Z A B C D E F G H I J ## [141] K L M N O P Q R S T U V W X Y Z A B C D E F G H I J K L M N O P Q R S ## [176] T U V W X Y Z A B C D E F G H I J K L M N O P Q R S T U V W X Y Z A B ## [211] C D E F G H I J K L M N O P Q R S T U V W X Y Z A B C D E F G H I J K ## [246] L M N O P Q R S T U V W X Y Z ## Levels: A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
# Reversing the values does not reverse the levels: they stay A..Z.
factor(x = rev(LETTERS))
## [1] Z Y X W V U T S R Q P O N M L K J I H G F E D C B A ## Levels: A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
# Spelling out levels = LETTERS gives the same A..Z level order explicitly.
factor(x = rev(LETTERS), levels = LETTERS)
## [1] Z Y X W V U T S R Q P O N M L K J I H G F E D C B A ## Levels: A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
# Passing reversed levels is what actually changes the level order to Z..A.
factor(x = rev(LETTERS), levels = rev(LETTERS))
## [1] Z Y X W V U T S R Q P O N M L K J I H G F E D C B A ## Levels: Z Y X W V U T S R Q P O N M L K J I H G F E D C B A
# Collect the paths of the per-variable diabetes CSV files.
# `pattern` limits the listing to names ending in ".csv", so any other
# file placed in the folder is not handed to the CSV reader later on.
my_files <- list.files(
  path = "data/diabetes_csv_files",
  pattern = "\\.csv$",
  full.names = TRUE
)
my_files
## [1] "data/diabetes_csv_files/04_diabetes_id_age.csv" ## [2] "data/diabetes_csv_files/04_diabetes_id_bp.1d.csv" ## [3] "data/diabetes_csv_files/04_diabetes_id_bp.1s.csv" ## [4] "data/diabetes_csv_files/04_diabetes_id_bp.2d.csv" ## [5] "data/diabetes_csv_files/04_diabetes_id_bp.2s.csv" ## [6] "data/diabetes_csv_files/04_diabetes_id_chol.csv" ## [7] "data/diabetes_csv_files/04_diabetes_id_frame.csv" ## [8] "data/diabetes_csv_files/04_diabetes_id_gender.csv" ## [9] "data/diabetes_csv_files/04_diabetes_id_glyhb.csv" ## [10] "data/diabetes_csv_files/04_diabetes_id_hdl.csv" ## [11] "data/diabetes_csv_files/04_diabetes_id_height.csv" ## [12] "data/diabetes_csv_files/04_diabetes_id_hip.csv" ## [13] "data/diabetes_csv_files/04_diabetes_id_location.csv" ## [14] "data/diabetes_csv_files/04_diabetes_id_ratio.csv" ## [15] "data/diabetes_csv_files/04_diabetes_id_stab.glu.csv" ## [16] "data/diabetes_csv_files/04_diabetes_id_time.ppn.csv" ## [17] "data/diabetes_csv_files/04_diabetes_id_waist.csv" ## [18] "data/diabetes_csv_files/04_diabetes_id_weight.csv"
# Read every file into its own tibble. Naming the path vector by itself
# first makes map() return a list whose elements are labelled by file path.
X <- my_files %>%
  setNames(nm = .) %>%
  map(read_csv)
X
## $`data/diabetes_csv_files/04_diabetes_id_age.csv` ## # A tibble: 403 x 2 ## id age ## <dbl> <dbl> ## 1 1000 46 ## 2 1001 29 ## 3 1002 58 ## 4 1003 67 ## 5 1005 64 ## 6 1008 34 ## 7 1011 30 ## 8 1015 37 ## 9 1016 45 ## 10 1022 55 ## # … with 393 more rows ## ## $`data/diabetes_csv_files/04_diabetes_id_bp.1d.csv` ## # A tibble: 398 x 2 ## id bp.1d ## <dbl> <dbl> ## 1 1000 59 ## 2 1001 68 ## 3 1002 92 ## 4 1003 50 ## 5 1005 80 ## 6 1008 86 ## 7 1011 112 ## 8 1016 80 ## 9 1022 72 ## 10 1024 90 ## # … with 388 more rows ## ## $`data/diabetes_csv_files/04_diabetes_id_bp.1s.csv` ## # A tibble: 398 x 2 ## id bp.1s ## <dbl> <dbl> ## 1 1000 118 ## 2 1001 112 ## 3 1002 190 ## 4 1003 110 ## 5 1005 138 ## 6 1008 132 ## 7 1011 161 ## 8 1016 160 ## 9 1022 108 ## 10 1024 130 ## # … with 388 more rows ## ## $`data/diabetes_csv_files/04_diabetes_id_bp.2d.csv` ## # A tibble: 141 x 2 ## id bp.2d ## <dbl> <dbl> ## 1 1002 92 ## 2 1011 112 ## 3 1016 86 ## 4 1024 90 ## 5 1036 96 ## 6 1252 84 ## 7 1253 110 ## 8 1256 88 ## 9 1271 70 ## 10 1285 112 ## # … with 131 more rows ## ## $`data/diabetes_csv_files/04_diabetes_id_bp.2s.csv` ## # A tibble: 141 x 2 ## id bp.2s ## <dbl> <dbl> ## 1 1002 185 ## 2 1011 161 ## 3 1016 128 ## 4 1024 130 ## 5 1036 120 ## 6 1252 148 ## 7 1253 149 ## 8 1256 160 ## 9 1271 110 ## 10 1285 170 ## # … with 131 more rows ## ## $`data/diabetes_csv_files/04_diabetes_id_chol.csv` ## # A tibble: 402 x 2 ## id chol ## <dbl> <dbl> ## 1 1000 203 ## 2 1001 165 ## 3 1002 228 ## 4 1003 78 ## 5 1005 249 ## 6 1008 248 ## 7 1011 195 ## 8 1015 227 ## 9 1016 177 ## 10 1022 263 ## # … with 392 more rows ## ## $`data/diabetes_csv_files/04_diabetes_id_frame.csv` ## # A tibble: 391 x 2 ## id frame ## <dbl> <chr> ## 1 1000 medium ## 2 1001 large ## 3 1002 large ## 4 1003 large ## 5 1005 medium ## 6 1008 large ## 7 1011 medium ## 8 1015 medium ## 9 1016 large ## 10 1022 small ## # … with 381 more rows ## ## $`data/diabetes_csv_files/04_diabetes_id_gender.csv` ## # A tibble: 403 x 2 ## 
id gender ## <dbl> <chr> ## 1 1000 female ## 2 1001 female ## 3 1002 female ## 4 1003 male ## 5 1005 male ## 6 1008 male ## 7 1011 male ## 8 1015 male ## 9 1016 male ## 10 1022 female ## # … with 393 more rows ## ## $`data/diabetes_csv_files/04_diabetes_id_glyhb.csv` ## # A tibble: 390 x 2 ## id glyhb ## <dbl> <dbl> ## 1 1000 4.31 ## 2 1001 4.44 ## 3 1002 4.64 ## 4 1003 4.63 ## 5 1005 7.72 ## 6 1008 4.81 ## 7 1011 4.84 ## 8 1015 3.94 ## 9 1016 4.84 ## 10 1022 5.78 ## # … with 380 more rows ## ## $`data/diabetes_csv_files/04_diabetes_id_hdl.csv` ## # A tibble: 402 x 2 ## id hdl ## <dbl> <dbl> ## 1 1000 56 ## 2 1001 24 ## 3 1002 37 ## 4 1003 12 ## 5 1005 28 ## 6 1008 69 ## 7 1011 41 ## 8 1015 44 ## 9 1016 49 ## 10 1022 40 ## # … with 392 more rows ## ## $`data/diabetes_csv_files/04_diabetes_id_height.csv` ## # A tibble: 398 x 2 ## id height ## <dbl> <dbl> ## 1 1000 62 ## 2 1001 64 ## 3 1002 61 ## 4 1003 67 ## 5 1005 68 ## 6 1008 71 ## 7 1011 69 ## 8 1015 59 ## 9 1016 69 ## 10 1022 63 ## # … with 388 more rows ## ## $`data/diabetes_csv_files/04_diabetes_id_hip.csv` ## # A tibble: 401 x 2 ## id hip ## <dbl> <dbl> ## 1 1000 38 ## 2 1001 48 ## 3 1002 57 ## 4 1003 38 ## 5 1005 41 ## 6 1008 42 ## 7 1011 49 ## 8 1015 39 ## 9 1016 40 ## 10 1022 50 ## # … with 391 more rows ## ## $`data/diabetes_csv_files/04_diabetes_id_location.csv` ## # A tibble: 403 x 2 ## id location ## <dbl> <chr> ## 1 1000 Buckingham ## 2 1001 Buckingham ## 3 1002 Buckingham ## 4 1003 Buckingham ## 5 1005 Buckingham ## 6 1008 Buckingham ## 7 1011 Buckingham ## 8 1015 Buckingham ## 9 1016 Buckingham ## 10 1022 Buckingham ## # … with 393 more rows ## ## $`data/diabetes_csv_files/04_diabetes_id_ratio.csv` ## # A tibble: 402 x 2 ## id ratio ## <dbl> <dbl> ## 1 1000 3.60 ## 2 1001 6.90 ## 3 1002 6.20 ## 4 1003 6.5 ## 5 1005 8.90 ## 6 1008 3.60 ## 7 1011 4.80 ## 8 1015 5.20 ## 9 1016 3.60 ## 10 1022 6.60 ## # … with 392 more rows ## ## $`data/diabetes_csv_files/04_diabetes_id_stab.glu.csv` ## # A tibble: 403 
x 2 ## id stab.glu ## <dbl> <dbl> ## 1 1000 82 ## 2 1001 97 ## 3 1002 92 ## 4 1003 93 ## 5 1005 90 ## 6 1008 94 ## 7 1011 92 ## 8 1015 75 ## 9 1016 87 ## 10 1022 89 ## # … with 393 more rows ## ## $`data/diabetes_csv_files/04_diabetes_id_time.ppn.csv` ## # A tibble: 400 x 2 ## id time.ppn ## <dbl> <dbl> ## 1 1000 720 ## 2 1001 360 ## 3 1002 180 ## 4 1003 480 ## 5 1005 300 ## 6 1008 195 ## 7 1011 720 ## 8 1015 1020 ## 9 1016 300 ## 10 1022 240 ## # … with 390 more rows ## ## $`data/diabetes_csv_files/04_diabetes_id_waist.csv` ## # A tibble: 401 x 2 ## id waist ## <dbl> <dbl> ## 1 1000 29 ## 2 1001 46 ## 3 1002 49 ## 4 1003 33 ## 5 1005 44 ## 6 1008 36 ## 7 1011 46 ## 8 1015 34 ## 9 1016 34 ## 10 1022 45 ## # … with 391 more rows ## ## $`data/diabetes_csv_files/04_diabetes_id_weight.csv` ## # A tibble: 402 x 2 ## id weight ## <dbl> <dbl> ## 1 1000 121 ## 2 1001 218 ## 3 1002 256 ## 4 1003 119 ## 5 1005 183 ## 6 1008 190 ## 7 1011 191 ## 8 1015 170 ## 9 1016 166 ## 10 1022 202 ## # … with 392 more rows
# Rows x columns of each table that was read.
map(.x = X, .f = dim)
## $`data/diabetes_csv_files/04_diabetes_id_age.csv` ## [1] 403 2 ## ## $`data/diabetes_csv_files/04_diabetes_id_bp.1d.csv` ## [1] 398 2 ## ## $`data/diabetes_csv_files/04_diabetes_id_bp.1s.csv` ## [1] 398 2 ## ## $`data/diabetes_csv_files/04_diabetes_id_bp.2d.csv` ## [1] 141 2 ## ## $`data/diabetes_csv_files/04_diabetes_id_bp.2s.csv` ## [1] 141 2 ## ## $`data/diabetes_csv_files/04_diabetes_id_chol.csv` ## [1] 402 2 ## ## $`data/diabetes_csv_files/04_diabetes_id_frame.csv` ## [1] 391 2 ## ## $`data/diabetes_csv_files/04_diabetes_id_gender.csv` ## [1] 403 2 ## ## $`data/diabetes_csv_files/04_diabetes_id_glyhb.csv` ## [1] 390 2 ## ## $`data/diabetes_csv_files/04_diabetes_id_hdl.csv` ## [1] 402 2 ## ## $`data/diabetes_csv_files/04_diabetes_id_height.csv` ## [1] 398 2 ## ## $`data/diabetes_csv_files/04_diabetes_id_hip.csv` ## [1] 401 2 ## ## $`data/diabetes_csv_files/04_diabetes_id_location.csv` ## [1] 403 2 ## ## $`data/diabetes_csv_files/04_diabetes_id_ratio.csv` ## [1] 402 2 ## ## $`data/diabetes_csv_files/04_diabetes_id_stab.glu.csv` ## [1] 403 2 ## ## $`data/diabetes_csv_files/04_diabetes_id_time.ppn.csv` ## [1] 400 2 ## ## $`data/diabetes_csv_files/04_diabetes_id_waist.csv` ## [1] 401 2 ## ## $`data/diabetes_csv_files/04_diabetes_id_weight.csv` ## [1] 402 2
# Fold the list of two-column tables into one wide table by joining them
# pairwise on `id`. A full join keeps ids that are missing from some
# files; their unmeasured variables become NA.
diabetes_data <- X %>%
  reduce(full_join, by = "id")
diabetes_data
## # A tibble: 403 x 19 ## id age bp.1d bp.1s bp.2d bp.2s chol frame gender glyhb hdl ## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl> ## 1 1000 46 59 118 NA NA 203 medi… female 4.31 56 ## 2 1001 29 68 112 NA NA 165 large female 4.44 24 ## 3 1002 58 92 190 92 185 228 large female 4.64 37 ## 4 1003 67 50 110 NA NA 78 large male 4.63 12 ## 5 1005 64 80 138 NA NA 249 medi… male 7.72 28 ## 6 1008 34 86 132 NA NA 248 large male 4.81 69 ## 7 1011 30 112 161 112 161 195 medi… male 4.84 41 ## 8 1015 37 NA NA NA NA 227 medi… male 3.94 44 ## 9 1016 45 80 160 86 128 177 large male 4.84 49 ## 10 1022 55 72 108 NA NA 263 small female 5.78 40 ## # … with 393 more rows, and 8 more variables: height <dbl>, hip <dbl>, ## # location <chr>, ratio <dbl>, stab.glu <dbl>, time.ppn <dbl>, ## # waist <dbl>, weight <dbl>
# The whole load-and-merge as a single pipeline: list the CSV files, read
# them, full-join on `id`, then put the columns in the desired order.
# The pattern is anchored on the literal ".csv" extension; a bare "csv$"
# would also match any file whose name merely ends in the letters "csv".
"data/diabetes_csv_files" %>%
  list.files(pattern = "\\.csv$", full.names = TRUE) %>%
  setNames(nm = .) %>%
  map(read_csv) %>%
  reduce(full_join, by = "id") %>%
  select(
    id, chol, stab.glu, hdl, ratio, glyhb, location, age, gender,
    height, weight, frame, bp.1s, bp.1d, bp.2s, bp.2d, waist, hip,
    time.ppn
  )
## # A tibble: 403 x 19 ## id chol stab.glu hdl ratio glyhb location age gender height ## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <chr> <dbl> ## 1 1000 203 82 56 3.60 4.31 Bucking… 46 female 62 ## 2 1001 165 97 24 6.90 4.44 Bucking… 29 female 64 ## 3 1002 228 92 37 6.20 4.64 Bucking… 58 female 61 ## 4 1003 78 93 12 6.5 4.63 Bucking… 67 male 67 ## 5 1005 249 90 28 8.90 7.72 Bucking… 64 male 68 ## 6 1008 248 94 69 3.60 4.81 Bucking… 34 male 71 ## 7 1011 195 92 41 4.80 4.84 Bucking… 30 male 69 ## 8 1015 227 75 44 5.20 3.94 Bucking… 37 male 59 ## 9 1016 177 87 49 3.60 4.84 Bucking… 45 male 69 ## 10 1022 263 89 40 6.60 5.78 Bucking… 55 female 63 ## # … with 393 more rows, and 9 more variables: weight <dbl>, frame <chr>, ## # bp.1s <dbl>, bp.1d <dbl>, bp.2s <dbl>, bp.2d <dbl>, waist <dbl>, ## # hip <dbl>, time.ppn <dbl>
# Read the single combined file, for comparison with the merged table.
read_csv("data/diabetes.csv")
## # A tibble: 403 x 19 ## id chol stab.glu hdl ratio glyhb location age gender height ## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <chr> <dbl> ## 1 1000 203 82 56 3.60 4.31 Bucking… 46 female 62 ## 2 1001 165 97 24 6.90 4.44 Bucking… 29 female 64 ## 3 1002 228 92 37 6.20 4.64 Bucking… 58 female 61 ## 4 1003 78 93 12 6.5 4.63 Bucking… 67 male 67 ## 5 1005 249 90 28 8.90 7.72 Bucking… 64 male 68 ## 6 1008 248 94 69 3.60 4.81 Bucking… 34 male 71 ## 7 1011 195 92 41 4.80 4.84 Bucking… 30 male 69 ## 8 1015 227 75 44 5.20 3.94 Bucking… 37 male 59 ## 9 1016 177 87 49 3.60 4.84 Bucking… 45 male 69 ## 10 1022 263 89 40 6.60 5.78 Bucking… 55 female 63 ## # … with 393 more rows, and 9 more variables: weight <dbl>, frame <chr>, ## # bp.1s <dbl>, bp.1d <dbl>, bp.2s <dbl>, bp.2d <dbl>, waist <dbl>, ## # hip <dbl>, time.ppn <dbl>
# Long format of the numeric variables only: one row per (id, variable).
# Uses `<-` for assignment, like the rest of the file.
diabetes_data_long <- diabetes_data %>%
  select_if(is.numeric) %>%
  pivot_longer(cols = -c("id"), names_to = "vars", values_to = "value")
# Long format that also carries gender along.
# gender is recoded to a number (female = 1, male = 0) so that it survives
# the numeric-only column filter; it is kept out of the pivot and turned
# into a factor afterwards, together with id.
diabetes_data_long <- diabetes_data %>%
  mutate(
    gender_bin = case_when(
      gender == "female" ~ 1,
      gender == "male" ~ 0
    )
  ) %>%
  select_if(is.numeric) %>%
  pivot_longer(
    cols = -c("id", "gender_bin"),
    names_to = "vars",
    values_to = "value"
  ) %>%
  mutate(id = factor(id), gender_bin = factor(gender_bin))
diabetes_data_long
## # A tibble: 6,045 x 4 ## id gender_bin vars value ## <fct> <fct> <chr> <dbl> ## 1 1000 1 age 46 ## 2 1000 1 bp.1d 59 ## 3 1000 1 bp.1s 118 ## 4 1000 1 bp.2d NA ## 5 1000 1 bp.2s NA ## 6 1000 1 chol 203 ## 7 1000 1 glyhb 4.31 ## 8 1000 1 hdl 56 ## 9 1000 1 height 62 ## 10 1000 1 hip 38 ## # … with 6,035 more rows
# Scatter of every measured value per person, one panel per variable
# (each with its own y scale), coloured by the binary gender code.
# The x axis is the id factor, so its tick labels and title are hidden.
ggplot(
  data = diabetes_data_long,
  mapping = aes(x = id, y = value, colour = gender_bin)
) +
  geom_point() +
  facet_wrap(facets = ~vars, nrow = 5, scales = "free_y") +
  theme(axis.text.x = element_blank()) +
  labs(x = "")