22100 - R for Bio Data Science

March 2nd 2020

Agenda

08.00 - 08.30 Recap of exercises from last class
08.30 - 09.00 Introduction to modelling, dimension reduction and clustering
09.00 - 12.00 Exercises in Modelling, dimension reduction and clustering

First of all

The Components of a plot

Joins

Pivoting

Exercises: Data Manipulation II

Q4

Use the base function sample() in conjunction with str_c() to create a function that returns a random DNA string of length n. Run the function with n = 100 and save the output to my_dna - What fraction of the DNA you created is adenine?

# Create a random DNA sequence.
#
# n: number of bases to draw.
# Returns a single string of length n made up of the letters A, C, G and T,
# each position drawn independently (sampling with replacement).
mk_dna <- function(n){
  bases <- c("A", "C", "G", "T")
  # collapse = "" glues the n sampled letters into one string
  str_c(sample(bases, size = n, replace = TRUE), collapse = "")
}

Q4

Use the base function sample() in conjunction with str_c() to create a function that returns a random DNA string of length n. Run the function with n = 100 and save the output to my_dna - What fraction of the DNA you created is adenine?

# Draw one random sequence of 100 bases and show it.
my_dna <- mk_dna(100)
print(my_dna)

## [1] "CAGTAACGAAACTTATATTTTGGCGATGTTTAAACTCTTTAATGGGTGTGGGACCCGACTGGGATATCCTGACAACCTGCGGGAACCCGGCCCGGGTGCA"

# Fraction of adenine: number of "A" divided by the actual sequence length,
# so the result stays correct if my_dna is regenerated with another n.
str_count(my_dna, "A") / str_length(my_dna)

## [1] 0.24

Q5

Use the appropriate str_* function to change my_dna to my_rna - How many start codons did you get?

# Turn the DNA string into RNA by replacing every T with U, and store the
# result as my_rna, as the exercise asks, before printing it.
my_rna <- my_dna %>%
  str_replace_all("T", "U")
my_rna

## [1] "CAGUAACGAAACUUAUAUUUUGGCGAUGUUUAAACUCUUUAAUGGGUGUGGGACCCGACUGGGAUAUCCUGACAACCUGCGGGAACCCGGCCCGGGUGCA"

# Number of start codons: occurrences of "AUG" after replacing T with U.
str_count(str_replace_all(my_dna, "T", "U"), "AUG")

## [1] 2

# Start and end position of every "AUG" after replacing T with U.
str_locate_all(str_replace_all(my_dna, "T", "U"), "AUG")

## [[1]]
##      start end
## [1,]    26  28
## [2,]    42  44

Q9

# A factor built from the 26 upper-case letters: one level per letter.
factor(x = LETTERS)

##  [1] A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
## Levels: A B C D E F G H I J K L M N O P Q R S T U V W X Y Z

# Repeating the letters 10 times gives 260 values but still only 26 levels.
factor(x = rep(LETTERS, times = 10))

##   [1] A B C D E F G H I J K L M N O P Q R S T U V W X Y Z A B C D E F G H I
##  [36] J K L M N O P Q R S T U V W X Y Z A B C D E F G H I J K L M N O P Q R
##  [71] S T U V W X Y Z A B C D E F G H I J K L M N O P Q R S T U V W X Y Z A
## [106] B C D E F G H I J K L M N O P Q R S T U V W X Y Z A B C D E F G H I J
## [141] K L M N O P Q R S T U V W X Y Z A B C D E F G H I J K L M N O P Q R S
## [176] T U V W X Y Z A B C D E F G H I J K L M N O P Q R S T U V W X Y Z A B
## [211] C D E F G H I J K L M N O P Q R S T U V W X Y Z A B C D E F G H I J K
## [246] L M N O P Q R S T U V W X Y Z
## Levels: A B C D E F G H I J K L M N O P Q R S T U V W X Y Z

Q10

# Reversing the values does not reverse the levels: without an explicit
# levels argument the levels are the sorted unique values.
factor(x = rev(LETTERS))

##  [1] Z Y X W V U T S R Q P O N M L K J I H G F E D C B A
## Levels: A B C D E F G H I J K L M N O P Q R S T U V W X Y Z

# Reversed values with the levels given explicitly in A-to-Z order.
factor(x = rev(LETTERS), levels = LETTERS)

##  [1] Z Y X W V U T S R Q P O N M L K J I H G F E D C B A
## Levels: A B C D E F G H I J K L M N O P Q R S T U V W X Y Z

# Reversed values with the levels also reversed: the level order is Z to A.
factor(x = rev(LETTERS), levels = rev(LETTERS))

##  [1] Z Y X W V U T S R Q P O N M L K J I H G F E D C B A
## Levels: Z Y X W V U T S R Q P O N M L K J I H G F E D C B A

Q13 Recreate the diabetes data by multiple joins

# Full paths of the per-variable CSV files. The pattern limits the listing
# to names ending in ".csv", so any other file that ends up in the folder
# is not passed on to read_csv() in the next step.
my_files <- list.files(path = "data/diabetes_csv_files",
                       pattern = "\\.csv$",
                       full.names = TRUE)
my_files

##  [1] "data/diabetes_csv_files/04_diabetes_id_age.csv"     
##  [2] "data/diabetes_csv_files/04_diabetes_id_bp.1d.csv"   
##  [3] "data/diabetes_csv_files/04_diabetes_id_bp.1s.csv"   
##  [4] "data/diabetes_csv_files/04_diabetes_id_bp.2d.csv"   
##  [5] "data/diabetes_csv_files/04_diabetes_id_bp.2s.csv"   
##  [6] "data/diabetes_csv_files/04_diabetes_id_chol.csv"    
##  [7] "data/diabetes_csv_files/04_diabetes_id_frame.csv"   
##  [8] "data/diabetes_csv_files/04_diabetes_id_gender.csv"  
##  [9] "data/diabetes_csv_files/04_diabetes_id_glyhb.csv"   
## [10] "data/diabetes_csv_files/04_diabetes_id_hdl.csv"     
## [11] "data/diabetes_csv_files/04_diabetes_id_height.csv"  
## [12] "data/diabetes_csv_files/04_diabetes_id_hip.csv"     
## [13] "data/diabetes_csv_files/04_diabetes_id_location.csv"
## [14] "data/diabetes_csv_files/04_diabetes_id_ratio.csv"   
## [15] "data/diabetes_csv_files/04_diabetes_id_stab.glu.csv"
## [16] "data/diabetes_csv_files/04_diabetes_id_time.ppn.csv"
## [17] "data/diabetes_csv_files/04_diabetes_id_waist.csv"   
## [18] "data/diabetes_csv_files/04_diabetes_id_weight.csv"

Q13 Recreate the diabetes data by multiple joins

# Read every file into a tibble. The list elements are named by file path,
# so each table can be traced back to the file it came from.
X <- map(setNames(my_files, my_files), read_csv)
X

## $`data/diabetes_csv_files/04_diabetes_id_age.csv`
## # A tibble: 403 x 2
##       id   age
##    <dbl> <dbl>
##  1  1000    46
##  2  1001    29
##  3  1002    58
##  4  1003    67
##  5  1005    64
##  6  1008    34
##  7  1011    30
##  8  1015    37
##  9  1016    45
## 10  1022    55
## # … with 393 more rows
## 
## $`data/diabetes_csv_files/04_diabetes_id_bp.1d.csv`
## # A tibble: 398 x 2
##       id bp.1d
##    <dbl> <dbl>
##  1  1000    59
##  2  1001    68
##  3  1002    92
##  4  1003    50
##  5  1005    80
##  6  1008    86
##  7  1011   112
##  8  1016    80
##  9  1022    72
## 10  1024    90
## # … with 388 more rows
## 
## $`data/diabetes_csv_files/04_diabetes_id_bp.1s.csv`
## # A tibble: 398 x 2
##       id bp.1s
##    <dbl> <dbl>
##  1  1000   118
##  2  1001   112
##  3  1002   190
##  4  1003   110
##  5  1005   138
##  6  1008   132
##  7  1011   161
##  8  1016   160
##  9  1022   108
## 10  1024   130
## # … with 388 more rows
## 
## $`data/diabetes_csv_files/04_diabetes_id_bp.2d.csv`
## # A tibble: 141 x 2
##       id bp.2d
##    <dbl> <dbl>
##  1  1002    92
##  2  1011   112
##  3  1016    86
##  4  1024    90
##  5  1036    96
##  6  1252    84
##  7  1253   110
##  8  1256    88
##  9  1271    70
## 10  1285   112
## # … with 131 more rows
## 
## $`data/diabetes_csv_files/04_diabetes_id_bp.2s.csv`
## # A tibble: 141 x 2
##       id bp.2s
##    <dbl> <dbl>
##  1  1002   185
##  2  1011   161
##  3  1016   128
##  4  1024   130
##  5  1036   120
##  6  1252   148
##  7  1253   149
##  8  1256   160
##  9  1271   110
## 10  1285   170
## # … with 131 more rows
## 
## $`data/diabetes_csv_files/04_diabetes_id_chol.csv`
## # A tibble: 402 x 2
##       id  chol
##    <dbl> <dbl>
##  1  1000   203
##  2  1001   165
##  3  1002   228
##  4  1003    78
##  5  1005   249
##  6  1008   248
##  7  1011   195
##  8  1015   227
##  9  1016   177
## 10  1022   263
## # … with 392 more rows
## 
## $`data/diabetes_csv_files/04_diabetes_id_frame.csv`
## # A tibble: 391 x 2
##       id frame 
##    <dbl> <chr> 
##  1  1000 medium
##  2  1001 large 
##  3  1002 large 
##  4  1003 large 
##  5  1005 medium
##  6  1008 large 
##  7  1011 medium
##  8  1015 medium
##  9  1016 large 
## 10  1022 small 
## # … with 381 more rows
## 
## $`data/diabetes_csv_files/04_diabetes_id_gender.csv`
## # A tibble: 403 x 2
##       id gender
##    <dbl> <chr> 
##  1  1000 female
##  2  1001 female
##  3  1002 female
##  4  1003 male  
##  5  1005 male  
##  6  1008 male  
##  7  1011 male  
##  8  1015 male  
##  9  1016 male  
## 10  1022 female
## # … with 393 more rows
## 
## $`data/diabetes_csv_files/04_diabetes_id_glyhb.csv`
## # A tibble: 390 x 2
##       id glyhb
##    <dbl> <dbl>
##  1  1000  4.31
##  2  1001  4.44
##  3  1002  4.64
##  4  1003  4.63
##  5  1005  7.72
##  6  1008  4.81
##  7  1011  4.84
##  8  1015  3.94
##  9  1016  4.84
## 10  1022  5.78
## # … with 380 more rows
## 
## $`data/diabetes_csv_files/04_diabetes_id_hdl.csv`
## # A tibble: 402 x 2
##       id   hdl
##    <dbl> <dbl>
##  1  1000    56
##  2  1001    24
##  3  1002    37
##  4  1003    12
##  5  1005    28
##  6  1008    69
##  7  1011    41
##  8  1015    44
##  9  1016    49
## 10  1022    40
## # … with 392 more rows
## 
## $`data/diabetes_csv_files/04_diabetes_id_height.csv`
## # A tibble: 398 x 2
##       id height
##    <dbl>  <dbl>
##  1  1000     62
##  2  1001     64
##  3  1002     61
##  4  1003     67
##  5  1005     68
##  6  1008     71
##  7  1011     69
##  8  1015     59
##  9  1016     69
## 10  1022     63
## # … with 388 more rows
## 
## $`data/diabetes_csv_files/04_diabetes_id_hip.csv`
## # A tibble: 401 x 2
##       id   hip
##    <dbl> <dbl>
##  1  1000    38
##  2  1001    48
##  3  1002    57
##  4  1003    38
##  5  1005    41
##  6  1008    42
##  7  1011    49
##  8  1015    39
##  9  1016    40
## 10  1022    50
## # … with 391 more rows
## 
## $`data/diabetes_csv_files/04_diabetes_id_location.csv`
## # A tibble: 403 x 2
##       id location  
##    <dbl> <chr>     
##  1  1000 Buckingham
##  2  1001 Buckingham
##  3  1002 Buckingham
##  4  1003 Buckingham
##  5  1005 Buckingham
##  6  1008 Buckingham
##  7  1011 Buckingham
##  8  1015 Buckingham
##  9  1016 Buckingham
## 10  1022 Buckingham
## # … with 393 more rows
## 
## $`data/diabetes_csv_files/04_diabetes_id_ratio.csv`
## # A tibble: 402 x 2
##       id ratio
##    <dbl> <dbl>
##  1  1000  3.60
##  2  1001  6.90
##  3  1002  6.20
##  4  1003  6.5 
##  5  1005  8.90
##  6  1008  3.60
##  7  1011  4.80
##  8  1015  5.20
##  9  1016  3.60
## 10  1022  6.60
## # … with 392 more rows
## 
## $`data/diabetes_csv_files/04_diabetes_id_stab.glu.csv`
## # A tibble: 403 x 2
##       id stab.glu
##    <dbl>    <dbl>
##  1  1000       82
##  2  1001       97
##  3  1002       92
##  4  1003       93
##  5  1005       90
##  6  1008       94
##  7  1011       92
##  8  1015       75
##  9  1016       87
## 10  1022       89
## # … with 393 more rows
## 
## $`data/diabetes_csv_files/04_diabetes_id_time.ppn.csv`
## # A tibble: 400 x 2
##       id time.ppn
##    <dbl>    <dbl>
##  1  1000      720
##  2  1001      360
##  3  1002      180
##  4  1003      480
##  5  1005      300
##  6  1008      195
##  7  1011      720
##  8  1015     1020
##  9  1016      300
## 10  1022      240
## # … with 390 more rows
## 
## $`data/diabetes_csv_files/04_diabetes_id_waist.csv`
## # A tibble: 401 x 2
##       id waist
##    <dbl> <dbl>
##  1  1000    29
##  2  1001    46
##  3  1002    49
##  4  1003    33
##  5  1005    44
##  6  1008    36
##  7  1011    46
##  8  1015    34
##  9  1016    34
## 10  1022    45
## # … with 391 more rows
## 
## $`data/diabetes_csv_files/04_diabetes_id_weight.csv`
## # A tibble: 402 x 2
##       id weight
##    <dbl>  <dbl>
##  1  1000    121
##  2  1001    218
##  3  1002    256
##  4  1003    119
##  5  1005    183
##  6  1008    190
##  7  1011    191
##  8  1015    170
##  9  1016    166
## 10  1022    202
## # … with 392 more rows

Q13 Recreate the diabetes data by multiple joins

# Rows and columns of each table; the row counts differ between files.
map(X, dim)

## $`data/diabetes_csv_files/04_diabetes_id_age.csv`
## [1] 403   2
## 
## $`data/diabetes_csv_files/04_diabetes_id_bp.1d.csv`
## [1] 398   2
## 
## $`data/diabetes_csv_files/04_diabetes_id_bp.1s.csv`
## [1] 398   2
## 
## $`data/diabetes_csv_files/04_diabetes_id_bp.2d.csv`
## [1] 141   2
## 
## $`data/diabetes_csv_files/04_diabetes_id_bp.2s.csv`
## [1] 141   2
## 
## $`data/diabetes_csv_files/04_diabetes_id_chol.csv`
## [1] 402   2
## 
## $`data/diabetes_csv_files/04_diabetes_id_frame.csv`
## [1] 391   2
## 
## $`data/diabetes_csv_files/04_diabetes_id_gender.csv`
## [1] 403   2
## 
## $`data/diabetes_csv_files/04_diabetes_id_glyhb.csv`
## [1] 390   2
## 
## $`data/diabetes_csv_files/04_diabetes_id_hdl.csv`
## [1] 402   2
## 
## $`data/diabetes_csv_files/04_diabetes_id_height.csv`
## [1] 398   2
## 
## $`data/diabetes_csv_files/04_diabetes_id_hip.csv`
## [1] 401   2
## 
## $`data/diabetes_csv_files/04_diabetes_id_location.csv`
## [1] 403   2
## 
## $`data/diabetes_csv_files/04_diabetes_id_ratio.csv`
## [1] 402   2
## 
## $`data/diabetes_csv_files/04_diabetes_id_stab.glu.csv`
## [1] 403   2
## 
## $`data/diabetes_csv_files/04_diabetes_id_time.ppn.csv`
## [1] 400   2
## 
## $`data/diabetes_csv_files/04_diabetes_id_waist.csv`
## [1] 401   2
## 
## $`data/diabetes_csv_files/04_diabetes_id_weight.csv`
## [1] 402   2

Q13 Recreate the diabetes data by multiple joins

# Combine all tables into one by successively full-joining them on id.
# A full join keeps every id; variables missing for an id become NA.
diabetes_data <- reduce(X, full_join, by = "id")
diabetes_data

## # A tibble: 403 x 19
##       id   age bp.1d bp.1s bp.2d bp.2s  chol frame gender glyhb   hdl
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>  <dbl> <dbl>
##  1  1000    46    59   118    NA    NA   203 medi… female  4.31    56
##  2  1001    29    68   112    NA    NA   165 large female  4.44    24
##  3  1002    58    92   190    92   185   228 large female  4.64    37
##  4  1003    67    50   110    NA    NA    78 large male    4.63    12
##  5  1005    64    80   138    NA    NA   249 medi… male    7.72    28
##  6  1008    34    86   132    NA    NA   248 large male    4.81    69
##  7  1011    30   112   161   112   161   195 medi… male    4.84    41
##  8  1015    37    NA    NA    NA    NA   227 medi… male    3.94    44
##  9  1016    45    80   160    86   128   177 large male    4.84    49
## 10  1022    55    72   108    NA    NA   263 small female  5.78    40
## # … with 393 more rows, and 8 more variables: height <dbl>, hip <dbl>,
## #   location <chr>, ratio <dbl>, stab.glu <dbl>, time.ppn <dbl>,
## #   waist <dbl>, weight <dbl>

Q13 Recreate the diabetes data by multiple joins

# The whole reconstruction as a single pipeline: list the CSV files, read
# each one, full-join everything on id and arrange the columns explicitly.
"data/diabetes_csv_files" %>%
  # "\\.csv$" matches the file extension; the unescaped "csv$" would also
  # accept any file whose name merely ends in the letters "csv".
  list.files(pattern = "\\.csv$", full.names = TRUE) %>%
  # No naming step is needed here: reduce() collapses the list into a single
  # table, so list names would be discarded anyway.
  map(read_csv) %>%
  reduce(full_join, by = "id") %>%
  select(id, chol, stab.glu, hdl, ratio, glyhb, location, age, gender, height,
         weight, frame, bp.1s, bp.1d, bp.2s, bp.2d, waist, hip, time.ppn)

## # A tibble: 403 x 19
##       id  chol stab.glu   hdl ratio glyhb location   age gender height
##    <dbl> <dbl>    <dbl> <dbl> <dbl> <dbl> <chr>    <dbl> <chr>   <dbl>
##  1  1000   203       82    56  3.60  4.31 Bucking…    46 female     62
##  2  1001   165       97    24  6.90  4.44 Bucking…    29 female     64
##  3  1002   228       92    37  6.20  4.64 Bucking…    58 female     61
##  4  1003    78       93    12  6.5   4.63 Bucking…    67 male       67
##  5  1005   249       90    28  8.90  7.72 Bucking…    64 male       68
##  6  1008   248       94    69  3.60  4.81 Bucking…    34 male       71
##  7  1011   195       92    41  4.80  4.84 Bucking…    30 male       69
##  8  1015   227       75    44  5.20  3.94 Bucking…    37 male       59
##  9  1016   177       87    49  3.60  4.84 Bucking…    45 male       69
## 10  1022   263       89    40  6.60  5.78 Bucking…    55 female     63
## # … with 393 more rows, and 9 more variables: weight <dbl>, frame <chr>,
## #   bp.1s <dbl>, bp.1d <dbl>, bp.2s <dbl>, bp.2d <dbl>, waist <dbl>,
## #   hip <dbl>, time.ppn <dbl>

Q13 Recreate the diabetes data by multiple joins

# Read the complete data set from its single file, for comparison with the
# table rebuilt from the joins.
read_csv("data/diabetes.csv")

## # A tibble: 403 x 19
##       id  chol stab.glu   hdl ratio glyhb location   age gender height
##    <dbl> <dbl>    <dbl> <dbl> <dbl> <dbl> <chr>    <dbl> <chr>   <dbl>
##  1  1000   203       82    56  3.60  4.31 Bucking…    46 female     62
##  2  1001   165       97    24  6.90  4.44 Bucking…    29 female     64
##  3  1002   228       92    37  6.20  4.64 Bucking…    58 female     61
##  4  1003    78       93    12  6.5   4.63 Bucking…    67 male       67
##  5  1005   249       90    28  8.90  7.72 Bucking…    64 male       68
##  6  1008   248       94    69  3.60  4.81 Bucking…    34 male       71
##  7  1011   195       92    41  4.80  4.84 Bucking…    30 male       69
##  8  1015   227       75    44  5.20  3.94 Bucking…    37 male       59
##  9  1016   177       87    49  3.60  4.84 Bucking…    45 male       69
## 10  1022   263       89    40  6.60  5.78 Bucking…    55 female     63
## # … with 393 more rows, and 9 more variables: weight <dbl>, frame <chr>,
## #   bp.1s <dbl>, bp.1d <dbl>, bp.2s <dbl>, bp.2d <dbl>, waist <dbl>,
## #   hip <dbl>, time.ppn <dbl>

Q14-19 long/wide data

# Long format: keep the numeric columns and stack every variable except id
# into a variable-name column (vars) and a value column (value).
diabetes_data_long <- diabetes_data %>%
  select_if(is.numeric) %>%
  pivot_longer(cols = -id, names_to = "vars", values_to = "value")

Q14-19 long/wide data

# Long format that keeps gender next to id. Gender is first recoded to a
# numeric column (female = 1, male = 0) so that it passes the numeric-only
# selection; it is then excluded from the pivot together with id.
diabetes_data_long <- diabetes_data %>%
  mutate(
    gender_bin = case_when(
      gender == "female" ~ 1,
      gender == "male" ~ 0
    )
  ) %>%
  select_if(is.numeric) %>%
  pivot_longer(
    cols = -c(id, gender_bin),
    names_to = "vars",
    values_to = "value"
  ) %>%
  # id and gender_bin identify rows rather than measure anything,
  # so they are stored as factors
  mutate(id = factor(id)) %>%
  mutate(gender_bin = factor(gender_bin))
diabetes_data_long

## # A tibble: 6,045 x 4
##    id    gender_bin vars    value
##    <fct> <fct>      <chr>   <dbl>
##  1 1000  1          age     46   
##  2 1000  1          bp.1d   59   
##  3 1000  1          bp.1s  118   
##  4 1000  1          bp.2d   NA   
##  5 1000  1          bp.2s   NA   
##  6 1000  1          chol   203   
##  7 1000  1          glyhb    4.31
##  8 1000  1          hdl     56   
##  9 1000  1          height  62   
## 10 1000  1          hip     38   
## # … with 6,035 more rows

Q14-19 long/wide data

# One panel per variable, each with its own y-scale, points coloured by
# gender_bin. The x-axis carries the ids, so its title and tick labels
# are left out.
ggplot(data = diabetes_data_long,
       mapping = aes(x = id, y = value, colour = gender_bin)) +
  geom_point() +
  facet_wrap(~vars, nrow = 5, scales = "free_y") +
  theme(axis.text.x = element_blank()) +
  labs(x = "")