March 9th 2020

Agenda

  • 08.00 - 08.30 Recap of exercises from last class
  • 08.30 - 09.00 Introduction to Scripting in a Reproducible and Collaborative Framework using GitHub via RStudio
  • 09.00 - 12.00 Exercises in Scripting in a Reproducible and Collaborative Framework using GitHub via RStudio

Many Models

T1: Re-create facetted weight ~ height plot

Load data

diabetes_data <- read_csv(file = "data/diabetes_metric.csv")
diabetes_data
## # A tibble: 403 x 19
##       id  chol stab.glu   hdl ratio glyhb location   age gender height
##    <dbl> <dbl>    <dbl> <dbl> <dbl> <dbl> <chr>    <dbl> <chr>   <dbl>
##  1  1000   203       82    56  3.60  4.31 Bucking…    46 female   157.
##  2  1001   165       97    24  6.90  4.44 Bucking…    29 female   163.
##  3  1002   228       92    37  6.20  4.64 Bucking…    58 female   155.
##  4  1003    78       93    12  6.5   4.63 Bucking…    67 male     170.
##  5  1005   249       90    28  8.90  7.72 Bucking…    64 male     173.
##  6  1008   248       94    69  3.60  4.81 Bucking…    34 male     180.
##  7  1011   195       92    41  4.80  4.84 Bucking…    30 male     175.
##  8  1015   227       75    44  5.20  3.94 Bucking…    37 male     150.
##  9  1016   177       87    49  3.60  4.84 Bucking…    45 male     175.
## 10  1022   263       89    40  6.60  5.78 Bucking…    55 female   160.
## # … with 393 more rows, and 9 more variables: weight <dbl>, frame <chr>,
## #   bp.1s <dbl>, bp.1d <dbl>, bp.2s <dbl>, bp.2d <dbl>, waist <dbl>,
## #   hip <dbl>, time.ppn <dbl>

T1: Re-create facetted weight ~ height plot

Plot

diabetes_data %>%
  ggplot(aes(x = height, y = weight)) +
  geom_point() +
  facet_grid(rows = vars(gender),
             cols = vars(location)) +
  geom_smooth(method = "lm")

T6: Re-create coefficient plot

Add mdls

get_mdls <- function(df){
  return( lm(weight ~ height, data = df) )
}
diabetes_data_mdl <- diabetes_data %>%
  group_by(location, gender) %>%
  nest %>%
  mutate(mdls = map(data, get_mdls))
diabetes_data_mdl
## # A tibble: 4 x 4
## # Groups:   location, gender [4]
##   location   gender            data mdls  
##   <chr>      <chr>  <list<df[,17]>> <list>
## 1 Buckingham female      [114 × 17] <lm>  
## 2 Buckingham male         [86 × 17] <lm>  
## 3 Louisa     female      [120 × 17] <lm>  
## 4 Louisa     male         [83 × 17] <lm>

T6: Re-create coefficient plot

Add terms and confidence intervals

diabetes_data_mdl <- diabetes_data_mdl %>% 
  mutate(tidy = map(mdls, tidy, conf.int = TRUE)) %>% 
  unnest(tidy)
diabetes_data_mdl %>% select(term:conf.high) # Note the notation for variable subset
## Adding missing grouping variables: `location`, `gender`
## # A tibble: 8 x 9
## # Groups:   location, gender [4]
##   location gender term  estimate std.error statistic p.value conf.low
##   <chr>    <chr>  <chr>    <dbl>     <dbl>     <dbl>   <dbl>    <dbl>
## 1 Bucking… female (Int…  -34.5      44.4      -0.778 4.39e-1 -1.22e+2
## 2 Bucking… female heig…    0.699     0.272     2.57  1.15e-2  1.60e-1
## 3 Bucking… male   (Int…   -9.57     42.2      -0.227 8.21e-1 -9.36e+1
## 4 Bucking… male   heig…    0.529     0.239     2.21  2.96e-2  5.36e-2
## 5 Louisa   female (Int…  -36.4      33.1      -1.10  2.74e-1 -1.02e+2
## 6 Louisa   female heig…    0.719     0.205     3.50  6.63e-4  3.12e-1
## 7 Louisa   male   (Int…  -40.1      49.3      -0.812 4.19e-1 -1.38e+2
## 8 Louisa   male   heig…    0.696     0.283     2.46  1.60e-2  1.33e-1
## # … with 1 more variable: conf.high <dbl>

T6: Re-create coefficient plot

Plot

diabetes_data_mdl %>% 
  filter(term == "height") %>% 
  ggplot(aes(x = estimate,
             y = reorder(str_c(location,
                               "_",
                               gender),
                         desc(estimate)))) +
  geom_point() +
  geom_errorbarh(aes(xmin = conf.low,
                     xmax = conf.high,
                     height = 0.2)) +
  labs(y = "",
       title = str_c("Estimated increase",
                     "in weight\nper 1 cm",
                     "gain in height")) +
  theme_bw()

Note the newline trick in the title and the re-ordering

Clustering and dimensionality reduction

Let us take a look at the BLOSUM62 matrix

Load the data

my_url <- "https://www.ncbi.nlm.nih.gov/Class/FieldGuide/BLOSUM62.txt"
bl62 <- read_table(file = my_url, comment = '#') %>%
  rename(aa = X1)
bl62
## # A tibble: 24 x 25
##    aa        A     R     N     D     C     Q     E     G     H     I     L
##    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 A         4    -1    -2    -2     0    -1    -1     0    -2    -1    -1
##  2 R        -1     5     0    -2    -3     1     0    -2     0    -3    -2
##  3 N        -2     0     6     1    -3     0     0     0     1    -3    -3
##  4 D        -2    -2     1     6    -3     0     2    -1    -1    -3    -4
##  5 C         0    -3    -3    -3     9    -3    -4    -3    -3    -1    -1
##  6 Q        -1     1     0     0    -3     5     2    -2     0    -3    -2
##  7 E        -1     0     0     2    -4     2     5    -2     0    -3    -3
##  8 G         0    -2     0    -1    -3    -2    -2     6    -2    -4    -4
##  9 H        -2     0     1    -1    -3     0     0    -2     8    -3    -3
## 10 I        -1    -3    -3    -3    -1    -3    -3    -4    -3     4     2
## # … with 14 more rows, and 13 more variables: K <dbl>, M <dbl>, F <dbl>,
## #   P <dbl>, S <dbl>, T <dbl>, W <dbl>, Y <dbl>, V <dbl>, B <dbl>,
## #   Z <dbl>, X <dbl>, `*` <dbl>

Let us take a look at the BLOSUM62 matrix

Wrangle and save the data

bl62 <- bl62 %>%
  select(aa:V) %>%
  slice(1:20) %>%
  write_tsv(path = "data/BLOSUM62_ncbi.tsv")
bl62
## # A tibble: 20 x 21
##    aa        A     R     N     D     C     Q     E     G     H     I     L
##    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 A         4    -1    -2    -2     0    -1    -1     0    -2    -1    -1
##  2 R        -1     5     0    -2    -3     1     0    -2     0    -3    -2
##  3 N        -2     0     6     1    -3     0     0     0     1    -3    -3
##  4 D        -2    -2     1     6    -3     0     2    -1    -1    -3    -4
##  5 C         0    -3    -3    -3     9    -3    -4    -3    -3    -1    -1
##  6 Q        -1     1     0     0    -3     5     2    -2     0    -3    -2
##  7 E        -1     0     0     2    -4     2     5    -2     0    -3    -3
##  8 G         0    -2     0    -1    -3    -2    -2     6    -2    -4    -4
##  9 H        -2     0     1    -1    -3     0     0    -2     8    -3    -3
## 10 I        -1    -3    -3    -3    -1    -3    -3    -4    -3     4     2
## 11 L        -1    -2    -3    -4    -1    -2    -3    -4    -3     2     4
## 12 K        -1     2     0    -1    -3     1     1    -2    -1    -3    -2
## 13 M        -1    -1    -2    -3    -1     0    -2    -3    -2     1     2
## 14 F        -2    -3    -3    -3    -2    -3    -3    -3    -1     0     0
## 15 P        -1    -2    -2    -1    -3    -1    -1    -2    -2    -3    -3
## 16 S         1    -1     1     0    -1     0     0     0    -1    -2    -2
## 17 T         0    -1     0    -1    -1    -1    -1    -2    -2    -1    -1
## 18 W        -3    -3    -4    -4    -2    -2    -3    -2    -2    -3    -2
## 19 Y        -2    -2    -2    -3    -2    -1    -2    -3     2    -1    -1
## 20 V         0    -3    -3    -3    -1    -2    -2    -3    -3     3     1
## # … with 9 more variables: K <dbl>, M <dbl>, F <dbl>, P <dbl>, S <dbl>,
## #   T <dbl>, W <dbl>, Y <dbl>, V <dbl>

Create a PCA object

bl62_pca <- bl62 %>%
  select(-aa) %>%
  prcomp(center = TRUE, scale = TRUE)
bl62_pca
## Standard deviations (1, .., p=20):
##  [1] 2.797941e+00 1.875925e+00 1.510557e+00 1.098029e+00 1.041474e+00
##  [6] 8.852846e-01 8.712434e-01 8.510922e-01 7.611395e-01 6.225610e-01
## [11] 5.456283e-01 3.885950e-01 3.548686e-01 2.932054e-01 2.456744e-01
## [16] 2.209702e-01 2.159492e-01 1.710774e-01 2.331238e-02 1.866763e-17
## 
## Rotation (n x k) = (20 x 20):
##           PC1         PC2          PC3         PC4          PC5
## A -0.02198349 -0.39538499 -0.139051080 -0.22657914 -0.263224481
## R -0.22754147  0.01561304  0.357636019 -0.11241231 -0.403375706
## N -0.28616836  0.01152475  0.029140421 -0.31756625  0.242245908
## D -0.28146092 -0.01625711 -0.051165304  0.06968447  0.444971920
## C  0.16270548 -0.18987820 -0.227480422 -0.15116482 -0.233942011
## Q -0.25263530  0.03559256  0.362042032  0.01254120 -0.123679874
## E -0.29426420  0.01575092  0.229898401  0.07052766  0.171531182
## G -0.18683236 -0.01174422 -0.406327779 -0.13346507 -0.159921857
## H -0.13915957  0.32247165  0.141902636 -0.28694892  0.183676582
## I  0.28852893 -0.21316474  0.181878822 -0.04193903  0.254405796
## L  0.28982857 -0.17427520  0.273377988 -0.05943833  0.004623112
## K -0.25294078 -0.05053338  0.344859006  0.01051131 -0.295041203
## M  0.22994912 -0.14396566  0.367768349 -0.02497593 -0.104716241
## F  0.26327526  0.24626518  0.042356907 -0.23942010  0.133850814
## P -0.16147420 -0.16577760 -0.042768816  0.54480795  0.100283867
## S -0.23675205 -0.24366002 -0.093519765 -0.40438304  0.032237595
## T -0.05400399 -0.33428257  0.008077785 -0.33907581  0.175000699
## W  0.15927635  0.33784422 -0.112173478 -0.02503267 -0.280376814
## Y  0.16793698  0.38269517  0.095713743 -0.25058485  0.086073161
## V  0.25186407 -0.28706514  0.179458469  0.01025022  0.210098977
##            PC6         PC7          PC8         PC9        PC10
## A  0.024391248 -0.15166178 -0.414285750  0.40156077 -0.14180645
## R -0.082115015 -0.08895210  0.008088644 -0.24493802 -0.25257521
## N -0.004966949 -0.14068683  0.183809121 -0.36611315  0.20004984
## D  0.028503803  0.03630896  0.297241397  0.29928610 -0.01237390
## C -0.667375217  0.07528443  0.301942413  0.19556260  0.15759459
## Q  0.051805646  0.12467347  0.013973825  0.33591873  0.31517595
## E  0.083024380  0.09011289  0.053578299  0.43968482 -0.15791385
## G  0.465000574 -0.34965648 -0.065100815 -0.07179044  0.09511342
## H -0.374145722 -0.22567256 -0.331133363 -0.06758526  0.09696424
## I  0.049650277 -0.16300840  0.019120153  0.00379726 -0.12324843
## L  0.081055788 -0.08801339  0.048412891 -0.07501076  0.10669709
## K  0.016193944  0.02132172  0.013797401 -0.05152272 -0.24346834
## M  0.177017459 -0.05506614  0.084656530 -0.02470751  0.59136686
## F  0.138762983  0.03281329 -0.142590642  0.12354320 -0.01467182
## P -0.152539038  0.16921662 -0.558165679 -0.17744924  0.29321770
## S  0.074021157  0.11082446 -0.104405603  0.12745397  0.32801026
## T  0.049477089  0.64309889 -0.089664584 -0.30648839 -0.17741155
## W  0.272044306  0.48395806  0.053197948  0.07886306  0.08380288
## Y -0.080057336  0.08577378 -0.351416052  0.15025856 -0.02897764
## V  0.095643147 -0.10251361 -0.104067646  0.07986799 -0.19919881
##           PC11        PC12        PC13        PC14         PC15
## A -0.063108456  0.18462817 -0.07299429  0.15214090 -0.329038142
## R  0.030472748  0.38528503  0.21871103  0.05586227 -0.264625626
## N  0.006241077 -0.25295051  0.05484400  0.36372201 -0.346001160
## D  0.128800583  0.20188813  0.04263511 -0.19325592 -0.357970408
## C  0.018665519 -0.16293756  0.13826205 -0.15754646  0.009835422
## Q -0.287080555 -0.09484809  0.45137837  0.18121757  0.186737014
## E -0.083711130  0.04022274 -0.16816406 -0.02666800  0.197606900
## G -0.162028832 -0.13206453  0.22378957 -0.30968142  0.191806496
## H -0.356180222  0.14162508 -0.37253339 -0.27709690  0.105173989
## I -0.048822141 -0.08315912 -0.07837502  0.15604097 -0.111018057
## L  0.065413578  0.16154141 -0.12753030  0.32147350  0.418022399
## K  0.373280975 -0.50338323 -0.24824886 -0.29566568  0.034941489
## M -0.021609948  0.10697778 -0.05678931 -0.42584666 -0.269936441
## F  0.534086971  0.22834347  0.22044150 -0.19590712  0.018260224
## P  0.157129328 -0.03198195  0.05268734  0.01901796 -0.109255616
## S  0.336221614 -0.07254150 -0.28962862  0.16794721  0.146172198
## T -0.192646507  0.11345774  0.17014048 -0.24546037  0.134813435
## W -0.225870481 -0.07146648 -0.37387252  0.12814446 -0.310921776
## Y  0.087896091 -0.33902867  0.32869186  0.09985199 -0.109947805
## V -0.265073047 -0.38583329  0.05224703 -0.15161307 -0.173936581
##          PC16         PC17         PC18        PC19        PC20
## A -0.23464235  0.236983563 -0.144106847  0.09712420 -0.10202189
## R  0.22601683 -0.191364864  0.261643801 -0.15323716  0.23711023
## N -0.44996175  0.020002484 -0.029530779  0.00416425  0.08447424
## D  0.29523859  0.418434056 -0.027737591 -0.09391774  0.19578935
## C -0.10000496 -0.033748261  0.130759970  0.02404972  0.32107817
## Q  0.09075458 -0.034165963 -0.420417243  0.05376834  0.07661953
## E -0.43206777 -0.240091948  0.499772637  0.09672172  0.09822353
## G  0.03842400  0.068158734  0.181547905  0.09476928  0.35356820
## H  0.04961494  0.045662238 -0.164916077 -0.01346186  0.14292879
## I  0.29386006 -0.229938958 -0.032716963  0.66265301  0.30922992
## L -0.06397791  0.525505129  0.132732334 -0.20434795  0.33062881
## K -0.02131906  0.243640204 -0.197164105  0.13287963  0.07207959
## M -0.09228723  0.024645413  0.202077721  0.10376690 -0.24004250
## F -0.30073959 -0.232314713 -0.297386138 -0.06207960  0.27470234
## P -0.05902601 -0.012758186  0.089101639  0.01593712  0.32279888
## S  0.40675640 -0.303853910  0.097051288 -0.19209227 -0.05394004
## T -0.05508884  0.111343805 -0.001829995  0.12848334 -0.02999706
## W  0.01146515  0.002576573 -0.027562418 -0.03934627  0.36459698
## Y  0.18055471  0.271796810  0.446011947  0.07206397 -0.14813975
## V  0.02839452 -0.208127062 -0.038369288 -0.60225588  0.13323506

Make a scree plot using broom to tidy

bl62_pca %>%
  tidy("pcs") %>% 
  ggplot(aes(x = PC, y = percent)) +
  geom_col() +
  theme_bw()

Augment using broom

bl62_pca_aug <- bl62_pca %>%
  augment(bl62)
bl62_pca_aug
## # A tibble: 20 x 42
##    .rownames aa        A     R     N     D     C     Q     E     G     H
##    <fct>     <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 1         A         4    -1    -2    -2     0    -1    -1     0    -2
##  2 2         R        -1     5     0    -2    -3     1     0    -2     0
##  3 3         N        -2     0     6     1    -3     0     0     0     1
##  4 4         D        -2    -2     1     6    -3     0     2    -1    -1
##  5 5         C         0    -3    -3    -3     9    -3    -4    -3    -3
##  6 6         Q        -1     1     0     0    -3     5     2    -2     0
##  7 7         E        -1     0     0     2    -4     2     5    -2     0
##  8 8         G         0    -2     0    -1    -3    -2    -2     6    -2
##  9 9         H        -2     0     1    -1    -3     0     0    -2     8
## 10 10        I        -1    -3    -3    -3    -1    -3    -3    -4    -3
## 11 11        L        -1    -2    -3    -4    -1    -2    -3    -4    -3
## 12 12        K        -1     2     0    -1    -3     1     1    -2    -1
## 13 13        M        -1    -1    -2    -3    -1     0    -2    -3    -2
## 14 14        F        -2    -3    -3    -3    -2    -3    -3    -3    -1
## 15 15        P        -1    -2    -2    -1    -3    -1    -1    -2    -2
## 16 16        S         1    -1     1     0    -1     0     0     0    -1
## 17 17        T         0    -1     0    -1    -1    -1    -1    -2    -2
## 18 18        W        -3    -3    -4    -4    -2    -2    -3    -2    -2
## 19 19        Y        -2    -2    -2    -3    -2    -1    -2    -3     2
## 20 20        V         0    -3    -3    -3    -1    -2    -2    -3    -3
## # … with 31 more variables: I <dbl>, L <dbl>, K <dbl>, M <dbl>, F <dbl>,
## #   P <dbl>, S <dbl>, T <dbl>, W <dbl>, Y <dbl>, V <dbl>,
## #   .fittedPC1 <dbl>, .fittedPC2 <dbl>, .fittedPC3 <dbl>,
## #   .fittedPC4 <dbl>, .fittedPC5 <dbl>, .fittedPC6 <dbl>,
## #   .fittedPC7 <dbl>, .fittedPC8 <dbl>, .fittedPC9 <dbl>,
## #   .fittedPC10 <dbl>, .fittedPC11 <dbl>, .fittedPC12 <dbl>,
## #   .fittedPC13 <dbl>, .fittedPC14 <dbl>, .fittedPC15 <dbl>,
## #   .fittedPC16 <dbl>, .fittedPC17 <dbl>, .fittedPC18 <dbl>,
## #   .fittedPC19 <dbl>, .fittedPC20 <dbl>

Add some chemical classes

get_chem_class <- function(x){
  chem_cols <- c("A" = "Hydrophobic", "R" = "Basic", "N" = "Neutral", "D" = "Acidic",
                 "C" = "sulphur", "Q" = "Neutral", "E" = "Acidic", "G" = "Polar",
                 "H" = "Basic", "I" = "Hydrophobic", "L" = "Hydrophobic", "K" = "Basic",
                 "M" = "sulphur", "F" = "Hydrophobic", "P" = "Hydrophobic", "S" = "Polar",
                 "T" = "Polar", "W" = "Hydrophobic", "Y" = "Polar", "V" = "Hydrophobic")
  return(factor(chem_cols[x]))
}
bl62_pca_aug <- bl62_pca_aug %>% 
  mutate(chem_class = get_chem_class(aa))
bl62_pca_aug %>% select(aa, chem_class)
## # A tibble: 20 x 2
##    aa    chem_class 
##    <chr> <fct>      
##  1 A     Hydrophobic
##  2 R     Basic      
##  3 N     Neutral    
##  4 D     Acidic     
##  5 C     sulphur    
##  6 Q     Neutral    
##  7 E     Acidic     
##  8 G     Polar      
##  9 H     Basic      
## 10 I     Hydrophobic
## 11 L     Hydrophobic
## 12 K     Basic      
## 13 M     sulphur    
## 14 F     Hydrophobic
## 15 P     Hydrophobic
## 16 S     Polar      
## 17 T     Polar      
## 18 W     Hydrophobic
## 19 Y     Polar      
## 20 V     Hydrophobic

Plot the PCA

bl62_pca_aug %>% 
  ggplot(aes(x = .fittedPC1, y = .fittedPC2, label = aa, colour = chem_class)) +
  geom_text() +
  theme(legend.position = "bottom")

Plot the PCA

bl62_pca_aug %>% pull(chem_class) %>% levels
## [1] "Acidic"      "Basic"       "Hydrophobic" "Neutral"     "Polar"      
## [6] "sulphur"
# Bad code style to make it fit on slide
x <- bl62_pca %>% tidy("pcs") %>% filter(PC==1) %>% pull(percent)
x <- str_c("PC1 (", round(x*100, 2), "%)")
y <- bl62_pca %>% tidy("pcs") %>% filter(PC==2) %>% pull(percent)
y <- str_c("PC2 (", round(y*100, 2), "%)")
bl62_pca_aug %>% 
  ggplot(aes(x = .fittedPC1, y = .fittedPC2,
             label = aa, colour = chem_class)) +
  geom_label_repel() + # library("ggrepel") trick here
  theme(legend.position = "bottom") +
  scale_colour_manual(values = c("red", "blue", "black",
                                 "purple", "green", "yellow")) +
  labs(x = x, y = y)

Plot the PCA

Up: Aromatic, down: Aliphatic, Left: Charged, Right: Hydrophobic

Add k-means

bl62 %>%
  select(-aa) %>%
  kmeans(centers = 6, iter.max = 1000, nstart = 10) %>%
  augment(bl62_pca_aug) %>% 
  head
## # A tibble: 6 x 44
##   .rownames aa        A     R     N     D     C     Q     E     G     H
##   <fct>     <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1         A         4    -1    -2    -2     0    -1    -1     0    -2
## 2 2         R        -1     5     0    -2    -3     1     0    -2     0
## 3 3         N        -2     0     6     1    -3     0     0     0     1
## 4 4         D        -2    -2     1     6    -3     0     2    -1    -1
## 5 5         C         0    -3    -3    -3     9    -3    -4    -3    -3
## 6 6         Q        -1     1     0     0    -3     5     2    -2     0
## # … with 33 more variables: I <dbl>, L <dbl>, K <dbl>, M <dbl>, F <dbl>,
## #   P <dbl>, S <dbl>, T <dbl>, W <dbl>, Y <dbl>, V <dbl>,
## #   .fittedPC1 <dbl>, .fittedPC2 <dbl>, .fittedPC3 <dbl>,
## #   .fittedPC4 <dbl>, .fittedPC5 <dbl>, .fittedPC6 <dbl>,
## #   .fittedPC7 <dbl>, .fittedPC8 <dbl>, .fittedPC9 <dbl>,
## #   .fittedPC10 <dbl>, .fittedPC11 <dbl>, .fittedPC12 <dbl>,
## #   .fittedPC13 <dbl>, .fittedPC14 <dbl>, .fittedPC15 <dbl>,
## #   .fittedPC16 <dbl>, .fittedPC17 <dbl>, .fittedPC18 <dbl>,
## #   .fittedPC19 <dbl>, .fittedPC20 <dbl>, chem_class <fct>, .cluster <fct>

Add k-means

Questions?