March 2nd 2020

Agenda

  • 08.00 - 08.30 Recap of exercises from last class
  • 08.30 - 09.00 Introduction to modelling, dimension reduction and clustering
  • 09.00 - 12.00 Exercises in Modelling, dimension reduction and clustering

Dimensionality Reduction and Clustering

Example: The iris dataset

iris %>% as_tibble
## # A tibble: 150 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##           <dbl>       <dbl>        <dbl>       <dbl> <fct>  
##  1          5.1         3.5          1.4         0.2 setosa 
##  2          4.9         3            1.4         0.2 setosa 
##  3          4.7         3.2          1.3         0.2 setosa 
##  4          4.6         3.1          1.5         0.2 setosa 
##  5          5           3.6          1.4         0.2 setosa 
##  6          5.4         3.9          1.7         0.4 setosa 
##  7          4.6         3.4          1.4         0.3 setosa 
##  8          5           3.4          1.5         0.2 setosa 
##  9          4.4         2.9          1.4         0.2 setosa 
## 10          4.9         3.1          1.5         0.1 setosa 
## # … with 140 more rows

Why do we need dimensionality reduction?

pl1 <- iris %>%
  as_tibble %>% 
  ggplot(aes(x = Sepal.Length, y = Sepal.Width, colour = Species)) +
  geom_point() +
  theme_bw() +
  theme(legend.position = "none")

pl2 <- iris %>%
  as_tibble %>% 
  ggplot(aes(x = Petal.Length, y = Petal.Width, colour = Species)) +
  geom_point() +
  theme_bw() +
  theme(legend.position = "none")

pl3 <- iris %>%
  as_tibble %>% 
  ggplot(aes(x = Sepal.Length, y = Petal.Width, colour = Species)) +
  geom_point() +
  theme_bw() +
  theme(legend.position = "none")

pl4 <- iris %>%
  as_tibble %>% 
  ggplot(aes(x = Petal.Length, y = Sepal.Width, colour = Species)) +
  geom_point() +
  theme_bw() +
  theme(legend.position = "none")

Why do we need dimensionality reduction?

( pl1 + pl2 ) / ( pl3 + pl4 ) # "patchwork" package trick

PCA on Iris

my_iris <- iris %>%
  as_tibble %>%
  select(-Species)
my_iris
## # A tibble: 150 x 4
##    Sepal.Length Sepal.Width Petal.Length Petal.Width
##           <dbl>       <dbl>        <dbl>       <dbl>
##  1          5.1         3.5          1.4         0.2
##  2          4.9         3            1.4         0.2
##  3          4.7         3.2          1.3         0.2
##  4          4.6         3.1          1.5         0.2
##  5          5           3.6          1.4         0.2
##  6          5.4         3.9          1.7         0.4
##  7          4.6         3.4          1.4         0.3
##  8          5           3.4          1.5         0.2
##  9          4.4         2.9          1.4         0.2
## 10          4.9         3.1          1.5         0.1
## # … with 140 more rows

PCA on Iris

my_pca <- my_iris %>%
  prcomp(center = TRUE, scale. = TRUE)
my_pca
## Standard deviations (1, .., p=4):
## [1] 1.7083611 0.9560494 0.3830886 0.1439265
## 
## Rotation (n x k) = (4 x 4):
##                     PC1         PC2        PC3        PC4
## Sepal.Length  0.5210659 -0.37741762  0.7195664  0.2612863
## Sepal.Width  -0.2693474 -0.92329566 -0.2443818 -0.1235096
## Petal.Length  0.5804131 -0.02449161 -0.1421264 -0.8014492
## Petal.Width   0.5648565 -0.06694199 -0.6342727  0.5235971

Use broom to tidy

my_pca %>% tidy("pcs")
## # A tibble: 4 x 4
##      PC std.dev percent cumulative
##   <dbl>   <dbl>   <dbl>      <dbl>
## 1     1   1.71  0.730        0.730
## 2     2   0.956 0.229        0.958
## 3     3   0.383 0.0367       0.995
## 4     4   0.144 0.00518      1

Use broom to tidy

my_pca %>% tidy("pcs") %>% 
  ggplot(aes(x = PC, y = percent)) +
  geom_col() +
  theme_bw()

Use broom to tidy

my_pca %>% tidy("samples")
## # A tibble: 600 x 3
##      row    PC value
##    <int> <dbl> <dbl>
##  1     1     1 -2.26
##  2     2     1 -2.07
##  3     3     1 -2.36
##  4     4     1 -2.29
##  5     5     1 -2.38
##  6     6     1 -2.07
##  7     7     1 -2.44
##  8     8     1 -2.23
##  9     9     1 -2.33
## 10    10     1 -2.18
## # … with 590 more rows

Use broom to tidy and augment

my_pca_aug <- my_pca %>% augment(iris)
my_pca_aug
## # A tibble: 150 x 10
##    .rownames Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##    <fct>            <dbl>       <dbl>        <dbl>       <dbl> <fct>  
##  1 1                  5.1         3.5          1.4         0.2 setosa 
##  2 2                  4.9         3            1.4         0.2 setosa 
##  3 3                  4.7         3.2          1.3         0.2 setosa 
##  4 4                  4.6         3.1          1.5         0.2 setosa 
##  5 5                  5           3.6          1.4         0.2 setosa 
##  6 6                  5.4         3.9          1.7         0.4 setosa 
##  7 7                  4.6         3.4          1.4         0.3 setosa 
##  8 8                  5           3.4          1.5         0.2 setosa 
##  9 9                  4.4         2.9          1.4         0.2 setosa 
## 10 10                 4.9         3.1          1.5         0.1 setosa 
## # … with 140 more rows, and 4 more variables: .fittedPC1 <dbl>,
## #   .fittedPC2 <dbl>, .fittedPC3 <dbl>, .fittedPC4 <dbl>

Visualise

my_pca_aug %>% 
  ggplot(aes(x = .fittedPC1, y = .fittedPC2, colour = Species)) +
  geom_point()

Cluster

my_k_org <- my_pca_aug %>%
  select(contains("Sepal"), contains("Petal")) %>%
  kmeans(centers = 3)
my_k_org
## K-means clustering with 3 clusters of sizes 38, 62, 50
## 
## Cluster means:
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1     6.850000    3.073684     5.742105    2.071053
## 2     5.901613    2.748387     4.393548    1.433871
## 3     5.006000    3.428000     1.462000    0.246000
## 
## Clustering vector:
##   [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [36] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [71] 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 1 1 1
## [106] 1 2 1 1 1 1 1 1 2 2 1 1 1 1 2 1 2 1 2 1 1 2 2 1 1 1 1 1 2 1 1 1 1 2 1
## [141] 1 1 2 1 1 1 2 1 1 2
## 
## Within cluster sum of squares by cluster:
## [1] 23.87947 39.82097 15.15100
##  (between_SS / total_SS =  88.4 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

Cluster

my_pca_aug_k_org <- my_k_org %>%
  augment(my_pca_aug) %>% 
  rename(cluster_org = .cluster)
my_pca_aug_k_org
## # A tibble: 150 x 11
##    .rownames Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##    <fct>            <dbl>       <dbl>        <dbl>       <dbl> <fct>  
##  1 1                  5.1         3.5          1.4         0.2 setosa 
##  2 2                  4.9         3            1.4         0.2 setosa 
##  3 3                  4.7         3.2          1.3         0.2 setosa 
##  4 4                  4.6         3.1          1.5         0.2 setosa 
##  5 5                  5           3.6          1.4         0.2 setosa 
##  6 6                  5.4         3.9          1.7         0.4 setosa 
##  7 7                  4.6         3.4          1.4         0.3 setosa 
##  8 8                  5           3.4          1.5         0.2 setosa 
##  9 9                  4.4         2.9          1.4         0.2 setosa 
## 10 10                 4.9         3.1          1.5         0.1 setosa 
## # … with 140 more rows, and 5 more variables: .fittedPC1 <dbl>,
## #   .fittedPC2 <dbl>, .fittedPC3 <dbl>, .fittedPC4 <dbl>,
## #   cluster_org <fct>

Cluster

my_k_pca <- my_pca_aug_k_org %>%
  select(.fittedPC1, .fittedPC2) %>%
  kmeans(centers = 3)
my_k_pca
## K-means clustering with 3 clusters of sizes 50, 53, 47
## 
## Cluster means:
##   .fittedPC1 .fittedPC2
## 1 -2.2173249 -0.2879627
## 2  0.5707095  0.8045137
## 3  1.7152903 -0.6008742
## 
## Clustering vector:
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 2 2 2 3 2 2 2 2 2 2 2 2 3 2 2 2 2
##  [71] 3 2 2 2 2 3 3 3 2 2 2 2 2 2 2 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3
## [106] 3 2 3 3 3 3 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 3 3 3 3 3 3 2 2 3 3 3 2 3
## [141] 3 3 2 3 3 3 2 3 3 2
## 
## Within cluster sum of squares by cluster:
## [1] 44.58037 34.85204 34.82154
##  (between_SS / total_SS =  80.0 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

Cluster

my_pca_aug_k_org_pca <- my_k_pca %>%
  augment(my_pca_aug_k_org) %>% 
  rename(cluster_pca = .cluster)
my_pca_aug_k_org_pca
## # A tibble: 150 x 12
##    .rownames Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##    <fct>            <dbl>       <dbl>        <dbl>       <dbl> <fct>  
##  1 1                  5.1         3.5          1.4         0.2 setosa 
##  2 2                  4.9         3            1.4         0.2 setosa 
##  3 3                  4.7         3.2          1.3         0.2 setosa 
##  4 4                  4.6         3.1          1.5         0.2 setosa 
##  5 5                  5           3.6          1.4         0.2 setosa 
##  6 6                  5.4         3.9          1.7         0.4 setosa 
##  7 7                  4.6         3.4          1.4         0.3 setosa 
##  8 8                  5           3.4          1.5         0.2 setosa 
##  9 9                  4.4         2.9          1.4         0.2 setosa 
## 10 10                 4.9         3.1          1.5         0.1 setosa 
## # … with 140 more rows, and 6 more variables: .fittedPC1 <dbl>,
## #   .fittedPC2 <dbl>, .fittedPC3 <dbl>, .fittedPC4 <dbl>,
## #   cluster_org <fct>, cluster_pca <fct>

Cluster

pl1 <- my_pca_aug_k_org_pca %>%
  ggplot(aes(x = .fittedPC1, y = .fittedPC2, colour = Species)) +
  geom_point() +
  theme(legend.position = "bottom")

pl2 <- my_pca_aug_k_org_pca %>%
  ggplot(aes(x = .fittedPC1, y = .fittedPC2, colour = cluster_org)) +
  geom_point() +
  theme(legend.position = "bottom")

pl3 <- my_pca_aug_k_org_pca %>%
  ggplot(aes(x = .fittedPC1, y = .fittedPC2, colour = cluster_pca)) +
  geom_point() +
  theme(legend.position = "bottom")

Cluster

(pl1 + pl2 + pl3) # "patchwork" again here

Who won?

my_pca_aug_k_org_pca %>%
  select(Species, cluster_org, cluster_pca) %>%
  mutate(cluster_org = case_when(cluster_org == 1 ~ "virginica",
                                 cluster_org == 2 ~ "versicolor",
                                 cluster_org == 3 ~ "setosa"),
         cluster_pca = case_when(cluster_pca == 1 ~ "setosa",
                                 cluster_pca == 2 ~ "versicolor",
                                 cluster_pca == 3 ~ "virginica"),
         cluster_org_correct = case_when(Species == cluster_org ~ 1,
                                         Species != cluster_org ~ 0),
         cluster_pca_correct = case_when(Species == cluster_pca ~ 1,
                                         Species != cluster_pca ~ 0)) %>% 
  summarise(score_org = mean(cluster_org_correct),
            score_pca = mean(cluster_pca_correct))
## # A tibble: 1 x 2
##   score_org score_pca
##       <dbl>     <dbl>
## 1     0.893     0.833