- 08.00 - 08.30 Recap of exercises from last class
- 08.30 - 09.00 Introduction to modelling, dimension reduction and clustering
- 09.00 - 12.00 Exercises in Modelling, dimension reduction and clustering
March 2nd 2020
iris %>% as_tibble
## # A tibble: 150 x 5 ## Sepal.Length Sepal.Width Petal.Length Petal.Width Species ## <dbl> <dbl> <dbl> <dbl> <fct> ## 1 5.1 3.5 1.4 0.2 setosa ## 2 4.9 3 1.4 0.2 setosa ## 3 4.7 3.2 1.3 0.2 setosa ## 4 4.6 3.1 1.5 0.2 setosa ## 5 5 3.6 1.4 0.2 setosa ## 6 5.4 3.9 1.7 0.4 setosa ## 7 4.6 3.4 1.4 0.3 setosa ## 8 5 3.4 1.5 0.2 setosa ## 9 4.4 2.9 1.4 0.2 setosa ## 10 4.9 3.1 1.5 0.1 setosa ## # … with 140 more rows
pl1 <- iris %>% as_tibble %>% ggplot(aes(x = Sepal.Length, y = Sepal.Width, colour = Species)) + geom_point() + theme_bw() + theme(legend.position = "none") pl2 <- iris %>% as_tibble %>% ggplot(aes(x = Petal.Length, y = Petal.Width, colour = Species)) + geom_point() + theme_bw() + theme(legend.position = "none") pl3 <- iris %>% as_tibble %>% ggplot(aes(x = Sepal.Length, y = Petal.Width, colour = Species)) + geom_point() + theme_bw() + theme(legend.position = "none") pl4 <- iris %>% as_tibble %>% ggplot(aes(x = Petal.Length, y = Sepal.Width, colour = Species)) + geom_point() + theme_bw() + theme(legend.position = "none")
( pl1 + pl2 ) / ( pl3 + pl4 ) # "patchwork" package trick
my_iris <- iris %>% as_tibble %>% select(-Species) my_iris
## # A tibble: 150 x 4 ## Sepal.Length Sepal.Width Petal.Length Petal.Width ## <dbl> <dbl> <dbl> <dbl> ## 1 5.1 3.5 1.4 0.2 ## 2 4.9 3 1.4 0.2 ## 3 4.7 3.2 1.3 0.2 ## 4 4.6 3.1 1.5 0.2 ## 5 5 3.6 1.4 0.2 ## 6 5.4 3.9 1.7 0.4 ## 7 4.6 3.4 1.4 0.3 ## 8 5 3.4 1.5 0.2 ## 9 4.4 2.9 1.4 0.2 ## 10 4.9 3.1 1.5 0.1 ## # … with 140 more rows
my_pca <- my_iris %>% prcomp(center = TRUE, scale. = TRUE) my_pca
## Standard deviations (1, .., p=4): ## [1] 1.7083611 0.9560494 0.3830886 0.1439265 ## ## Rotation (n x k) = (4 x 4): ## PC1 PC2 PC3 PC4 ## Sepal.Length 0.5210659 -0.37741762 0.7195664 0.2612863 ## Sepal.Width -0.2693474 -0.92329566 -0.2443818 -0.1235096 ## Petal.Length 0.5804131 -0.02449161 -0.1421264 -0.8014492 ## Petal.Width 0.5648565 -0.06694199 -0.6342727 0.5235971
my_pca %>% tidy("pcs")
## # A tibble: 4 x 4 ## PC std.dev percent cumulative ## <dbl> <dbl> <dbl> <dbl> ## 1 1 1.71 0.730 0.730 ## 2 2 0.956 0.229 0.958 ## 3 3 0.383 0.0367 0.995 ## 4 4 0.144 0.00518 1
my_pca %>% tidy("pcs") %>% ggplot(aes(x = PC, y = percent)) + geom_col() + theme_bw()
my_pca %>% tidy("samples")
## # A tibble: 600 x 3 ## row PC value ## <int> <dbl> <dbl> ## 1 1 1 -2.26 ## 2 2 1 -2.07 ## 3 3 1 -2.36 ## 4 4 1 -2.29 ## 5 5 1 -2.38 ## 6 6 1 -2.07 ## 7 7 1 -2.44 ## 8 8 1 -2.23 ## 9 9 1 -2.33 ## 10 10 1 -2.18 ## # … with 590 more rows
my_pca_aug <- my_pca %>% augment(iris) my_pca_aug
## # A tibble: 150 x 10 ## .rownames Sepal.Length Sepal.Width Petal.Length Petal.Width Species ## <fct> <dbl> <dbl> <dbl> <dbl> <fct> ## 1 1 5.1 3.5 1.4 0.2 setosa ## 2 2 4.9 3 1.4 0.2 setosa ## 3 3 4.7 3.2 1.3 0.2 setosa ## 4 4 4.6 3.1 1.5 0.2 setosa ## 5 5 5 3.6 1.4 0.2 setosa ## 6 6 5.4 3.9 1.7 0.4 setosa ## 7 7 4.6 3.4 1.4 0.3 setosa ## 8 8 5 3.4 1.5 0.2 setosa ## 9 9 4.4 2.9 1.4 0.2 setosa ## 10 10 4.9 3.1 1.5 0.1 setosa ## # … with 140 more rows, and 4 more variables: .fittedPC1 <dbl>, ## # .fittedPC2 <dbl>, .fittedPC3 <dbl>, .fittedPC4 <dbl>
my_pca_aug %>% ggplot(aes(x = .fittedPC1, y = .fittedPC2, colour = Species)) + geom_point()
my_k_org <- my_pca_aug %>% select(contains("Sepal"), contains("Petal")) %>% kmeans(centers = 3) my_k_org
## K-means clustering with 3 clusters of sizes 38, 62, 50 ## ## Cluster means: ## Sepal.Length Sepal.Width Petal.Length Petal.Width ## 1 6.850000 3.073684 5.742105 2.071053 ## 2 5.901613 2.748387 4.393548 1.433871 ## 3 5.006000 3.428000 1.462000 0.246000 ## ## Clustering vector: ## [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 ## [36] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ## [71] 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 1 1 1 ## [106] 1 2 1 1 1 1 1 1 2 2 1 1 1 1 2 1 2 1 2 1 1 2 2 1 1 1 1 1 2 1 1 1 1 2 1 ## [141] 1 1 2 1 1 1 2 1 1 2 ## ## Within cluster sum of squares by cluster: ## [1] 23.87947 39.82097 15.15100 ## (between_SS / total_SS = 88.4 %) ## ## Available components: ## ## [1] "cluster" "centers" "totss" "withinss" ## [5] "tot.withinss" "betweenss" "size" "iter" ## [9] "ifault"
my_pca_aug_k_org <- my_k_org %>% augment(my_pca_aug) %>% rename(cluster_org = .cluster) my_pca_aug_k_org
## # A tibble: 150 x 11 ## .rownames Sepal.Length Sepal.Width Petal.Length Petal.Width Species ## <fct> <dbl> <dbl> <dbl> <dbl> <fct> ## 1 1 5.1 3.5 1.4 0.2 setosa ## 2 2 4.9 3 1.4 0.2 setosa ## 3 3 4.7 3.2 1.3 0.2 setosa ## 4 4 4.6 3.1 1.5 0.2 setosa ## 5 5 5 3.6 1.4 0.2 setosa ## 6 6 5.4 3.9 1.7 0.4 setosa ## 7 7 4.6 3.4 1.4 0.3 setosa ## 8 8 5 3.4 1.5 0.2 setosa ## 9 9 4.4 2.9 1.4 0.2 setosa ## 10 10 4.9 3.1 1.5 0.1 setosa ## # … with 140 more rows, and 5 more variables: .fittedPC1 <dbl>, ## # .fittedPC2 <dbl>, .fittedPC3 <dbl>, .fittedPC4 <dbl>, ## # cluster_org <fct>
my_k_pca <- my_pca_aug_k_org %>% select(.fittedPC1, .fittedPC2) %>% kmeans(centers = 3) my_k_pca
## K-means clustering with 3 clusters of sizes 50, 53, 47 ## ## Cluster means: ## .fittedPC1 .fittedPC2 ## 1 -2.2173249 -0.2879627 ## 2 0.5707095 0.8045137 ## 3 1.7152903 -0.6008742 ## ## Clustering vector: ## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 2 2 2 3 2 2 2 2 2 2 2 2 3 2 2 2 2 ## [71] 3 2 2 2 2 3 3 3 2 2 2 2 2 2 2 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3 ## [106] 3 2 3 3 3 3 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 3 3 3 3 3 3 2 2 3 3 3 2 3 ## [141] 3 3 2 3 3 3 2 3 3 2 ## ## Within cluster sum of squares by cluster: ## [1] 44.58037 34.85204 34.82154 ## (between_SS / total_SS = 80.0 %) ## ## Available components: ## ## [1] "cluster" "centers" "totss" "withinss" ## [5] "tot.withinss" "betweenss" "size" "iter" ## [9] "ifault"
my_pca_aug_k_org_pca <- my_k_pca %>% augment(my_pca_aug_k_org) %>% rename(cluster_pca = .cluster) my_pca_aug_k_org_pca
## # A tibble: 150 x 12 ## .rownames Sepal.Length Sepal.Width Petal.Length Petal.Width Species ## <fct> <dbl> <dbl> <dbl> <dbl> <fct> ## 1 1 5.1 3.5 1.4 0.2 setosa ## 2 2 4.9 3 1.4 0.2 setosa ## 3 3 4.7 3.2 1.3 0.2 setosa ## 4 4 4.6 3.1 1.5 0.2 setosa ## 5 5 5 3.6 1.4 0.2 setosa ## 6 6 5.4 3.9 1.7 0.4 setosa ## 7 7 4.6 3.4 1.4 0.3 setosa ## 8 8 5 3.4 1.5 0.2 setosa ## 9 9 4.4 2.9 1.4 0.2 setosa ## 10 10 4.9 3.1 1.5 0.1 setosa ## # … with 140 more rows, and 6 more variables: .fittedPC1 <dbl>, ## # .fittedPC2 <dbl>, .fittedPC3 <dbl>, .fittedPC4 <dbl>, ## # cluster_org <fct>, cluster_pca <fct>
pl1 <- my_pca_aug_k_org_pca %>% ggplot(aes(x = .fittedPC1, y = .fittedPC2, colour = Species)) + geom_point() + theme(legend.position = "bottom") pl2 <- my_pca_aug_k_org_pca %>% ggplot(aes(x = .fittedPC1, y = .fittedPC2, colour = cluster_org)) + geom_point() + theme(legend.position = "bottom") pl3 <- my_pca_aug_k_org_pca %>% ggplot(aes(x = .fittedPC1, y = .fittedPC2, colour = cluster_pca)) + geom_point() + theme(legend.position = "bottom")
(pl1 + pl2 + pl3) # "patchwork" again here
my_pca_aug_k_org_pca %>% select(Species, cluster_org, cluster_pca) %>% mutate(cluster_org = case_when(cluster_org == 1 ~ "virginica", cluster_org == 2 ~ "versicolor", cluster_org == 3 ~ "setosa"), cluster_pca = case_when(cluster_pca == 1 ~ "setosa", cluster_pca == 2 ~ "versicolor", cluster_pca == 3 ~ "virginica"), cluster_org_correct = case_when(Species == cluster_org ~ 1, Species != cluster_org ~ 0), cluster_pca_correct = case_when(Species == cluster_pca ~ 1, Species != cluster_pca ~ 0)) %>% summarise(score_org = mean(cluster_org_correct), score_pca = mean(cluster_pca_correct))
## # A tibble: 1 x 2 ## score_org score_pca ## <dbl> <dbl> ## 1 0.893 0.833