Load Libraries
library("tidyverse")
library("ggseqlogo")
Load and Clean Data
Set Analysis Parameters
target_peptide <- "GILGFVFTL"
iedb_file <- "data/tcell_receptor_table_export_1578653238.csv"
single_cell_file <- "data/vdj_v1_hs_aggregated_donor1_binarized_matrix.csv"
size <- 1000
sc_target_allele <- "A0201"
IEDB Data
iedb_data <- iedb_file %>%
read_csv %>%
select(peptide = Description, CDR3 = `Chain 2 CDR3 Curated`) %>%
drop_na %>%
filter(peptide == target_peptide) %>%
filter(str_detect(CDR3, "[^ARNDCQEGHILKMFPSTWYV]", negate = TRUE)) %>%
filter(str_detect(peptide, "[^ARNDCQEGHILKMFPSTWYV]", negate = TRUE)) %>%
mutate(CDR3 = case_when(str_detect(CDR3, "^C.+[FW]$") ~ str_sub(CDR3, 2, -2),
str_detect(CDR3, "^C") ~ str_sub(CDR3, 2),
str_detect(CDR3, "C.+[FW]G.{1}G") ~ str_match(CDR3, "C(.+)[FW]G.{1}G")[,2],
TRUE ~ CDR3)) %>%
distinct
iedb_data
## # A tibble: 4,221 x 2
## peptide CDR3
## <chr> <chr>
## 1 GILGFVFTL ASSSRSSYEQY
## 2 GILGFVFTL ASSIRSSYEQY
## 3 GILGFVFTL ASSLIYPGELF
## 4 GILGFVFTL ASSIGVYGYT
## 5 GILGFVFTL ASSIRAAETQY
## 6 GILGFVFTL ASSTFGANVLT
## 7 GILGFVFTL ASSLLGGWSEAF
## 8 GILGFVFTL ASSIRSAYEQY
## 9 GILGFVFTL ASSIRSSTEAF
## 10 GILGFVFTL ASSRRSSDEQY
## # … with 4,211 more rows
10x Data
single_cell_data <- single_cell_file %>%
read_csv %>%
mutate(CDR3 = cell_clono_cdr3_aa %>%
str_match("TRB:([ARNDCQEGHILKMFPSTWYV]+)") %>%
.[,2] %>%
str_sub(2, -2)) %>%
select(CDR3, contains(sc_target_allele)) %>%
select(CDR3, contains(target_peptide)) %>%
pivot_longer(cols = -c("CDR3"),
names_to = "peptide",
values_to = "binder") %>%
mutate(peptide = peptide %>% str_split("_") %>% map(2) %>% unlist,
binder = case_when(binder > 0 ~ 1,
TRUE ~ 0)) %>%
distinct %>%
filter(str_detect(CDR3, "[^ARNDCQEGHILKMFPSTWYV]", negate = TRUE)) %>%
filter(str_detect(peptide, "[^ARNDCQEGHILKMFPSTWYV]", negate = TRUE)) %>%
group_by(CDR3) %>%
filter(n() == 1) %>%
ungroup
single_cell_data
## # A tibble: 23,380 x 3
## CDR3 peptide binder
## <chr> <chr> <dbl>
## 1 ASDTPVGQF GILGFVFTL 0
## 2 ASSGGSISTDTQY GILGFVFTL 0
## 3 ASSQDPAGGYNEQF GILGFVFTL 0
## 4 ASAVGLTYNEQF GILGFVFTL 0
## 5 ASSRPSTGENYGYT GILGFVFTL 0
## 6 ASSPEAYEQY GILGFVFTL 0
## 7 ASINRGRSSYNEQF GILGFVFTL 0
## 8 ASSLEPGLPAGELF GILGFVFTL 0
## 9 ASSETSRNTEAF GILGFVFTL 0
## 10 ASSLGTVGYGYT GILGFVFTL 0
## # … with 23,370 more rows
single_cell_data %>%
count(binder)
## # A tibble: 2 x 2
## binder n
## <dbl> <int>
## 1 0 23118
## 2 1 262
Write data to file
iedb_data %>%
select(CDR3) %>%
sample_n(size, replace = TRUE) %>%
write_tsv(path = str_c("data/database_iedb_TCR_n_",
size,
"_",
target_peptide,
"_",
sc_target_allele,
"_binder_1.tsv"),
col_names = FALSE)
single_cell_data %>%
filter(binder == 1) %>%
sample_n(size, replace = TRUE) %>%
select(CDR3) %>%
write_tsv(path = str_c("data/query_10x_TCR_n_",
size,
"_",
target_peptide,
"_",
sc_target_allele,
"_binder_1.tsv"),
col_names = FALSE)
single_cell_data %>%
filter(binder == 0) %>%
sample_n(size, replace = TRUE) %>%
select(CDR3) %>%
write_tsv(path = str_c("data/query_10x_TCR_n_",
size,
"_",
target_peptide,
"_",
sc_target_allele,
"_binder_0.tsv"),
col_names = FALSE)
Read in the MAIT Natch Similarity Scores
MAIT_file_0 <- "data/MAIT_match_0.txt"
MAIT_file_1 <- "data/MAIT_match_1.txt"
#MAIT_file_0 <- "data/MAIT_match_0_no_flanks.txt"
#MAIT_file_1 <- "data/MAIT_match_1_no_flanks.txt"
mait_data <- bind_rows(
read_delim(file = MAIT_file_0, delim = " ", trim_ws = TRUE) %>%
mutate(binder = 0),
read_delim(file = MAIT_file_1, delim = " ", trim_ws = TRUE) %>%
mutate(binder = 1)) %>%
mutate(binder = factor(binder))
## Parsed with column specification:
## cols(
## Res = col_character(),
## Sequence = col_character(),
## MAIT_hit = col_character(),
## Score = col_double()
## )
## Parsed with column specification:
## cols(
## Res = col_character(),
## Sequence = col_character(),
## MAIT_hit = col_character(),
## Score = col_double()
## )
mait_data
## # A tibble: 2,000 x 5
## Res Sequence MAIT_hit Score binder
## <chr> <chr> <chr> <dbl> <fct>
## 1 Best ASSPNPGTDYGYT ASNAGSTNYGYT 0.826 0
## 2 Best ASSLPGVNLNTEAF ASSLLPGRNTEAF 0.850 0
## 3 Best ASSFLSGGLYNEQF ASSFLGGQYNEQF 0.879 0
## 4 Best ASSVDRNTEAF ASSTRSNTEAF 0.916 0
## 5 Best ASNGKTNRRPRGTGELF ASSSNRNTGELF 0.776 0
## 6 Best ASSHGLAGGPIYYNEQF ASSVYGGVSYNEQF 0.773 0
## 7 Best ASSQEPGQGYEQY ASSQDKGRAYEQY 0.868 0
## 8 Best ASSTPWGGGDEQY ASTPAGGRDEQY 0.85 0
## 9 Best ASSRGTGSPGELF ASSRTGSGELF 0.878 0
## 10 Best ASSVISGGEQY ASSIVSGDEQF 0.94 0
## # … with 1,990 more rows
Calculate Summary Statistics
mait_data %>%
group_by(binder) %>%
summarise(mu_Score = mean(Score),
sigma_Score = sd(Score),
median_Score = median(Score))
## # A tibble: 2 x 4
## binder mu_Score sigma_Score median_Score
## <fct> <dbl> <dbl> <dbl>
## 1 0 0.876 0.0404 0.880
## 2 1 0.921 0.0506 0.924
Test and Visualise MAIT Score Difference
p <- mait_data %>% wilcox.test(Score ~ binder, data = .) %>% pluck("p.value")
mait_data %>%
ggplot(aes(x = binder, y = Score, fill = binder)) +
geom_violin(alpha = 0.5, scale = "width") +
geom_boxplot(width = 0.2) +
theme_bw() +
theme(legend.position = "none") +
labs(x = str_c("10x: TCR-CDR3b::",
target_peptide,
"::",
sc_target_allele,
"binder/non-binder"),
y = "Similarity Score distributions to IEDB binder DB",
title = "MAIT-score Distributions",
subtitle = str_c("p_diff = ", signif(p,3), " (Wilcoxon Rank Test)"))
single_cell_data %>%
mutate(k = str_length(CDR3)) %>%
filter(binder == 0, k == 13) %>%
pull(CDR3) %>%
ggseqlogo +
labs(title = "10x non-binders")
single_cell_data %>%
mutate(k = str_length(CDR3)) %>%
filter(binder == 1, k == 13) %>%
pull(CDR3) %>%
ggseqlogo +
labs(title = "10x binders")