Load Libraries

library("tidyverse")
library("ggseqlogo")

Load and Clean Data

Set Analysis Parameters

target_peptide <- "GILGFVFTL"
iedb_file <- "data/tcell_receptor_table_export_1578653238.csv"
single_cell_file <- "data/vdj_v1_hs_aggregated_donor1_binarized_matrix.csv"
size <- 1000
sc_target_allele <- "A0201"

IEDB Data

iedb_data <- iedb_file %>%
  read_csv %>%
  select(peptide = Description, CDR3 = `Chain 2 CDR3 Curated`) %>%
  drop_na %>%
  filter(peptide == target_peptide) %>% 
  filter(str_detect(CDR3, "[^ARNDCQEGHILKMFPSTWYV]", negate = TRUE)) %>% 
  filter(str_detect(peptide, "[^ARNDCQEGHILKMFPSTWYV]", negate = TRUE)) %>% 
  mutate(CDR3 = case_when(str_detect(CDR3, "^C.+[FW]$") ~ str_sub(CDR3, 2, -2),
                          str_detect(CDR3, "^C") ~ str_sub(CDR3, 2),
                          str_detect(CDR3, "C.+[FW]G.{1}G") ~ str_match(CDR3, "C(.+)[FW]G.{1}G")[,2],
                          TRUE ~ CDR3)) %>% 
  distinct
iedb_data
## # A tibble: 4,221 x 2
##    peptide   CDR3        
##    <chr>     <chr>       
##  1 GILGFVFTL ASSSRSSYEQY 
##  2 GILGFVFTL ASSIRSSYEQY 
##  3 GILGFVFTL ASSLIYPGELF 
##  4 GILGFVFTL ASSIGVYGYT  
##  5 GILGFVFTL ASSIRAAETQY 
##  6 GILGFVFTL ASSTFGANVLT 
##  7 GILGFVFTL ASSLLGGWSEAF
##  8 GILGFVFTL ASSIRSAYEQY 
##  9 GILGFVFTL ASSIRSSTEAF 
## 10 GILGFVFTL ASSRRSSDEQY 
## # … with 4,211 more rows

10x Data

single_cell_data <- single_cell_file %>% 
  read_csv %>% 
  mutate(CDR3 = cell_clono_cdr3_aa %>%
           str_match("TRB:([ARNDCQEGHILKMFPSTWYV]+)") %>%
           .[,2] %>% 
           str_sub(2, -2)) %>%
  select(CDR3, contains(sc_target_allele)) %>%
  select(CDR3, contains(target_peptide)) %>%
  pivot_longer(cols = -c("CDR3"),
               names_to = "peptide",
               values_to = "binder") %>%
  mutate(peptide = peptide %>% str_split("_") %>% map(2) %>% unlist,
         binder = case_when(binder > 0 ~ 1,
                            TRUE ~ 0)) %>% 
  distinct %>%
  filter(str_detect(CDR3, "[^ARNDCQEGHILKMFPSTWYV]", negate = TRUE)) %>% 
  filter(str_detect(peptide, "[^ARNDCQEGHILKMFPSTWYV]", negate = TRUE)) %>% 
  group_by(CDR3) %>% 
  filter(n() == 1) %>% 
  ungroup
single_cell_data
## # A tibble: 23,380 x 3
##    CDR3           peptide   binder
##    <chr>          <chr>      <dbl>
##  1 ASDTPVGQF      GILGFVFTL      0
##  2 ASSGGSISTDTQY  GILGFVFTL      0
##  3 ASSQDPAGGYNEQF GILGFVFTL      0
##  4 ASAVGLTYNEQF   GILGFVFTL      0
##  5 ASSRPSTGENYGYT GILGFVFTL      0
##  6 ASSPEAYEQY     GILGFVFTL      0
##  7 ASINRGRSSYNEQF GILGFVFTL      0
##  8 ASSLEPGLPAGELF GILGFVFTL      0
##  9 ASSETSRNTEAF   GILGFVFTL      0
## 10 ASSLGTVGYGYT   GILGFVFTL      0
## # … with 23,370 more rows
single_cell_data %>% 
  count(binder)
## # A tibble: 2 x 2
##   binder     n
##    <dbl> <int>
## 1      0 23118
## 2      1   262

Write data to file

iedb_data %>%
  select(CDR3) %>%
  sample_n(size, replace = TRUE) %>%
  write_tsv(path = str_c("data/database_iedb_TCR_n_",
                         size,
                         "_",
                         target_peptide,
                         "_",
                         sc_target_allele,
                         "_binder_1.tsv"),
            col_names = FALSE)
single_cell_data %>%
  filter(binder == 1) %>% 
  sample_n(size, replace = TRUE) %>%
  select(CDR3) %>%
  write_tsv(path = str_c("data/query_10x_TCR_n_",
                         size,
                         "_",
                         target_peptide,
                         "_",
                         sc_target_allele,
                         "_binder_1.tsv"),
            col_names = FALSE)
single_cell_data %>%
  filter(binder == 0) %>% 
  sample_n(size, replace = TRUE) %>%
  select(CDR3) %>%
  write_tsv(path = str_c("data/query_10x_TCR_n_",
                         size,
                         "_",
                         target_peptide,
                         "_",
                         sc_target_allele,
                         "_binder_0.tsv"),
            col_names = FALSE)

Read in the MAIT Natch Similarity Scores

MAIT_file_0 <- "data/MAIT_match_0.txt"
MAIT_file_1 <- "data/MAIT_match_1.txt"
#MAIT_file_0 <- "data/MAIT_match_0_no_flanks.txt"
#MAIT_file_1 <- "data/MAIT_match_1_no_flanks.txt"
mait_data <- bind_rows(
  read_delim(file = MAIT_file_0, delim = " ", trim_ws = TRUE) %>% 
    mutate(binder = 0),
    read_delim(file = MAIT_file_1, delim = " ", trim_ws = TRUE) %>% 
    mutate(binder = 1)) %>% 
  mutate(binder = factor(binder))
## Parsed with column specification:
## cols(
##   Res = col_character(),
##   Sequence = col_character(),
##   MAIT_hit = col_character(),
##   Score = col_double()
## )
## Parsed with column specification:
## cols(
##   Res = col_character(),
##   Sequence = col_character(),
##   MAIT_hit = col_character(),
##   Score = col_double()
## )
mait_data
## # A tibble: 2,000 x 5
##    Res   Sequence          MAIT_hit       Score binder
##    <chr> <chr>             <chr>          <dbl> <fct> 
##  1 Best  ASSPNPGTDYGYT     ASNAGSTNYGYT   0.826 0     
##  2 Best  ASSLPGVNLNTEAF    ASSLLPGRNTEAF  0.850 0     
##  3 Best  ASSFLSGGLYNEQF    ASSFLGGQYNEQF  0.879 0     
##  4 Best  ASSVDRNTEAF       ASSTRSNTEAF    0.916 0     
##  5 Best  ASNGKTNRRPRGTGELF ASSSNRNTGELF   0.776 0     
##  6 Best  ASSHGLAGGPIYYNEQF ASSVYGGVSYNEQF 0.773 0     
##  7 Best  ASSQEPGQGYEQY     ASSQDKGRAYEQY  0.868 0     
##  8 Best  ASSTPWGGGDEQY     ASTPAGGRDEQY   0.85  0     
##  9 Best  ASSRGTGSPGELF     ASSRTGSGELF    0.878 0     
## 10 Best  ASSVISGGEQY       ASSIVSGDEQF    0.94  0     
## # … with 1,990 more rows

Calculate Summary Statistics

mait_data %>%
  group_by(binder) %>%
  summarise(mu_Score = mean(Score),
            sigma_Score = sd(Score),
            median_Score = median(Score))
## # A tibble: 2 x 4
##   binder mu_Score sigma_Score median_Score
##   <fct>     <dbl>       <dbl>        <dbl>
## 1 0         0.876      0.0404        0.880
## 2 1         0.921      0.0506        0.924

Test and Visualise MAIT Score Difference

p <- mait_data %>% wilcox.test(Score ~ binder, data = .) %>% pluck("p.value")
mait_data %>% 
  ggplot(aes(x = binder, y = Score, fill = binder)) +
  geom_violin(alpha = 0.5, scale = "width") +
  geom_boxplot(width = 0.2) +
  theme_bw() +
  theme(legend.position = "none") +
  labs(x = str_c("10x: TCR-CDR3b::",
                 target_peptide,
                 "::",
                 sc_target_allele,
                 "binder/non-binder"),
       y = "Similarity Score distributions to IEDB binder DB",
       title = "MAIT-score Distributions",
       subtitle = str_c("p_diff = ", signif(p,3), " (Wilcoxon Rank Test)"))

single_cell_data %>%
  mutate(k = str_length(CDR3)) %>%
  filter(binder == 0, k == 13) %>%
  pull(CDR3) %>%
  ggseqlogo +
  labs(title = "10x non-binders")

single_cell_data %>%
  mutate(k = str_length(CDR3)) %>%
  filter(binder == 1, k == 13) %>%
  pull(CDR3) %>%
  ggseqlogo +
  labs(title = "10x binders")