Load Libraries

library("tidyverse")
library("ggseqlogo")

Load and Clean Data

Set Analysis Parameters

target_peptide <- "GILGFVFTL"
iedb_file <- "data/tcell_receptor_table_export_1578653238.csv"
single_cell_file <- "data/vdj_v1_hs_aggregated_donor1_binarized_matrix.csv"
size <- 1000
sc_target_allele <- "A0201"

IEDB Data

iedb_data <- iedb_file %>%
  read_csv %>%
  select(peptide = Description, CDR3 = `Chain 2 CDR3 Curated`) %>%
  drop_na %>%
  filter(peptide == target_peptide) %>% 
  filter(str_detect(CDR3, "[^ARNDCQEGHILKMFPSTWYV]", negate = TRUE)) %>% 
  filter(str_detect(peptide, "[^ARNDCQEGHILKMFPSTWYV]", negate = TRUE)) %>% 
  mutate(CDR3 = case_when(str_detect(CDR3, "^C.+[FW]$") ~ str_sub(CDR3, 2, -2),
                          str_detect(CDR3, "^C") ~ str_sub(CDR3, 2),
                          str_detect(CDR3, "C.+[FW]G.{1}G") ~ str_match(CDR3, "C(.+)[FW]G.{1}G")[,2],
                          TRUE ~ CDR3)) %>% 
  distinct
iedb_data

## # A tibble: 4,221 x 2
##    peptide   CDR3        
##    <chr>     <chr>       
##  1 GILGFVFTL ASSSRSSYEQY 
##  2 GILGFVFTL ASSIRSSYEQY 
##  3 GILGFVFTL ASSLIYPGELF 
##  4 GILGFVFTL ASSIGVYGYT  
##  5 GILGFVFTL ASSIRAAETQY 
##  6 GILGFVFTL ASSTFGANVLT 
##  7 GILGFVFTL ASSLLGGWSEAF
##  8 GILGFVFTL ASSIRSAYEQY 
##  9 GILGFVFTL ASSIRSSTEAF 
## 10 GILGFVFTL ASSRRSSDEQY 
## # … with 4,211 more rows

10x Data

single_cell_data <- single_cell_file %>% 
  read_csv %>% 
  mutate(CDR3 = cell_clono_cdr3_aa %>%
           str_match("TRB:([ARNDCQEGHILKMFPSTWYV]+)") %>%
           .[,2] %>% 
           str_sub(2, -2)) %>%
  select(CDR3, contains(sc_target_allele)) %>%
  select(CDR3, contains(target_peptide)) %>%
  pivot_longer(cols = -c("CDR3"),
               names_to = "peptide",
               values_to = "binder") %>%
  mutate(peptide = peptide %>% str_split("_") %>% map(2) %>% unlist,
         binder = case_when(binder > 0 ~ 1,
                            TRUE ~ 0)) %>% 
  distinct %>%
  filter(str_detect(CDR3, "[^ARNDCQEGHILKMFPSTWYV]", negate = TRUE)) %>% 
  filter(str_detect(peptide, "[^ARNDCQEGHILKMFPSTWYV]", negate = TRUE)) %>% 
  group_by(CDR3) %>% 
  filter(n() == 1) %>% 
  ungroup
single_cell_data

## # A tibble: 23,380 x 3
##    CDR3           peptide   binder
##    <chr>          <chr>      <dbl>
##  1 ASDTPVGQF      GILGFVFTL      0
##  2 ASSGGSISTDTQY  GILGFVFTL      0
##  3 ASSQDPAGGYNEQF GILGFVFTL      0
##  4 ASAVGLTYNEQF   GILGFVFTL      0
##  5 ASSRPSTGENYGYT GILGFVFTL      0
##  6 ASSPEAYEQY     GILGFVFTL      0
##  7 ASINRGRSSYNEQF GILGFVFTL      0
##  8 ASSLEPGLPAGELF GILGFVFTL      0
##  9 ASSETSRNTEAF   GILGFVFTL      0
## 10 ASSLGTVGYGYT   GILGFVFTL      0
## # … with 23,370 more rows

single_cell_data %>% 
  count(binder)

## # A tibble: 2 x 2
##   binder     n
##    <dbl> <int>
## 1      0 23118
## 2      1   262

Write data to file

iedb_data %>%
  select(CDR3) %>%
  sample_n(size, replace = TRUE) %>%
  write_tsv(path = str_c("data/database_iedb_TCR_n_",
                         size,
                         "_",
                         target_peptide,
                         "_",
                         sc_target_allele,
                         "_binder_1.tsv"),
            col_names = FALSE)
single_cell_data %>%
  filter(binder == 1) %>% 
  sample_n(size, replace = TRUE) %>%
  select(CDR3) %>%
  write_tsv(path = str_c("data/query_10x_TCR_n_",
                         size,
                         "_",
                         target_peptide,
                         "_",
                         sc_target_allele,
                         "_binder_1.tsv"),
            col_names = FALSE)
single_cell_data %>%
  filter(binder == 0) %>% 
  sample_n(size, replace = TRUE) %>%
  select(CDR3) %>%
  write_tsv(path = str_c("data/query_10x_TCR_n_",
                         size,
                         "_",
                         target_peptide,
                         "_",
                         sc_target_allele,
                         "_binder_0.tsv"),
            col_names = FALSE)

Read in the MAIT Natch Similarity Scores

MAIT_file_0 <- "data/MAIT_match_0.txt"
MAIT_file_1 <- "data/MAIT_match_1.txt"
#MAIT_file_0 <- "data/MAIT_match_0_no_flanks.txt"
#MAIT_file_1 <- "data/MAIT_match_1_no_flanks.txt"
mait_data <- bind_rows(
  read_delim(file = MAIT_file_0, delim = " ", trim_ws = TRUE) %>% 
    mutate(binder = 0),
    read_delim(file = MAIT_file_1, delim = " ", trim_ws = TRUE) %>% 
    mutate(binder = 1)) %>% 
  mutate(binder = factor(binder))

## Parsed with column specification:
## cols(
##   Res = col_character(),
##   Sequence = col_character(),
##   MAIT_hit = col_character(),
##   Score = col_double()
## )
## Parsed with column specification:
## cols(
##   Res = col_character(),
##   Sequence = col_character(),
##   MAIT_hit = col_character(),
##   Score = col_double()
## )

mait_data

## # A tibble: 2,000 x 5
##    Res   Sequence          MAIT_hit       Score binder
##    <chr> <chr>             <chr>          <dbl> <fct> 
##  1 Best  ASSPNPGTDYGYT     ASNAGSTNYGYT   0.826 0     
##  2 Best  ASSLPGVNLNTEAF    ASSLLPGRNTEAF  0.850 0     
##  3 Best  ASSFLSGGLYNEQF    ASSFLGGQYNEQF  0.879 0     
##  4 Best  ASSVDRNTEAF       ASSTRSNTEAF    0.916 0     
##  5 Best  ASNGKTNRRPRGTGELF ASSSNRNTGELF   0.776 0     
##  6 Best  ASSHGLAGGPIYYNEQF ASSVYGGVSYNEQF 0.773 0     
##  7 Best  ASSQEPGQGYEQY     ASSQDKGRAYEQY  0.868 0     
##  8 Best  ASSTPWGGGDEQY     ASTPAGGRDEQY   0.85  0     
##  9 Best  ASSRGTGSPGELF     ASSRTGSGELF    0.878 0     
## 10 Best  ASSVISGGEQY       ASSIVSGDEQF    0.94  0     
## # … with 1,990 more rows

Calculate Summary Statistics

mait_data %>%
  group_by(binder) %>%
  summarise(mu_Score = mean(Score),
            sigma_Score = sd(Score),
            median_Score = median(Score))

## # A tibble: 2 x 4
##   binder mu_Score sigma_Score median_Score
##   <fct>     <dbl>       <dbl>        <dbl>
## 1 0         0.876      0.0404        0.880
## 2 1         0.921      0.0506        0.924

Test and Visualise MAIT Score Difference

p <- mait_data %>% wilcox.test(Score ~ binder, data = .) %>% pluck("p.value")
mait_data %>% 
  ggplot(aes(x = binder, y = Score, fill = binder)) +
  geom_violin(alpha = 0.5, scale = "width") +
  geom_boxplot(width = 0.2) +
  theme_bw() +
  theme(legend.position = "none") +
  labs(x = str_c("10x: TCR-CDR3b::",
                 target_peptide,
                 "::",
                 sc_target_allele,
                 "binder/non-binder"),
       y = "Similarity Score distributions to IEDB binder DB",
       title = "MAIT-score Distributions",
       subtitle = str_c("p_diff = ", signif(p,3), " (Wilcoxon Rank Test)"))

single_cell_data %>%
  mutate(k = str_length(CDR3)) %>%
  filter(binder == 0, k == 13) %>%
  pull(CDR3) %>%
  ggseqlogo +
  labs(title = "10x non-binders")

single_cell_data %>%
  mutate(k = str_length(CDR3)) %>%
  filter(binder == 1, k == 13) %>%
  pull(CDR3) %>%
  ggseqlogo +
  labs(title = "10x binders")

Single Cell Technologies for Identifying T-Cell Epitopes

Leon Eyrich Jessen

Monday January 13th 2020

Load Libraries

Load and Clean Data

Set Analysis Parameters

IEDB Data

10x Data

Write data to file

Read in the MAIT Natch Similarity Scores

Calculate Summary Statistics

Test and Visualise MAIT Score Difference