Skip to content

Commit

Permalink
Allow R2 adapter
Browse files Browse the repository at this point in the history
  • Loading branch information
jykr committed Mar 26, 2024
1 parent fb5aaf6 commit 94954c6
Show file tree
Hide file tree
Showing 7 changed files with 66 additions and 22 deletions.
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,11 +79,10 @@ bean-count-samples \
-r `# read edit/allele information from reporter` \
-t 12 `# number of threads` \
--name my_sorting_screen `# name of this sample run` \
--guide-start-seq ATGCTTGCC `# Change this according to your read design
```

You may need to adjust the command arguments according to your read structure.
<img src="imgs/read_struct.png" alt="Read structuren" width="600"/>
By default, `bean-count[-samples]` assume R1 and R2 are trimmed off of the adapter sequence. You may need to adjust the command arguments according to your read structure.
<img src="imgs/sequence_struct.png" alt="Read structuren" width="600"/>
See full detail in [Parameters](#parameters).

### Input file format
Expand Down Expand Up @@ -149,8 +148,9 @@ Output formats


Read structure
* `--guide-start-seq`: Guide starts after this sequence in R1 (default: '')
* `--guide-end-seq`: Guide ends after this sequence in R1 (default: '')
* `--guide-start-seq` (default: ''): Guide starts after this sequence in R1
* `--guide-end-seq` (default: ''): Guide ends after this sequence in R1
* `--barcode-start-seq` (default: ''): Barcode + reporter starts after this sequence in R2, denoted as the sense direction (the same sequence direction as R1).
* `--guide-start-seqs-file` (default: `None`): CSV file path with per-sample `guide_start_seq` to be used, if provided. Formatted as `sample_id, guide_start_seq`
* `--guide-end-seqs-file` (default: `None`): CSV file path with per-sample `guide_end_seq` to be used, if provided. Formatted as `sample_id,guide_end_seq`
* `-l`, `--reporter-length` (default: `32`): Length of the reporter sequence.
Expand Down
32 changes: 21 additions & 11 deletions bean/mapping/GuideEditCounter.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ def __init__(self, **kwargs):

self.guide_start_seq = kwargs["guide_start_seq"].upper()
self.guide_end_seq = kwargs["guide_end_seq"].upper()
self.barcode_start_seq = kwargs["barcode_start_seq"].upper()
if not self.guide_start_seq == "":
info(
f"{self.name}: Using guide_start_seq={self.guide_start_seq} for {self.output_dir}"
Expand Down Expand Up @@ -438,6 +439,7 @@ def _count_reporter_edits(
matched_guide_idx: int,
R1_seq: str,
R2_record: SeqIO.SeqRecord,
R2_start: int = 0,
single_base_qual_cutoff: str = 30,
guide_allele: Allele = None,
):
Expand All @@ -453,7 +455,7 @@ def _count_reporter_edits(
guide_allele: Allele from baseedit in gRNA spacer sequence when paired with guide allele.
"""
ref_reporter_seq = self.screen.guides.reporter.iloc[matched_guide_idx]
read_reporter_seq, read_reporter_qual = self.get_reporter_seq_qual(R2_record)
read_reporter_seq, read_reporter_qual = self.get_reporter_seq_qual(R2_record, R2_start)

guide_strand, offset = self._get_strand_offset_from_guide_index(
matched_guide_idx
Expand Down Expand Up @@ -514,10 +516,9 @@ def _get_guide_counts_bcmatch_semimatch(
R1_seq = str(r1.seq)
R2_seq = str(r2.seq)

bc_match, semimatch = self._match_read_to_sgRNA_bcmatch_semimatch(
bc_match, semimatch, R2_start = self._match_read_to_sgRNA_bcmatch_semimatch(
R1_seq, R2_seq
)

if len(bc_match) == 0:
if len(semimatch) == 0: # no guide match
if self.keep_intermediate:
Expand Down Expand Up @@ -555,10 +556,10 @@ def _get_guide_counts_bcmatch_semimatch(
# TODO: what if reporter seq doesn't match barcode & guide?
if self.count_guide_reporter_alleles:
self._count_reporter_edits(
matched_guide_idx, R1_seq, r2, guide_allele=guide_allele
matched_guide_idx, R1_seq, r2, R2_start = R2_start, guide_allele=guide_allele
)
else:
self._count_reporter_edits(matched_guide_idx, R1_seq, r2)
self._count_reporter_edits(matched_guide_idx, R1_seq, r2, R2_start = R2_start, )
tqdm_reads.postfix = f"n_read={self.bcmatch}"
tqdm_reads.update()

Expand Down Expand Up @@ -643,19 +644,28 @@ def get_reporter_seq(self, R1_seq, R2_seq):
R2_seq[self.guide_bc_len : (self.guide_bc_len + self.reporter_length)]
)

def get_reporter_seq_qual(self, R2_record: SeqIO.SeqRecord):
def get_reporter_seq_qual(self, R2_record: SeqIO.SeqRecord, R2_start = 0):
seq = R2_record[
self.guide_bc_len : (self.guide_bc_len + self.reporter_length)
(R2_start + self.guide_bc_len) : (R2_start + self.guide_bc_len + self.reporter_length)
].reverse_complement()
return (str(seq.seq), seq.letter_annotations["phred_quality"])

def get_barcode(self, R1_seq, R2_seq):
"""This can be edited by user based on the read construct."""
return revcomp(R2_seq[: self.guide_bc_len])
if self.barcode_start_seq != "":
barcode_start_idx = revcomp(R2_seq).replace(
self.base_edited_from, self.base_edited_to
).find(
self.barcode_start_seq.replace(self.base_edited_from, self.base_edited_to)
)
if barcode_start_idx == -1:
return -1, ""
else:
barcode_start_idx = 0
return barcode_start_idx, revcomp(R2_seq[barcode_start_idx: self.guide_bc_len])

def _match_read_to_sgRNA_bcmatch_semimatch(self, R1_seq: str, R2_seq: str):
# This should be adjusted for each experimental recipes.
guide_barcode = self.get_barcode(R1_seq, R2_seq)
bc_start_idx, guide_barcode = self.get_barcode(R1_seq, R2_seq)
bc_match_idx = np.array([])
semimatch_idx = np.array([])
for guide_length in self.guide_lengths:
Expand All @@ -677,7 +687,7 @@ def _match_read_to_sgRNA_bcmatch_semimatch(self, R1_seq: str, R2_seq: str):
)
semimatch_idx = np.append(semimatch_idx, _seq_match)

return (bc_match_idx.astype(int), semimatch_idx.astype(int))
return bc_match_idx.astype(int), semimatch_idx.astype(int), bc_start_idx

def _get_guide_position_seq_of_read(self, seq):
guide_start_idx = self._get_guide_start_idx(seq)
Expand Down
6 changes: 6 additions & 0 deletions bean/mapping/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,12 @@ def _get_input_parser():
help="Guide starts after this sequence in R1",
default="",
)
parser.add_argument(
"--barcode-start-seq",
type=str,
help="Barcode + reporter starts after this sequence in R2, denoted as the sense direction (the same sequence direction as R1).",
default="",
)
parser.add_argument(
"-r", "--count-reporter", help="Count reporter edits.", action="store_true"
)
Expand Down
24 changes: 20 additions & 4 deletions bin/bean-count-samples
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,13 @@ def get_input_parser() -> argparse.Namespace:
+ "Formatted as `sample_id,guide_end_seq`",
default=None,
)
parser.add_argument(
"--barcode-start-seqs-file",
type=str,
help="CSV file path with per-sample `barcode_start_seq` to be used."
+ "Formatted as `sample_id,guide_end_seq`",
default=None,
)

parser.add_argument(
"--rerun", help="Recount each sample", action="store_true", default=False
Expand Down Expand Up @@ -91,6 +98,11 @@ def count_sample(R1: str, R2: str, sample_id: str, args: argparse.Namespace):
and args_dict["guide_end_seqs_tbl"] is not None
):
args_dict["guide_end_seq"] = args_dict["guide_end_seqs_tbl"][sample_id]
if (
"barcode_start_seqs_tbl" in args_dict
and args_dict["barcode_start_seqs_tbl"] is not None
):
args_dict["barcode_start_seq"] = str(args_dict["barcode_start_seqs_tbl"][sample_id])
counter = bean.mp.GuideEditCounter(**args_dict)
if os.path.exists(f"{counter.output_dir}.h5ad") and not args_dict["rerun"]:
screen = bean.read_h5ad(f"{counter.output_dir}.h5ad")
Expand Down Expand Up @@ -146,7 +158,7 @@ def check_arguments(args: argparse.Namespace) -> argparse.Namespace:
for read_length in first_read_lengths:
_check_read_length(args, read_length, warn)

def _check_return_guide_seqs_tbl(guide_seqs_file, sample_tbl):
def _check_return_guide_seqs_tbl(guide_seqs_file, sample_tbl, label):
"""Check if the provided `guide_[start,end]_seqs_file` contains information about all samples in `sample_tbl`."""
guide_seqs_tbl = pd.read_csv(guide_seqs_file, header=None, dtype=str).fillna("")
if len(guide_seqs_tbl.columns) == 1:
Expand All @@ -155,18 +167,22 @@ def check_arguments(args: argparse.Namespace) -> argparse.Namespace:
sample_has_seq = sample_tbl.iloc[:, 2].isin(guide_seqs_tbl[0])
if not sample_has_seq.all():
raise InputFileError(
f"Sample ID(s) {sample_tbl.iloc[:,2][~sample_has_seq]} not in guides_seqs_file {guide_seqs_tbl}"
f"Sample ID(s) {sample_tbl.iloc[:,2][~sample_has_seq]} not in {label}_seqs_file {guide_seqs_tbl}"
)
guide_seqs_tbl.columns = ["sample", "seq"]
return guide_seqs_tbl.set_index("sample")["seq"]

if args.guide_start_seqs_file is not None:
args.guide_start_seqs_tbl = _check_return_guide_seqs_tbl(
args.guide_start_seqs_file, sample_tbl
args.guide_start_seqs_file, sample_tbl, "guide_start"
)
if args.guide_end_seqs_file is not None:
args.guide_end_seqs_tbl = _check_return_guide_seqs_tbl(
args.guide_end_seqs_file, sample_tbl
args.guide_end_seqs_file, sample_tbl, "guide_end"
)
if args.barcode_start_seqs_file is not None:
args.barcode_start_seqs_tbl = _check_return_guide_seqs_tbl(
args.barcode_start_seqs_file, sample_tbl, "barcode_start"
)
return args

Expand Down
Binary file removed imgs/read_struct.png
Binary file not shown.
Binary file added imgs/sequence_struct.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
16 changes: 14 additions & 2 deletions tests/test_count.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,21 @@ def test_count_samples():
)
except subprocess.CalledProcessError as exc:
raise exc

@pytest.mark.order(5)
def test_count_samples_bcstart():
cmd = "bean-count-samples --input tests/data/sample_list.csv -b A -f tests/data/test_guide_info.csv -o tests/test_res/ -r --barcode-start-seq=GGAA"
try:
subprocess.check_output(
cmd,
shell=True,
universal_newlines=True,
)
except subprocess.CalledProcessError as exc:
raise exc


@pytest.mark.order(5)
@pytest.mark.order(6)
def test_count_samples_tiling():
cmd = "bean-count-samples --input tests/data/sample_list_tiling.csv -b A -f tests/data/test_guide_info_tiling_chrom.csv -o tests/test_res/ -r"
try:
Expand All @@ -54,7 +66,7 @@ def test_count_samples_tiling():
raise exc


@pytest.mark.order(6)
@pytest.mark.order(7)
def test_count_chroms():
cmd = "bean-count --R1 tests/data/test_tiling_R1.fastq --R2 tests/data/test_tiling_R2.fastq -b A -f tests/data/test_guide_info_tiling_chrom.csv -o tests/test_res/ -r"
try:
Expand Down

0 comments on commit 94954c6

Please sign in to comment.