Skip to content

Commit

Permalink
Merge pull request #39 from jonathanking/dev
Browse files Browse the repository at this point in the history
Create a fix for Issue #38.
  • Loading branch information
jonathanking authored Nov 3, 2021
2 parents ec5b5b6 + a5293c3 commit f76150d
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 0 deletions.
5 changes: 5 additions & 0 deletions sidechainnet/utils/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import requests
import tqdm

import sidechainnet as scn
from sidechainnet.create import format_sidechainnet_path
from sidechainnet.dataloaders.collate import prepare_dataloaders

Expand Down Expand Up @@ -261,6 +262,10 @@ def load(casp_version=12,
local_path = _download_sidechainnet(casp_version, thinning, scn_dir)

scn_dict = _load_dict(local_path)

# Patch for splitting 1GJJ_1_A into two entries, see Issue #38
scn_dict = scn.utils.manual_adjustment._repair_1GJJ_1_A(scn_dict)

scn_dict = filter_dictionary_by_resolution(scn_dict, threshold=filter_by_resolution)
if complete_structures_only:
scn_dict = filter_dictionary_by_missing_residues(scn_dict)
Expand Down
41 changes: 41 additions & 0 deletions sidechainnet/utils/manual_adjustment.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from sidechainnet.structure.build_info import NUM_COORDS_PER_RES
from sidechainnet.utils.align import binary_mask_to_str
import sidechainnet as scn


def manually_correct_mask(pnid, pn_entry, mask):
Expand Down Expand Up @@ -56,3 +57,43 @@ def manually_adjust_data(pnid, sc_entry):
sc_entry["crd"] = sc_entry["crd"][:-NUM_COORDS_PER_RES * 2]

return sc_entry


def _repair_1GJJ_1_A(datadict):
    """Repair a raw SidechainNet data dictionary by splitting entry 1GJJ_1_A in two.

    The file uploaded to RCSB PDB contains two overlapping domains so a manual
    adjustment is required. See
    https://github.com/jonathanking/sidechainnet/issues/38 for more details.

    Every occurrence of the ID ``1GJJ_1_A`` in any split listed in
    ``scn.DATA_SPLITS`` is rewritten in place: the existing slot keeps positions
    [0, 50) and is renamed ``1GJJ_1_A1``, while a new entry holding positions
    [110, 153) is appended to the end of the same split as ``1GJJ_1_A2``.

    Args:
        datadict: Dictionary mapping each split name to a dictionary of
            per-entry lists (keys such as 'ids', 'res', 'ums', 'crd', ...).
            It is modified in place.

    Returns:
        The same ``datadict`` object, after modification.
    """
    target_id = '1GJJ_1_A'
    # Position ranges (in sequence positions) kept for each of the two new entries.
    first_start, first_end = 0, 50
    second_start, second_end = 110, 153
    first_range = slice(first_start, first_end)
    second_range = slice(second_start, second_end)

    # Locate every occurrence before mutating anything, so that the entries
    # appended below are never revisited.
    found_splits_indices = []
    for split in scn.DATA_SPLITS:
        found_splits_indices.extend(
            (split, idx)
            for idx, cur_id in enumerate(datadict[split]["ids"])
            if cur_id == target_id)

    # Carefully split into two entries containing the appropriate data ranges.
    for split, idx in found_splits_indices:
        for key, values in datadict[split].items():
            original = values[idx]
            if key == 'res':
                # Both halves share the original entry's value unchanged.
                values.append(original)
            elif key == 'ids':
                values.append(original + "2")
                values[idx] = original + "1"
            elif key == 'ums':
                # Stored as one whitespace-separated string; slice by token.
                tokens = original.split()
                values.append(" ".join(tokens[second_range]))
                values[idx] = " ".join(tokens[first_range])
            elif key == 'crd':
                # Coordinates hold NUM_COORDS_PER_RES rows per position, so the
                # position bounds must be scaled accordingly.
                values.append(original[second_start * NUM_COORDS_PER_RES:
                                       second_end * NUM_COORDS_PER_RES])
                values[idx] = original[first_start * NUM_COORDS_PER_RES:
                                       first_end * NUM_COORDS_PER_RES]
            else:
                # Every other key is sliced directly by position.
                values.append(original[second_range])
                values[idx] = original[first_range]
    return datadict

0 comments on commit f76150d

Please sign in to comment.