- Nextflow pipeline using chai1 Docker image from Harbor - S3-based input/output paths (s3://omic/eureka/chai-lab/) - GPU-accelerated protein folding with MSA support Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
109 lines
3.6 KiB
Python
Executable File
109 lines
3.6 KiB
Python
Executable File
# Copyright (c) 2024 Chai Discovery, Inc.
|
|
# Licensed under the Apache License, Version 2.0.
|
|
# See the LICENSE file for details.
|
|
from collections import Counter
|
|
from pathlib import Path
|
|
from tempfile import TemporaryDirectory
|
|
|
|
import pytest
|
|
|
|
from chai_lab.chai1 import make_all_atom_feature_context
|
|
from chai_lab.data.parsing.glycans import _glycan_string_to_sugars_and_bonds
|
|
|
|
|
|
@pytest.mark.parametrize("ccd_code", ["MAN", "99K", "FUC"])
|
|
def test_parsing_ccd_codes(ccd_code: str):
|
|
"""Test that various single CCD codes are parsed correctly."""
|
|
res, _ = _glycan_string_to_sugars_and_bonds(ccd_code)
|
|
assert len(res) == 1
|
|
|
|
|
|
def test_complex_parsing():
|
|
glycan = "MAN(6-1 FUC)(4-1 MAN(6-1 MAN(6-1 MAN)))".replace(" ", "")
|
|
sugars, bonds = _glycan_string_to_sugars_and_bonds(glycan)
|
|
assert len(sugars) == 5
|
|
|
|
bond1, bond2, bond3, bond4 = bonds
|
|
|
|
assert bond1.src_sugar_index == 0
|
|
assert bond1.dst_sugar_index == 1
|
|
assert bond1.src_atom == 6
|
|
assert bond1.dst_atom == 1
|
|
assert bond2.src_sugar_index == 0
|
|
assert bond2.dst_sugar_index == 2
|
|
assert bond2.src_atom == 4
|
|
assert bond2.dst_atom == 1
|
|
assert bond3.src_sugar_index == 2
|
|
assert bond3.dst_sugar_index == 3
|
|
assert bond3.src_atom == 6
|
|
assert bond3.dst_atom == 1
|
|
assert bond4.src_sugar_index == 3
|
|
assert bond4.dst_sugar_index == 4
|
|
assert bond4.src_atom == 6
|
|
assert bond4.dst_atom == 1
|
|
|
|
|
|
def test_complex_parsing_2():
|
|
glycan = "MAN(4-1 FUC(4-1 MAN)(6-1 FUC(4-1 MAN)))(6-1 MAN(6-1 MAN(4-1 MAN)(6-1 FUC)))".replace(
|
|
" ", ""
|
|
)
|
|
sugars, bonds = _glycan_string_to_sugars_and_bonds(glycan)
|
|
assert len(sugars) == 9
|
|
|
|
expected_bonds = [
|
|
(0, 1),
|
|
(1, 2),
|
|
(1, 3),
|
|
(3, 4),
|
|
(0, 5),
|
|
(5, 6),
|
|
(6, 7),
|
|
(6, 8),
|
|
]
|
|
for (expected_src, expected_dst), bond in zip(expected_bonds, bonds, strict=True):
|
|
assert bond.src_sugar_index == expected_src
|
|
assert bond.dst_sugar_index == expected_dst
|
|
|
|
|
|
def test_glycan_tokenization_with_bond():
|
|
"""Test that tokenization works, and that atoms are dropped as expected."""
|
|
glycan = ">glycan|foo\nNAG(4-1 NAG)\n"
|
|
with TemporaryDirectory() as tmpdir:
|
|
tmp_path = Path(tmpdir)
|
|
|
|
fasta_file = tmp_path / "input.fasta"
|
|
fasta_file.write_text(glycan)
|
|
|
|
output_dir = tmp_path / "out"
|
|
|
|
feature_context = make_all_atom_feature_context(
|
|
fasta_file,
|
|
output_dir=output_dir,
|
|
use_esm_embeddings=False, # Just a test; no need
|
|
)
|
|
|
|
# Each NAG component is C8 H15 N O6 -> 8 + 1 + 6 = 15 heavy atoms
|
|
# The bond between them displaces one oxygen, leaving 2 * 15 - 1 = 29 atoms
|
|
assert feature_context.structure_context.atom_exists_mask.sum() == 29
|
|
# We originally constructed all atoms in dropped the atoms that leave
|
|
assert feature_context.structure_context.atom_exists_mask.numel() == 30
|
|
elements = Counter(
|
|
feature_context.structure_context.atom_ref_element[
|
|
feature_context.structure_context.atom_exists_mask
|
|
].tolist()
|
|
)
|
|
assert elements[6] == 16 # 6 = Carbon
|
|
assert elements[7] == 2 # 7 = Nitrogen
|
|
assert elements[8] == 11 # 8 = Oxygen
|
|
|
|
# Single bond feature between O and C
|
|
left, right = feature_context.structure_context.atom_covalent_bond_indices
|
|
assert left.numel() == right.numel() == 1
|
|
bond_elements = set(
|
|
[
|
|
feature_context.structure_context.atom_ref_element[left].item(),
|
|
feature_context.structure_context.atom_ref_element[right].item(),
|
|
]
|
|
)
|
|
assert bond_elements == {8, 6}
|