Initial commit: Chai-1 protein structure prediction pipeline for WES

- Nextflow pipeline using chai1 Docker image from Harbor
- S3-based input/output paths (s3://omic/eureka/chai-lab/)
- GPU-accelerated protein folding with MSA support

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-03-16 12:55:08 +01:00
commit f971fd0e21
26 changed files with 1289 additions and 0 deletions

108
tests/test_glycans.py Executable file
View File

@@ -0,0 +1,108 @@
# Copyright (c) 2024 Chai Discovery, Inc.
# Licensed under the Apache License, Version 2.0.
# See the LICENSE file for details.
from collections import Counter
from pathlib import Path
from tempfile import TemporaryDirectory
import pytest
from chai_lab.chai1 import make_all_atom_feature_context
from chai_lab.data.parsing.glycans import _glycan_string_to_sugars_and_bonds
@pytest.mark.parametrize("ccd_code", ["MAN", "99K", "FUC"])
def test_parsing_ccd_codes(ccd_code: str):
"""Test that various single CCD codes are parsed correctly."""
res, _ = _glycan_string_to_sugars_and_bonds(ccd_code)
assert len(res) == 1
def test_complex_parsing():
glycan = "MAN(6-1 FUC)(4-1 MAN(6-1 MAN(6-1 MAN)))".replace(" ", "")
sugars, bonds = _glycan_string_to_sugars_and_bonds(glycan)
assert len(sugars) == 5
bond1, bond2, bond3, bond4 = bonds
assert bond1.src_sugar_index == 0
assert bond1.dst_sugar_index == 1
assert bond1.src_atom == 6
assert bond1.dst_atom == 1
assert bond2.src_sugar_index == 0
assert bond2.dst_sugar_index == 2
assert bond2.src_atom == 4
assert bond2.dst_atom == 1
assert bond3.src_sugar_index == 2
assert bond3.dst_sugar_index == 3
assert bond3.src_atom == 6
assert bond3.dst_atom == 1
assert bond4.src_sugar_index == 3
assert bond4.dst_sugar_index == 4
assert bond4.src_atom == 6
assert bond4.dst_atom == 1
def test_complex_parsing_2():
glycan = "MAN(4-1 FUC(4-1 MAN)(6-1 FUC(4-1 MAN)))(6-1 MAN(6-1 MAN(4-1 MAN)(6-1 FUC)))".replace(
" ", ""
)
sugars, bonds = _glycan_string_to_sugars_and_bonds(glycan)
assert len(sugars) == 9
expected_bonds = [
(0, 1),
(1, 2),
(1, 3),
(3, 4),
(0, 5),
(5, 6),
(6, 7),
(6, 8),
]
for (expected_src, expected_dst), bond in zip(expected_bonds, bonds, strict=True):
assert bond.src_sugar_index == expected_src
assert bond.dst_sugar_index == expected_dst
def test_glycan_tokenization_with_bond():
"""Test that tokenization works, and that atoms are dropped as expected."""
glycan = ">glycan|foo\nNAG(4-1 NAG)\n"
with TemporaryDirectory() as tmpdir:
tmp_path = Path(tmpdir)
fasta_file = tmp_path / "input.fasta"
fasta_file.write_text(glycan)
output_dir = tmp_path / "out"
feature_context = make_all_atom_feature_context(
fasta_file,
output_dir=output_dir,
use_esm_embeddings=False, # Just a test; no need
)
# Each NAG component is C8 H15 N O6 -> 8 + 1 + 6 = 15 heavy atoms
# The bond between them displaces one oxygen, leaving 2 * 15 - 1 = 29 atoms
assert feature_context.structure_context.atom_exists_mask.sum() == 29
# We originally constructed all atoms in dropped the atoms that leave
assert feature_context.structure_context.atom_exists_mask.numel() == 30
elements = Counter(
feature_context.structure_context.atom_ref_element[
feature_context.structure_context.atom_exists_mask
].tolist()
)
assert elements[6] == 16 # 6 = Carbon
assert elements[7] == 2 # 7 = Nitrogen
assert elements[8] == 11 # 8 = Oxygen
# Single bond feature between O and C
left, right = feature_context.structure_context.atom_covalent_bond_indices
assert left.numel() == right.numel() == 1
bond_elements = set(
[
feature_context.structure_context.atom_ref_element[left].item(),
feature_context.structure_context.atom_ref_element[right].item(),
]
)
assert bond_elements == {8, 6}