Initial commit: Chai-1 protein structure prediction pipeline for WES

- Nextflow pipeline using chai1 Docker image from Harbor - S3-based input/output paths (s3://omic/eureka/chai-lab/) - GPU-accelerated protein folding with MSA support Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-16 12:55:08 +01:00
commit f971fd0e21
26 changed files with 1289 additions and 0 deletions
--- a/tests/test_glycans.py
+++ b/tests/test_glycans.py
@@ -0,0 +1,108 @@
+# Copyright (c) 2024 Chai Discovery, Inc.
+# Licensed under the Apache License, Version 2.0.
+# See the LICENSE file for details.
+from collections import Counter
+from pathlib import Path
+from tempfile import TemporaryDirectory
+
+import pytest
+
+from chai_lab.chai1 import make_all_atom_feature_context
+from chai_lab.data.parsing.glycans import _glycan_string_to_sugars_and_bonds
+
+
+@pytest.mark.parametrize("ccd_code", ["MAN", "99K", "FUC"])
+def test_parsing_ccd_codes(ccd_code: str):
+    """Test that various single CCD codes are parsed correctly."""
+    res, _ = _glycan_string_to_sugars_and_bonds(ccd_code)
+    assert len(res) == 1
+
+
+def test_complex_parsing():
+    glycan = "MAN(6-1 FUC)(4-1 MAN(6-1 MAN(6-1 MAN)))".replace(" ", "")
+    sugars, bonds = _glycan_string_to_sugars_and_bonds(glycan)
+    assert len(sugars) == 5
+
+    bond1, bond2, bond3, bond4 = bonds
+
+    assert bond1.src_sugar_index == 0
+    assert bond1.dst_sugar_index == 1
+    assert bond1.src_atom == 6
+    assert bond1.dst_atom == 1
+    assert bond2.src_sugar_index == 0
+    assert bond2.dst_sugar_index == 2
+    assert bond2.src_atom == 4
+    assert bond2.dst_atom == 1
+    assert bond3.src_sugar_index == 2
+    assert bond3.dst_sugar_index == 3
+    assert bond3.src_atom == 6
+    assert bond3.dst_atom == 1
+    assert bond4.src_sugar_index == 3
+    assert bond4.dst_sugar_index == 4
+    assert bond4.src_atom == 6
+    assert bond4.dst_atom == 1
+
+
+def test_complex_parsing_2():
+    glycan = "MAN(4-1 FUC(4-1 MAN)(6-1 FUC(4-1 MAN)))(6-1 MAN(6-1 MAN(4-1 MAN)(6-1 FUC)))".replace(
+        " ", ""
+    )
+    sugars, bonds = _glycan_string_to_sugars_and_bonds(glycan)
+    assert len(sugars) == 9
+
+    expected_bonds = [
+        (0, 1),
+        (1, 2),
+        (1, 3),
+        (3, 4),
+        (0, 5),
+        (5, 6),
+        (6, 7),
+        (6, 8),
+    ]
+    for (expected_src, expected_dst), bond in zip(expected_bonds, bonds, strict=True):
+        assert bond.src_sugar_index == expected_src
+        assert bond.dst_sugar_index == expected_dst
+
+
+def test_glycan_tokenization_with_bond():
+    """Test that tokenization works, and that atoms are dropped as expected."""
+    glycan = ">glycan|foo\nNAG(4-1 NAG)\n"
+    with TemporaryDirectory() as tmpdir:
+        tmp_path = Path(tmpdir)
+
+        fasta_file = tmp_path / "input.fasta"
+        fasta_file.write_text(glycan)
+
+        output_dir = tmp_path / "out"
+
+        feature_context = make_all_atom_feature_context(
+            fasta_file,
+            output_dir=output_dir,
+            use_esm_embeddings=False,  # Just a test; no need
+        )
+
+    # Each NAG component is C8 H15 N O6 -> 8 + 1 + 6 = 15 heavy atoms
+    # The bond between them displaces one oxygen, leaving 2 * 15 - 1 = 29 atoms
+    assert feature_context.structure_context.atom_exists_mask.sum() == 29
+    # We originally constructed all atoms in dropped the atoms that leave
+    assert feature_context.structure_context.atom_exists_mask.numel() == 30
+    elements = Counter(
+        feature_context.structure_context.atom_ref_element[
+            feature_context.structure_context.atom_exists_mask
+        ].tolist()
+    )
+    assert elements[6] == 16  # 6 = Carbon
+    assert elements[7] == 2  # 7 = Nitrogen
+    assert elements[8] == 11  # 8 = Oxygen
+
+    # Single bond feature between O and C
+    left, right = feature_context.structure_context.atom_covalent_bond_indices
+    assert left.numel() == right.numel() == 1
+    bond_elements = set(
+        [
+            feature_context.structure_context.atom_ref_element[left].item(),
+            feature_context.structure_context.atom_ref_element[right].item(),
+        ]
+    )
+    assert bond_elements == {8, 6}