# chai-lab/tests/test_glycans.py

# Copyright (c) 2024 Chai Discovery, Inc.
# Licensed under the Apache License, Version 2.0.
# See the LICENSE file for details.
from collections import Counter
from pathlib import Path
from tempfile import TemporaryDirectory

import pytest

from chai_lab.chai1 import make_all_atom_feature_context
from chai_lab.data.parsing.glycans import _glycan_string_to_sugars_and_bonds


@pytest.mark.parametrize("ccd_code", ["MAN", "99K", "FUC"])
def test_parsing_ccd_codes(ccd_code: str):
    """A glycan string holding a lone CCD code must parse to a single entry."""
    # The parser returns a pair; only the first element is inspected here.
    parsed_sugars, _unused_bonds = _glycan_string_to_sugars_and_bonds(ccd_code)
    assert len(parsed_sugars) == 1


def test_complex_parsing():
    """Check sugar count and every bond parsed from a branched glycan string.

    The input has five sugars; each parenthesised ``(src-dst ...)`` group is
    expected to produce one bond whose sugar indices and atom numbers are
    verified below.
    """
    glycan_string = "MAN(6-1 FUC)(4-1 MAN(6-1 MAN(6-1 MAN)))".replace(" ", "")
    sugars, bonds = _glycan_string_to_sugars_and_bonds(glycan_string)
    assert len(sugars) == 5

    # Unpacking into four names also enforces that exactly four bonds exist.
    first, second, third, fourth = bonds

    # One row per bond:
    # (src_sugar_index, dst_sugar_index, src_atom, dst_atom)
    expected_rows = (
        (0, 1, 6, 1),
        (0, 2, 4, 1),
        (2, 3, 6, 1),
        (3, 4, 6, 1),
    )
    for bond, row in zip((first, second, third, fourth), expected_rows):
        want_src_sugar, want_dst_sugar, want_src_atom, want_dst_atom = row
        assert bond.src_sugar_index == want_src_sugar
        assert bond.dst_sugar_index == want_dst_sugar
        assert bond.src_atom == want_src_atom
        assert bond.dst_atom == want_dst_atom


def test_complex_parsing_2():
    """Check sugar-to-sugar connectivity for a nine-sugar, multiply branched glycan.

    Only the source/destination sugar indices of each bond are verified here.
    """
    spaced = "MAN(4-1 FUC(4-1 MAN)(6-1 FUC(4-1 MAN)))(6-1 MAN(6-1 MAN(4-1 MAN)(6-1 FUC)))"
    glycan = spaced.replace(" ", "")

    sugars, bonds = _glycan_string_to_sugars_and_bonds(glycan)
    assert len(sugars) == 9

    # (src_sugar_index, dst_sugar_index) for each bond, in the order the
    # parser is expected to emit them.
    expected_links = [
        (0, 1),
        (1, 2),
        (1, 3),
        (3, 4),
        (0, 5),
        (5, 6),
        (6, 7),
        (6, 8),
    ]
    # strict=True makes a mismatch in the number of bonds fail the test.
    for link, parsed_bond in zip(expected_links, bonds, strict=True):
        want_src, want_dst = link
        assert parsed_bond.src_sugar_index == want_src
        assert parsed_bond.dst_sugar_index == want_dst


def test_glycan_tokenization_with_bond():
    """Test that tokenization works, and that atoms are dropped as expected.

    Writes a FASTA file containing a single glycan record (``NAG(4-1 NAG)``),
    builds a feature context from it, and then checks the atom mask, the
    element composition of the surviving atoms, and the single covalent bond
    feature.
    """
    glycan = ">glycan|foo\nNAG(4-1 NAG)\n"
    with TemporaryDirectory() as tmpdir:
        tmp_path = Path(tmpdir)

        fasta_file = tmp_path / "input.fasta"
        fasta_file.write_text(glycan)

        output_dir = tmp_path / "out"

        feature_context = make_all_atom_feature_context(
            fasta_file,
            output_dir=output_dir,
            use_esm_embeddings=False,  # Just a test; no need
        )

    # Every assertion below reads from the structure context, so bind it once.
    structure = feature_context.structure_context

    # Each NAG component is C8 H15 N O6 -> 8 + 1 + 6 = 15 heavy atoms
    # The bond between them displaces one oxygen, leaving 2 * 15 - 1 = 29 atoms
    assert structure.atom_exists_mask.sum() == 29
    # We originally constructed all atoms and then dropped the atoms that leave
    assert structure.atom_exists_mask.numel() == 30

    # Count elements (by atomic number) over the atoms that still exist.
    elements = Counter(
        structure.atom_ref_element[structure.atom_exists_mask].tolist()
    )
    assert elements[6] == 16  # 6 = Carbon
    assert elements[7] == 2  # 7 = Nitrogen
    assert elements[8] == 11  # 8 = Oxygen

    # Single bond feature between O and C
    left, right = structure.atom_covalent_bond_indices
    assert left.numel() == right.numel() == 1
    # A set literal: the bond must join one oxygen and one carbon, in any order.
    bond_elements = {
        structure.atom_ref_element[left].item(),
        structure.atom_ref_element[right].item(),
    }
    assert bond_elements == {8, 6}