chai-lab/tests/test_msa_a3m_tokenization.py

# Copyright (c) 2024 Chai Discovery, Inc.
# Licensed under the Apache License, Version 2.0.
# See the LICENSE file for details.
"""
Tests for tokenizing A3M MSA sequences via ``tokenize_sequences_to_arrays``.
"""

import numpy as np

from chai_lab.data.parsing.msas.a3m import tokenize_sequences_to_arrays
from chai_lab.data.residue_constants import residue_types_with_nucleotides_order


def test_tokenization_basic():
    """Tokenize a single upper-case sequence and check it against the lookup table.

    Both returned arrays must have one row of five entries, and every token
    must equal the value stored in ``residue_types_with_nucleotides_order``
    for the corresponding character of the input.
    """
    query = "RKDES"
    tokens, deletions = tokenize_sequences_to_arrays([query])

    # One input sequence of five characters -> one row, five columns.
    expected_shape = (1, 5)
    assert tokens.shape == deletions.shape
    assert deletions.shape == expected_shape

    # Build the reference tokens character by character; the (5,) reference
    # broadcasts against the (1, 5) output in the comparison below.
    reference = np.array(
        [residue_types_with_nucleotides_order[char] for char in query]
    )
    matches = tokens == reference
    assert np.all(matches)


def test_tokenization_with_insertion():
    """Lower-case insertions must not show up as tokens.

    The second sequence is the first one with five lower-case characters
    inserted between ``K`` and ``D``. The test expects both rows to tokenize
    identically, and the deletions array to hold a single non-zero entry of 5
    at row 1, column 2.
    """
    plain = "RKDES"
    with_insertion = "RKrkdesDES"

    tokens, deletions = tokenize_sequences_to_arrays([plain, with_insertion])

    # Two input sequences, each contributing five tokenized positions.
    expected_shape = (2, 5)
    assert tokens.shape == deletions.shape
    assert deletions.shape == expected_shape

    # The inserted characters must leave the token row unchanged.
    plain_row, inserted_row = tokens
    assert np.all(plain_row == inserted_row)

    # A total of 5 that all sits in one cell means every other entry sums to 0.
    total_deletions = deletions.sum()
    assert total_deletions == 5
    assert deletions[1, 2] == 5