chai-lab/tests/test_parsing.py

# Copyright (c) 2024 Chai Discovery, Inc.
# Licensed under the Apache License, Version 2.0.
# See the LICENSE file for details.

from pathlib import Path
from tempfile import TemporaryDirectory

from chai_lab.data.parsing.fasta import read_fasta
from chai_lab.data.parsing.input_validation import (
    constituents_of_modified_fasta,
    identify_potential_entity_types,
)
from chai_lab.data.parsing.structure.entity_type import EntityType

from .example_inputs import example_dna, example_ligands, example_proteins, example_rna


def test_simple_protein_fasta():
    parts = constituents_of_modified_fasta("RKDES")
    assert parts is not None
    assert all(x == y for x, y in zip(parts, ["R", "K", "D", "E", "S"]))


def test_modified_protein_fasta():
    parts = constituents_of_modified_fasta("(KCJ)(SEP)(PPN)(B3S)(BAL)(PPN)KX(NH2)")
    assert parts is not None
    expected = ["KCJ", "SEP", "PPN", "B3S", "BAL", "PPN", "K", "X", "NH2"]
    assert all(x == y for x, y in zip(parts, expected))


def test_rna_fasta():
    seq = "ACUGACG"
    parts = constituents_of_modified_fasta(seq)
    assert parts is not None
    assert all(x == y for x, y in zip(parts, seq))


def test_dna_fasta():
    seq = "ACGACTAGCAT"
    parts = constituents_of_modified_fasta(seq)
    assert parts is not None
    assert all(x == y for x, y in zip(parts, seq))


def test_parsing():
    for ligand in example_ligands:
        assert EntityType.LIGAND in identify_potential_entity_types(ligand)

    for protein in example_proteins:
        assert EntityType.PROTEIN in identify_potential_entity_types(protein)

    for dna in example_dna:
        assert EntityType.DNA in identify_potential_entity_types(dna)

    for rna in example_rna:
        assert EntityType.RNA in identify_potential_entity_types(rna)


def test_fasta_parsing():
    test_string = """>foo\nRKDES\n>bar\nKEDESRRR"""
    with TemporaryDirectory() as tmpdir:
        fa_file = Path(tmpdir) / "test.fasta"
        fa_file.write_text(test_string)
        records = read_fasta(fa_file)

    assert len(records) == 2
    assert records[0].header == "foo"
    assert records[0].sequence == "RKDES"
    assert records[1].header == "bar"
    assert records[1].sequence == "KEDESRRR"


def test_smiles_parsing():
    smiles = ">smiles\nCc1cc2nc3c(=O)[nH]c(=O)nc-3n(C[C@H](O)[C@H](O)[C@H](O)CO)c2cc1C"
    with TemporaryDirectory() as tmpdir:
        fa_file = Path(tmpdir) / "test.fasta"
        fa_file.write_text(smiles)
        records = read_fasta(fa_file)
    assert len(records) == 1