Files
chai-lab/tests/test_parsing.py
Olamide Isreal f971fd0e21 Initial commit: Chai-1 protein structure prediction pipeline for WES
- Nextflow pipeline using chai1 Docker image from Harbor
- S3-based input/output paths (s3://omic/eureka/chai-lab/)
- GPU-accelerated protein folding with MSA support

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-16 12:55:08 +01:00

80 lines
2.5 KiB
Python
Executable File

# Copyright (c) 2024 Chai Discovery, Inc.
# Licensed under the Apache License, Version 2.0.
# See the LICENSE file for details.
from pathlib import Path
from tempfile import TemporaryDirectory
from chai_lab.data.parsing.fasta import read_fasta
from chai_lab.data.parsing.input_validation import (
constituents_of_modified_fasta,
identify_potential_entity_types,
)
from chai_lab.data.parsing.structure.entity_type import EntityType
from .example_inputs import example_dna, example_ligands, example_proteins, example_rna
def test_simple_protein_fasta():
    """A plain one-letter protein string splits into one constituent per letter."""
    parts = constituents_of_modified_fasta("RKDES")
    assert parts is not None
    # Compare complete lists: zip() stops at the shorter input, so an
    # element-wise zip comparison would also pass for a truncated or empty
    # result.
    assert list(parts) == ["R", "K", "D", "E", "S"]
def test_modified_protein_fasta():
    """Parenthesised multi-letter codes are parsed as single constituents."""
    parts = constituents_of_modified_fasta("(KCJ)(SEP)(PPN)(B3S)(BAL)(PPN)KX(NH2)")
    assert parts is not None
    expected = ["KCJ", "SEP", "PPN", "B3S", "BAL", "PPN", "K", "X", "NH2"]
    # Compare complete lists: zip() stops at the shorter input, so an
    # element-wise zip comparison would not notice missing trailing
    # constituents.
    assert list(parts) == expected
def test_rna_fasta():
    """An unmodified RNA string splits into one constituent per character."""
    seq = "ACUGACG"
    parts = constituents_of_modified_fasta(seq)
    assert parts is not None
    # list(seq) is the per-character expectation; whole-list equality also
    # checks the length, which a zip()-based comparison silently ignores.
    assert list(parts) == list(seq)
def test_dna_fasta():
    """An unmodified DNA string splits into one constituent per character."""
    seq = "ACGACTAGCAT"
    parts = constituents_of_modified_fasta(seq)
    assert parts is not None
    # list(seq) is the per-character expectation; whole-list equality also
    # checks the length, which a zip()-based comparison silently ignores.
    assert list(parts) == list(seq)
def test_parsing():
    """Every example input lists its own entity type among the candidates."""
    # Pair each expected entity type with the example inputs that must match it.
    cases = (
        (EntityType.LIGAND, example_ligands),
        (EntityType.PROTEIN, example_proteins),
        (EntityType.DNA, example_dna),
        (EntityType.RNA, example_rna),
    )
    for expected_type, examples in cases:
        for example in examples:
            candidates = identify_potential_entity_types(example)
            assert expected_type in candidates
def test_fasta_parsing():
    """read_fasta on a two-entry file returns both records, in file order."""
    contents = ">foo\nRKDES\n>bar\nKEDESRRR"
    with TemporaryDirectory() as workdir:
        fasta_path = Path(workdir, "test.fasta")
        fasta_path.write_text(contents)

        parsed = read_fasta(fasta_path)
        assert len(parsed) == 2

        # Check header and sequence of each record together.
        first, second = parsed[0], parsed[1]
        assert (first.header, first.sequence) == ("foo", "RKDES")
        assert (second.header, second.sequence) == ("bar", "KEDESRRR")
def test_smiles_parsing():
    """A FASTA-style file whose body is a SMILES string yields one record."""
    fasta_text = ">smiles\nCc1cc2nc3c(=O)[nH]c(=O)nc-3n(C[C@H](O)[C@H](O)[C@H](O)CO)c2cc1C"
    with TemporaryDirectory() as workdir:
        fasta_path = Path(workdir, "test.fasta")
        fasta_path.write_text(fasta_text)

        record_count = len(read_fasta(fasta_path))
        assert record_count == 1