Initial commit: Chai-1 protein structure prediction pipeline for WES
- Nextflow pipeline using chai1 Docker image from Harbor
- S3-based input/output paths (s3://omic/eureka/chai-lab/)
- GPU-accelerated protein folding with MSA support

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
79
tests/test_parsing.py
Executable file
79
tests/test_parsing.py
Executable file
@@ -0,0 +1,79 @@
|
||||
# Copyright (c) 2024 Chai Discovery, Inc.
|
||||
# Licensed under the Apache License, Version 2.0.
|
||||
# See the LICENSE file for details.
|
||||
|
||||
from pathlib import Path
|
||||
from tempfile import TemporaryDirectory
|
||||
|
||||
from chai_lab.data.parsing.fasta import read_fasta
|
||||
from chai_lab.data.parsing.input_validation import (
|
||||
constituents_of_modified_fasta,
|
||||
identify_potential_entity_types,
|
||||
)
|
||||
from chai_lab.data.parsing.structure.entity_type import EntityType
|
||||
|
||||
from .example_inputs import example_dna, example_ligands, example_proteins, example_rna
|
||||
|
||||
|
||||
def test_simple_protein_fasta():
    """Check that a plain one-letter protein string is split into its residues."""
    parts = constituents_of_modified_fasta("RKDES")
    assert parts is not None
    # Compare the whole sequence at once: zip() stops at the shorter input, so
    # a pairwise all(...) check would still pass if ``parts`` were empty or
    # only a prefix of the expected residues.
    assert list(parts) == ["R", "K", "D", "E", "S"]
|
||||
|
||||
|
||||
def test_modified_protein_fasta():
    """Check that parenthesised multi-letter codes and bare letters are split apart."""
    parts = constituents_of_modified_fasta("(KCJ)(SEP)(PPN)(B3S)(BAL)(PPN)KX(NH2)")
    assert parts is not None
    expected = ["KCJ", "SEP", "PPN", "B3S", "BAL", "PPN", "K", "X", "NH2"]
    # Whole-list equality also verifies the length; a zip()-based pairwise
    # comparison silently ignores missing or extra trailing constituents.
    assert list(parts) == expected
|
||||
|
||||
|
||||
def test_rna_fasta():
    """Check that an RNA string is split into one constituent per character."""
    seq = "ACUGACG"
    parts = constituents_of_modified_fasta(seq)
    assert parts is not None
    # list(seq) yields the individual characters; comparing full lists catches
    # length mismatches that a zip()-based pairwise check would let through.
    assert list(parts) == list(seq)
|
||||
|
||||
|
||||
def test_dna_fasta():
    """Check that a DNA string is split into one constituent per character."""
    seq = "ACGACTAGCAT"
    parts = constituents_of_modified_fasta(seq)
    assert parts is not None
    # list(seq) yields the individual characters; comparing full lists catches
    # length mismatches that a zip()-based pairwise check would let through.
    assert list(parts) == list(seq)
|
||||
|
||||
|
||||
def test_parsing():
    """Every example input must list its own entity type among the candidates."""
    # Pair each expected entity type with the example inputs that should
    # match it; the order mirrors ligand -> protein -> DNA -> RNA.
    cases = (
        (EntityType.LIGAND, example_ligands),
        (EntityType.PROTEIN, example_proteins),
        (EntityType.DNA, example_dna),
        (EntityType.RNA, example_rna),
    )
    for expected_type, examples in cases:
        for example in examples:
            assert expected_type in identify_potential_entity_types(example)
|
||||
|
||||
|
||||
def test_fasta_parsing():
    """Write a two-record FASTA file and check what read_fasta returns for it."""
    fasta_text = """>foo\nRKDES\n>bar\nKEDESRRR"""
    with TemporaryDirectory() as scratch_dir:
        fasta_path = Path(scratch_dir, "test.fasta")
        fasta_path.write_text(fasta_text)
        records = read_fasta(fasta_path)

    assert len(records) == 2

    # Records are expected in file order, with the leading ">" stripped
    # from the header.
    first = records[0]
    assert first.header == "foo"
    assert first.sequence == "RKDES"

    second = records[1]
    assert second.header == "bar"
    assert second.sequence == "KEDESRRR"
|
||||
|
||||
|
||||
def test_smiles_parsing():
    """Check that a FASTA file whose body is a SMILES string yields one record."""
    fasta_text = ">smiles\nCc1cc2nc3c(=O)[nH]c(=O)nc-3n(C[C@H](O)[C@H](O)[C@H](O)CO)c2cc1C"
    with TemporaryDirectory() as scratch_dir:
        fasta_path = Path(scratch_dir, "test.fasta")
        fasta_path.write_text(fasta_text)
        records = read_fasta(fasta_path)

    # Only the record count is checked here, not the parsed contents.
    assert len(records) == 1
|
||||
Reference in New Issue
Block a user