Files
chai-lab/tests/test_msa_a3m_tokenization.py
Olamide Isreal f971fd0e21 Initial commit: Chai-1 protein structure prediction pipeline for WES
- Nextflow pipeline using chai1 Docker image from Harbor
- S3-based input/output paths (s3://omic/eureka/chai-lab/)
- GPU-accelerated protein folding with MSA support

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-16 12:55:08 +01:00

37 lines
1019 B
Python
Executable File

# Copyright (c) 2024 Chai Discovery, Inc.
# Licensed under the Apache License, Version 2.0.
# See the LICENSE file for details.
"""
Tests for tokenizing A3M MSA sequences into arrays.
"""
import numpy as np
from chai_lab.data.parsing.msas.a3m import tokenize_sequences_to_arrays
from chai_lab.data.residue_constants import residue_types_with_nucleotides_order
def test_tokenization_basic():
    """A plain upper-case sequence tokenizes to its residue-type indices."""
    sequence = "RKDES"
    tokens, deletions = tokenize_sequences_to_arrays([sequence])

    # One input sequence of five residues: both outputs are expected to be 1 x 5.
    assert tokens.shape == (1, 5)
    assert deletions.shape == (1, 5)

    # Each residue should map to its entry in the residue-type lookup table.
    # The 1-D expectation is compared against the 2-D output via broadcasting.
    expected = np.array(
        [residue_types_with_nucleotides_order[residue] for residue in sequence]
    )
    matches = tokens == expected
    assert np.all(matches)
def test_tokenization_with_insertion():
    """Insertions (lower case) should be ignored."""
    reference = "RKDES"
    # Same residues as the reference, with "rkdes" inserted after "RK".
    with_insertion = "RKrkdesDES"
    sequences = [reference, with_insertion]

    tokens, deletions = tokenize_sequences_to_arrays(sequences)

    # Two sequences, five aligned (upper-case) positions each.
    assert tokens.shape == (2, 5)
    assert deletions.shape == (2, 5)

    # Lower-case characters must not affect the tokens: both rows are equal.
    assert np.all(tokens[0] == tokens[1])

    # The five inserted characters are expected to be counted at row 1,
    # column 2 (the first upper-case residue after the insertion), and
    # nowhere else.
    assert deletions[1, 2] == 5
    assert deletions.sum() == 5