Initial commit: Chai-1 protein structure prediction pipeline for WES
- Nextflow pipeline using chai1 Docker image from Harbor
- S3-based input/output paths (s3://omic/eureka/chai-lab/)
- GPU-accelerated protein folding with MSA support

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
36
tests/test_msa_a3m_tokenization.py
Executable file
36
tests/test_msa_a3m_tokenization.py
Executable file
@@ -0,0 +1,36 @@
|
||||
# Copyright (c) 2024 Chai Discovery, Inc.
|
||||
# Licensed under the Apache License, Version 2.0.
|
||||
# See the LICENSE file for details.
|
||||
"""
|
||||
Test for tokenization
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
|
||||
from chai_lab.data.parsing.msas.a3m import tokenize_sequences_to_arrays
|
||||
from chai_lab.data.residue_constants import residue_types_with_nucleotides_order
|
||||
|
||||
|
||||
def test_tokenization_basic():
    """Tokenize a single plain sequence and compare it to the residue table."""
    test_sequence = "RKDES"

    out, dels = tokenize_sequences_to_arrays([test_sequence])

    # One input sequence of five residues -> both outputs are expected to be 1 x 5.
    assert out.shape == dels.shape
    assert dels.shape == (1, 5)

    # Expected token ids, looked up residue by residue in the shared ordering.
    expected_tokens = np.array(
        [residue_types_with_nucleotides_order[letter] for letter in test_sequence]
    )
    # `expected_tokens` is 1-D; the comparison broadcasts it against the single row.
    assert np.all(out == expected_tokens)
|
||||
|
||||
|
||||
def test_tokenization_with_insertion():
    """Insertions (lower case) should be ignored."""
    reference = "RKDES"
    # Same upper-case residues as `reference`, with five lower-case
    # characters ("rkdes") inserted between "RK" and "DES".
    with_insertions = "RKrkdesDES"

    tokens, deletions = tokenize_sequences_to_arrays([reference, with_insertions])

    # Two sequences in; the test expects five columns for both outputs.
    assert tokens.shape == deletions.shape
    assert deletions.shape == (2, 5)

    # The test expects both rows to tokenize identically.
    assert np.all(tokens[0] == tokens[1])

    # The test expects all five inserted characters to be counted in a single
    # cell: row 1 (the sequence with insertions), column 2.
    assert deletions.sum() == 5
    assert deletions[1, 2] == 5
|
||||
Reference in New Issue
Block a user