Initial commit: Chai-1 protein structure prediction pipeline for WES

- Nextflow pipeline using chai1 Docker image from Harbor
- S3-based input/output paths (s3://omic/eureka/chai-lab/)
- GPU-accelerated protein folding with MSA support

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-03-16 12:55:08 +01:00
commit f971fd0e21
26 changed files with 1289 additions and 0 deletions

4
.gitignore vendored Normal file
View File

@@ -0,0 +1,4 @@
# Local pipeline results
output/
# Nextflow per-task scratch directories
work/
# Nextflow run metadata/cache
.nextflow/
# Nextflow run logs (.nextflow.log, .nextflow.log.1, ...); a pattern without a slash matches at any depth.
# NOTE(review): input/.nextflow.log is tracked in this commit even though this pattern matches it —
# it looks like an accidental add; consider removing it from the repository.
.nextflow.log*

56
Dockerfile Executable file
View File

@@ -0,0 +1,56 @@
# GPU runtime image for the Chai-1 pipeline: CUDA 12.1 + cuDNN 8 on Ubuntu 22.04,
# chai_lab 0.5.2 and a PyTorch >= 2.6 build, with entrypoint.sh as the entry point.
FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04

# Locale and Python runtime settings. The bytecode and mypy caches are redirected to /tmp
# so they never end up in the working directory.
ENV DEBIAN_FRONTEND=noninteractive \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8 \
    PYTHONUNBUFFERED=TRUE \
    PYTHONFAULTHANDLER=1 \
    PYTHONPYCACHEPREFIX='/tmp/.chai_pycache' \
    MYPY_CACHE_DIR='/tmp/.chai_mypy_cache'

# System dependencies; the apt lists are removed in the same layer to keep it small.
RUN apt-get update && apt-get install -y --no-install-recommends \
        git \
        wget \
        curl \
        ca-certificates \
        python3.10 \
        python3.10-dev \
        python3-pip \
        build-essential \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /workspace

# Upgrade the packaging tools before installing anything else.
RUN pip3 install --no-cache-dir --upgrade pip setuptools wheel

# Install chai_lab (which pulls in an older PyTorch), then swap in PyTorch >= 2.6 and a
# compatible transformers. Everything happens in ONE layer on purpose: when the uninstall
# ran in its own RUN, the old multi-GB torch wheel stayed baked into the previous layer.
# The ">=2.6" constraint states at install time what the verification step below asserts.
RUN pip3 install --no-cache-dir chai_lab==0.5.2 \
    && pip3 uninstall -y torch torchvision torchaudio \
    && pip3 install --no-cache-dir "torch>=2.6" torchvision torchaudio \
    && pip3 install --no-cache-dir --upgrade "transformers>=4.30.0"

# Build-time smoke test: fail the build if PyTorch is too old or a key import is broken.
# The CUDA line only prints its result (it is not asserted), so a GPU-less build host is fine.
RUN python3 -c "import torch; v=torch.__version__.split('+')[0]; print(f'PyTorch: {v}'); major,minor=map(int,v.split('.')[:2]); assert (major==2 and minor>=6) or major>2, f'PyTorch {v} is too old, need 2.6+'" && \
    python3 -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')" && \
    python3 -c "from transformers import EsmModel; print('transformers: OK')" && \
    python3 -c "import typer; print('typer: OK')" && \
    python3 -c "import chai_lab; print('chai_lab: OK')" && \
    chai --help

# entrypoint.sh checks for a GPU and then exec's whatever command the container was given.
COPY entrypoint.sh /workspace/
RUN chmod +x /workspace/entrypoint.sh

ENTRYPOINT ["/workspace/entrypoint.sh"]

83
Dockerfile.chailab Executable file
View File

@@ -0,0 +1,83 @@
# Base/development image for chai-lab: Ubuntu 22.04 with common CLI tooling, Python 3.10
# and a uv-managed virtualenv at /opt/venv holding everything listed in requirements.in.
# Both RUN steps below use `--mount=type=cache`, which requires BuildKit.
FROM ubuntu:22.04 AS chailab-baseimage
ENV \
LANG=C.UTF-8 \
LC_ALL=C.UTF-8 \
# config for apt
DEBIAN_FRONTEND=noninteractive \
# default editor for git cli
EDITOR=vim \
# keep (large) mypy cache outside of working tree
MYPY_CACHE_DIR='/tmp/.chai_lab_mypy_cache' \
# always flush output from python
PYTHONUNBUFFERED=TRUE \
# enable fault handler (print tracebacks even after segfault or NCCL errors).
PYTHONFAULTHANDLER=1 \
# keep __pycache__ out of working tree
PYTHONPYCACHEPREFIX='/tmp/.chai_lab_pycache'
# System packages plus one-off OS configuration, all in a single layer.
# The cache mount keeps /var/cache/apt between builds instead of storing it in the image.
RUN --mount=type=cache,target=/var/cache/apt \
apt-get -qq update \
&& apt-get -qq install -y \
# common things
gnupg ca-certificates wget git curl aria2 lsb-release tzdata \
rsync sudo tree htop tmux unzip \
clang \
# for direct ssh into container
openssh-server socat \
# provides `fuser` command
psmisc \
# RDMA/InfiniBand
libibverbs1 librdmacm1 \
# text editors, needed by git cli
nano vim \
build-essential libstdc++6 \
# python
python3.10 python3.10-dev \
# (run continues)
# stop git from complaining about dubious ownership.
&& git config --global --add safe.directory "*" \
#
# cuda softlinking is needed in podman, but not docker
&& ln -s /lib/x86_64-linux-gnu/libcuda.so.1 /lib/x86_64-linux-gnu/libcuda.so \
&& ldconfig /lib/x86_64-linux-gnu/ \
# setup timezone, to $TZ, ubuntu-specific
# NOTE(review): the symlink step below is commented out and TZ is not set anywhere in this
# file, so only the non-interactive tzdata reconfigure actually runs.
# && ln -fs /usr/share/zoneinfo/$TZ /etc/localtime \
&& dpkg-reconfigure --frontend noninteractive tzdata \
# change default shell to bash (has no effect during building)
&& chsh -s /bin/bash
ENV \
# expose CUDA libraries. Now that we don't build anything this is likely redundant
LD_LIBRARY_PATH="/usr/local/cuda/lib64/stubs/:${LD_LIBRARY_PATH:-}" \
# Set uv timeout to larger value to account for slow download time of nvidia-cudnn-cu12
UV_HTTP_TIMEOUT=1000 \
# where virtual env will be installed
VIRTUAL_ENV=/opt/venv
# Install dependencies in virtualenv
COPY ./requirements.in /tmp/requirements.in
# from https://pythonspeed.com/articles/activate-virtualenv-dockerfile/
# a trick to have virtualenv "always activated"
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
# Installs uv 0.5.4 through its install script, creates the virtualenv at $VIRTUAL_ENV without
# letting uv download a Python interpreter (--no-python-downloads), then installs uv, pip and
# the contents of requirements.in into it. uv's download cache is kept in a BuildKit cache mount.
RUN --mount=type=cache,target=/root/.cache/uv \
# Install uv
curl -LsSf https://astral.sh/uv/0.5.4/install.sh | sh \
&& . $HOME/.local/bin/env \
&& uv venv --no-python-downloads $VIRTUAL_ENV \
# this is sh, not bash, so . not source
&& . $VIRTUAL_ENV/bin/activate \
&& uv pip install uv pip -r /tmp/requirements.in
# making sure envvars are set in all shells
# (the build-time values of these variables are appended to /etc/environment)
RUN echo "PATH=\"$PATH\"" >> /etc/environment \
&& echo "LANG=\"$LANG\"" >> /etc/environment \
&& echo "LC_ALL=\"$LC_ALL\"" >> /etc/environment \
&& echo "LD_LIBRARY_PATH=\"$LD_LIBRARY_PATH\"" >> /etc/environment \
&& echo "EDITOR=\"$EDITOR\"" >> /etc/environment
# no startup command.

202
LICENSE Executable file
View File

@@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2024 Chai Discovery
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

169
README.md Executable file
View File

@@ -0,0 +1,169 @@
# Chai-1
Chai-1 is a multi-modal foundation model for molecular structure prediction that performs at the state-of-the-art across a variety of benchmarks. Chai-1 enables unified prediction of proteins, small molecules, DNA, RNA, glycosylations, and more.
<p align="center">
<img src='https://github.com/chaidiscovery/chai-lab/blob/main/assets/performance_barplot.png' >
</p>
For more information on the model's performance and capabilities, see our [technical report](https://www.biorxiv.org/content/10.1101/2024.10.10.615955).
## Installation
```shell
# version on pypi:
pip install chai_lab==0.5.2
# newest available version (updates daily to test features that weren't released yet):
pip install git+https://github.com/chaidiscovery/chai-lab.git
```
This Python package requires Linux, and a GPU with CUDA and bfloat16 support. We recommend using an A100 80GB or H100 80GB or L40S 48GB chip, but A10 and A30 will work for smaller complexes. Users have also reported success with consumer-grade RTX 4090.
## Running the model
### Command line inference
You can fold a FASTA file containing all the sequences (including modified residues, nucleotides, and ligands as SMILES strings) in a complex of interest by calling:
```shell
chai fold input.fasta output_folder
```
By default, the model generates five sample predictions, and uses embeddings without MSAs or templates. For additional information about how to supply MSAs and restraints to the model, see the documentation below, or run `chai fold --help`.
For example, to run the model with MSAs (which we recommend for improved performance), pass the `--use-msa-server` flag:
```shell
chai fold --use-msa-server input.fasta output_folder
```
If you are hosting your own ColabFold server, additionally pass the `--msa-server-url` flag with your server's URL:
```shell
chai fold --use-msa-server --msa-server-url "https://api.internalcolabserver.com" input.fasta output_folder
```
We also provide additional utility functions for tasks such as MSA file format conversion; see `chai --help` for details.
### Pythonic inference
The main entrypoint into the Chai-1 folding code is through the `chai_lab.chai1.run_inference` function. The following script demonstrates how to provide inputs to the model, and obtain a list of PDB files for downstream analysis:
```shell
python examples/predict_structure.py
```
To get the best performance, we recommend running the model with MSAs. The following script demonstrates how to provide MSAs to the model.
```shell
python examples/msas/predict_with_msas.py
```
For further instructions, see `"How can MSAs be provided to Chai-1?"` below.
<details>
<summary>Where are downloaded weights stored?</summary>
<p markdown="1">
By default, weights are automatically downloaded and stored in <package_root>/downloads (usually that's within site-packages).
In cases where you want to control the download location (e.g. on a mounted drive in Docker), you can use the CHAI_DOWNLOADS_DIR envvar to control the download location. For example:
```bash
CHAI_DOWNLOADS_DIR=/tmp/downloads python ./examples/predict_structure.py
```
</p>
</details>
<details>
<summary>How can MSAs be provided to Chai-1?</summary>
<p markdown="1">
Chai-1 supports MSAs provided as an `aligned.pqt` file. This file format is similar to an `a3m` file, but has additional columns that provide metadata like the source database and sequence pairing keys. We provide code to convert `a3m` files to `aligned.pqt` files. For more information on how to provide MSAs to Chai-1, see [this documentation](examples/msas/README.md).
For user convenience, we also support automatic MSA generation via the ColabFold [MMseqs2](https://github.com/soedinglab/MMseqs2) server via the `--use-msa-server` flag. As detailed in the ColabFold [repository](https://github.com/sokrypton/ColabFold), please keep in mind that this is a shared resource. Note that the results reported in our preprint and the webserver use a different MSA search strategy than MMseqs2, though we expect results to be broadly similar.
</p>
</details>
<details>
<summary>How can I customize the inputs to the model further?</summary>
<p markdown="1">
For more advanced use cases, we also expose the `chai_lab.chai1.run_folding_on_context`, which allows users to construct an `AllAtomFeatureContext` manually. This allows users to specify their own templates, MSAs, embeddings, and constraints, including support for specifying covalent bonds (for example, for specifying branched ligands). We currently provide examples of how to construct an embeddings context, an MSA context, restraint contexts, and covalent bonds. We will be releasing helper methods to build template contexts soon.
</p>
</details>
## ⚡ Try it online
We provide a [web server](https://lab.chaidiscovery.com) so you can test the Chai-1 model right from your browser, without any setup.
<p align="center">
<img src='assets/chailab_online_screenshot.png' height=400 >
</p>
## Using experimental restraints
Chai-1 uniquely offers the ability to fold complexes with user-specified "restraints" as inputs. These restraints specify inter-chain contacts or covalent bonds at various resolutions that are used to guide Chai-1 in folding the complex. See [restraints documentation](examples/restraints/README.md) and [covalent bond documentation](examples/covalent_bonds/README.md) for details.
<p align="center">
<img src='assets/chailab_restraints_screenshot.png' height=400 >
</p>
## 💬 Feedback
Found a 🐞? Please report it in GitHub [issues](https://github.com/chaidiscovery/chai-lab/issues).
We welcome community testing and feedback. To share observations about the model's performance, please reach out via [GitHub discussions](https://github.com/chaidiscovery/chai-lab/discussions), or [via email](mailto:feedback@chaidiscovery.com).
## 🛠️ Development
We use [devcontainers](https://code.visualstudio.com/docs/devcontainers/containers) in development, which helps us ensure we work in identical environments. We recommend working inside a devcontainer if you want to make a contribution to this repository.
Devcontainers work on a local Linux setup, and on remote machines over an SSH connection.
## Status
API is quite stable, but we recommend pinning the version in your requirements, i.e.:
```
chai_lab==0.5.2
```
## Citations
If you find Chai-1 useful in your research or use any structures produced by the model, we ask that you cite our technical report:
```
@article{Chai-1-Technical-Report,
title = {Chai-1: Decoding the molecular interactions of life},
author = {{Chai Discovery}},
year = 2024,
journal = {bioRxiv},
publisher = {Cold Spring Harbor Laboratory},
doi = {10.1101/2024.10.10.615955},
url = {https://www.biorxiv.org/content/early/2024/10/11/2024.10.10.615955},
elocation-id = {2024.10.10.615955},
eprint = {https://www.biorxiv.org/content/early/2024/10/11/2024.10.10.615955.full.pdf}
}
```
You can also access this information by running `chai citation`.
Additionally, if you use the automatic MMseqs2 MSA generation described above, please also cite:
```
@article{mirdita2022colabfold,
title={ColabFold: making protein folding accessible to all},
author={Mirdita, Milot and Sch{\"u}tze, Konstantin and Moriwaki, Yoshitaka and Heo, Lim and Ovchinnikov, Sergey and Steinegger, Martin},
journal={Nature methods},
year={2022},
}
```
## Licence
Chai-1 is released under an Apache 2.0 License (both code and model weights), which means it can be used for both academic and commercial purposes, including for drug discovery.
See [LICENSE](LICENSE).
To discuss partnership and access to new internal capabilities, reach us [via email](mailto:partnerships@chaidiscovery.com).

10
entrypoint.sh Executable file
View File

@@ -0,0 +1,10 @@
#!/bin/bash
# Container entry point: refuse to start without a usable NVIDIA GPU, then hand control
# to the command passed to the container.

# nvidia-smi must be on PATH at all ...
if ! command -v nvidia-smi > /dev/null 2>&1; then
    echo "Error: NVIDIA GPU is required but not found" >&2
    exit 1
fi

# ... and it must be able to enumerate at least one device. The binary being present does
# not by itself prove that a GPU is visible inside the container.
if ! nvidia-smi -L > /dev/null 2>&1; then
    echo "Error: NVIDIA GPU is required but nvidia-smi could not detect a usable device" >&2
    exit 1
fi

# Replace this shell with the requested command so it receives signals directly.
exec "$@"

14
input/.nextflow.log Executable file
View File

@@ -0,0 +1,14 @@
Jan-27 12:03:24.006 [main] DEBUG nextflow.cli.Launcher - $> nextflow run main.nf
Jan-27 12:03:24.041 [main] DEBUG nextflow.cli.CmdRun - N E X T F L O W ~ version 24.10.3
Jan-27 12:03:24.052 [main] DEBUG nextflow.plugin.PluginsFacade - Setting up plugin manager > mode=prod; embedded=false; plugins-dir=/root/.nextflow/plugins; core-plugins: nf-amazon@2.9.2,nf-azure@1.10.2,nf-cloudcache@0.4.2,nf-codecommit@0.2.2,nf-console@1.1.4,nf-google@1.15.3,nf-tower@1.9.3,nf-wave@1.7.4
Jan-27 12:03:24.067 [main] INFO o.pf4j.DefaultPluginStatusProvider - Enabled plugins: []
Jan-27 12:03:24.067 [main] INFO o.pf4j.DefaultPluginStatusProvider - Disabled plugins: []
Jan-27 12:03:24.069 [main] INFO org.pf4j.DefaultPluginManager - PF4J version 3.12.0 in 'deployment' mode
Jan-27 12:03:24.074 [main] INFO org.pf4j.AbstractPluginManager - No plugins
Jan-27 12:03:24.083 [main] DEBUG nextflow.scm.ProviderConfig - Using SCM config path: /root/.nextflow/scm
Jan-27 12:03:24.089 [main] DEBUG nextflow.cli.Launcher - Operation aborted
nextflow.exception.AbortOperationException: Cannot find script file: main.nf
at nextflow.cli.CmdRun.getScriptFile(CmdRun.groovy:536)
at nextflow.cli.CmdRun.run(CmdRun.groovy:325)
at nextflow.cli.Launcher.run(Launcher.groovy:503)
at nextflow.cli.Launcher.main(Launcher.groovy:658)

View File

@@ -0,0 +1,4 @@
>protein|name=growth-hormone
FPTIPLSRLFDNAMLRAHRLHQLAFDTYQEFEEAYIPKEQKYSFLQNPQTSLCFSESIPTPSNREETQQKSNLELLRISLLLIQSWLEPVQFLRSVFANSLVYGASDSNVYDLLKDLEEGIQTLMGRLEDGSPRTGQIFKQTYSKFDTNSHNDDALLKNYGLLYCFRKDMDKVETFLRIVQCRSVEGSCGF
>protein|name=growth-hormone-receptor
FSGSEATPGPLIFKWNHHSVFFDGYTSGGLQRFVHLHFGVSNKQLISICRKRANSKEPSSPIVPVPVGGQLLVDCSFRKLSGEGLHTYYYAAGQEEKTSDRSHRHGPGVGSCFRKTFEDGVYQCTARNEGYAYGHSITKSHRTSHQVCSRDGVPVLTENQAHLPEDFKEFTLRLKQKRQLLERGSPAMQDTFPAPSPETTVQEITSQHPGGTESPTVLRVKTEKSHQVYAGLSKYFHYAGQRGLRVLYLHKGESLARGTVTVPVKRDRGVLADRMVEAVDVQRWVGYLRNVYLTGQK

View File

@@ -0,0 +1,12 @@
# Restraints for growth hormone complex based on known binding interface
# Format: chain1 resid1 chain2 resid2 distance_lower distance_upper confidence
# Key interface contacts between growth hormone and its receptor
A 14 B 43 4.0 8.0 0.8
A 167 B 57 4.0 8.0 0.8
A 171 B 62 4.0 8.0 0.8
A 175 B 102 4.0 8.0 0.8
A 178 B 166 4.0 8.0 0.8
# Additional stabilizing contacts
A 65 B 150 4.0 9.0 0.7
A 164 B 191 4.0 9.0 0.7

6
input/insulin_complex.fasta Executable file
View File

@@ -0,0 +1,6 @@
>protein|name=insulin-a-chain
GIVEQCCTSICSLYQLENYCN
>protein|name=insulin-b-chain
FVNQHLCGSHLVEALYLVCGERGFFYTPKT
>protein|name=insulin-receptor-l1
PQAFVNWLRGGSQQVEVFVSDLPKLRNLLQGEELLGRGSFGVVYEGNARDIIKGEAETRVAVKTVNESASLRERIEFLNEASVMKGFTCHHVVRLLGVVSKGQPTLVVMELMAHGDLKSYLRSLRPEAENNPGRPPPTLQEMIQMAAEIADGMAYLNAKKFVHRDLAARNCMVAHDFTVKIGDFGMTRDIYETDYYRKGGKGLLPVRWMAPESLKDGVFTTSSDMWSFGVVLWEITSLAEQPYQGLSNEQVLKFVMDGGYLDQPDNCPERVTDLMRMCWQFNPKMRPTFLEIVNLLKDDLHPSFPEVSFFHSEENKAPESEELEMEFEDMENVPLDRSSHCQREEAGGRDGGSSLGFKRSYEEHIPYTHMNGGKKNGRILTLPRSNPS

10
input/insulin_restraints.txt Executable file
View File

@@ -0,0 +1,10 @@
# Restraints for insulin complex based on known interaction sites
# Format: chain1 resid1 chain2 resid2 distance_lower distance_upper confidence
# Insulin A chain to B chain contacts (disulfide bonds)
A 7 B 7 3.0 5.0 0.9
A 20 B 19 3.0 5.0 0.9
# Insulin (A+B) to receptor contacts
A 12 C 155 4.0 8.0 0.8
B 24 C 210 4.0 8.0 0.8
B 25 C 215 4.0 8.0 0.8

45
main.nf Executable file
View File

@@ -0,0 +1,45 @@
#!/usr/bin/env nextflow
// Chai-1 structure prediction: runs `chai fold` once for every *.fasta file in params.input_dir
// and publishes each prediction directory to params.outdir.
nextflow.enable.dsl=2
// Pipeline parameter defaults.
// Location that is searched for *.fasta inputs.
params.input_dir = 's3://omic/eureka/chai-lab/input'
// Location the per-input result directories are published to.
params.outdir = 's3://omic/eureka/chai-lab/output'
// When true, `chai fold` is called with --use-msa-server.
params.use_msa = true
// URL handed to --msa-server-url when use_msa is enabled.
params.msa_server = 'https://api.colabfold.com'
// Value handed to --num-diffn-samples.
params.num_samples = 5
// Fold one FASTA file with Chai-1.
//
// Input : fasta       - a single FASTA file describing the complex to fold.
// Output: output_dir  - a directory named after the FASTA file without its final extension
//                       (insulin_complex.fasta -> insulin_complex/), holding the `chai fold` results.
//
// The output directory name is derived from `fasta.baseName` both in the output declaration
// and in the script. Previously the declaration used `simpleName` (cuts at the FIRST dot)
// while the script used `basename ... .fasta` (cuts only the suffix), so an input such as
// `a.b.fasta` produced `a.b/` while Nextflow expected `a/` and failed the task.
//
// NOTE(review): nothing here requests a GPU (no `accelerator` / `containerOptions`), so GPU
// access presumably comes from the execution platform — confirm.
process CHAI1 {
    container 'harbor.cluster.omic.ai/omic/chai1:latest'
    publishDir params.outdir, mode: 'copy'
    stageInMode 'copy'
    // run one fold at a time
    maxForks 1

    input:
    path fasta

    output:
    path "${fasta.baseName}", emit: output_dir

    script:
    // Build the optional MSA flags in Groovy so the shell never has to evaluate
    // `params.use_msa` as a command; the URL is quoted to survive shell word splitting.
    def msa_options = params.use_msa ? "--use-msa-server --msa-server-url '${params.msa_server}'" : ''
    """
    OUTPUT_DIR='${fasta.baseName}'
    mkdir -p "\$OUTPUT_DIR"

    # Run CHAI1
    chai fold \\
        ${msa_options} \\
        --num-diffn-samples ${params.num_samples} \\
        '${fasta}' \\
        "\$OUTPUT_DIR"
    """
}
// Entry workflow: fold every *.fasta file found directly under params.input_dir.
workflow {
    fasta_ch = Channel
        .fromPath("${params.input_dir}/*.fasta")
        // Fail loudly instead of finishing "successfully" with zero tasks when the input
        // location is empty or mistyped.
        .ifEmpty { error "No FASTA files (*.fasta) found in ${params.input_dir}" }

    CHAI1(fasta_ch)
}

11
nextflow.config Executable file
View File

@@ -0,0 +1,11 @@
// Run every process inside its Docker container.
// NOTE(review): no docker.runOptions (e.g. GPU flags) are set here; GPU access presumably
// comes from the execution platform — confirm.
docker {
enabled = true
// 'auto' makes Nextflow create a fresh temporary directory for each container and mount it as /tmp
temp = 'auto'
}
// S3 access goes to a custom endpoint rather than AWS itself.
aws {
client {
endpoint = 'https://s3.cluster.omic.ai'
// address buckets as <endpoint>/<bucket>/... (path style) instead of <bucket>.<endpoint>
s3PathStyleAccess = true
}
}

83
params.json Executable file
View File

@@ -0,0 +1,83 @@
{
"params": {
"input_dir": {
"type": "folder",
"description": "Directory containing FASTA files and optional restraints",
"default": "s3://omic/eureka/chai-lab/input",
"required": true,
"pipeline_io": "input",
"var_name": "params.input_dir",
"examples": [
"s3://omic/eureka/chai-lab/input"
],
"pattern": ".*",
"enum": [],
"validation": {},
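"notes": "Directory containing the input FASTA files; the pipeline folds every file matching *.fasta (e.g. insulin_complex.fasta). Restraints files (*_restraints.txt) may be stored alongside them but are not currently passed to chai fold."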
},
"outdir": {
"type": "folder",
"description": "Directory for chai1 prediction results",
"default": "s3://omic/eureka/chai-lab/output",
"required": true,
"pipeline_io": "output",
"var_name": "params.outdir",
"examples": [
"s3://omic/eureka/chai-lab/output"
],
"pattern": ".*",
"enum": [],
"validation": {},
"notes": "Directory where prediction results and log files will be stored. Will be created if it doesn't exist."
},
"use_msa": {
"type": "boolean",
"description": "Enable/disable MSA server usage",
"default": true,
"required": false,
"pipeline_io": "parameter",
"var_name": "params.use_msa",
"examples": [
true,
false
],
"enum": [true, false],
"validation": {},
"notes": "Whether to use MSA server for improved predictions"
},
"msa_server": {
"type": "string",
"description": "MSA server URL",
"default": "https://api.colabfold.com",
"required": false,
"pipeline_io": "parameter",
"var_name": "params.msa_server",
"examples": [
"https://api.colabfold.com"
],
"pattern": "^https?://.*",
"enum": [],
"validation": {},
"notes": "URL of the MSA server to use when use_msa is enabled"
},
"num_samples": {
"type": "integer",
"description": "Number of diffusion samples to generate",
"default": 5,
"required": false,
"pipeline_io": "parameter",
"var_name": "params.num_samples",
"examples": [
5,
10
],
"pattern": "^[1-9]\\d*$",
"enum": [],
"validation": {
"min": 1,
"max": 50
},
"notes": "Number of structure samples to generate using diffusion"
}
}
}

74
pyproject.toml Executable file
View File

@@ -0,0 +1,74 @@
# important: install in editable mode
[build-system]
requires = [
"hatchling>=1.20", # build backend
"hatch-requirements-txt", # plugin, to parse requirements.txt
]
build-backend = "hatchling.build"
[project]
name = "chai_lab"
description = "Chai Discovery tools for AI + protein research."
requires-python = ">=3.10"
authors = [{ name = "Chai Discovery" }]
# see both defined below
dynamic = ["version", "dependencies"]
[tool.hatch.version]
path = "chai_lab/__init__.py"
[tool.hatch.metadata.hooks.requirements_txt]
files = ["requirements.in"]
[tool.hatch.metadata]
allow-direct-references = true
[tool.mypy]
check_untyped_defs = true
# Ignore missing imports for packages with missing type stubs
[[tool.mypy.overrides]]
module = [
"anarci.*",
"fsspec.*",
"google.*",
"joblib.*",
"needletail.*",
"numba.*",
"pyximport.*",
"rdkit.*",
"scipy.*",
"seaborn.*",
"sh.*",
"tmtools.*",
"botocore.*",
"s3fs.*",
"biotite.*",
"DockQ.*",
"boto3.*",
"transformers.*",
"modelcif.*",
"ihm.*",
]
ignore_missing_imports = true
[tool.pytest.ini_options]
cache_dir = "/tmp/.common_pytest_cache"
[tool.hatch.build.targets.sdist]
exclude = [
"/.devcontainer",
"/.github",
"/.idea",
"/.vscode",
"/.pytest_cache",
"/assets",
"/downloads",
"/outputs",
]
[tool.hatch.build.targets.wheel]
# should use packages from sdist section
[project.scripts]
chai = "chai_lab.main:cli"

51
requirements.in Executable file
View File

@@ -0,0 +1,51 @@
# dev-deps, still placed in the same requirements file
ruff==0.6.3 # in sync with pre-commit-hook
mypy
pytest
pre-commit
# types/stubs are required by mypy
pandas-stubs
types-pyyaml
types-tqdm
typing-extensions
types-requests
# CLI, administrator tools
typer~=0.12 # CLI generator
# pydantic~=2.5 # serialization/deserialization of configs
# notebooks, plotting
ipykernel~=6.27 # needed by vs code to run notebooks in devcontainer
# seaborn
matplotlib
# misc
tqdm~=4.66
# data import/export, application-specific
gemmi~=0.6.3 # pdb/mmcif parsing
rdkit==2023.9.5 # parsing of ligands. 2023.9.6 has broken type stubs
biopython>=1.83 # parsing, data access
antipickle==0.2.0 # save/load heterogeneous python structures
tmtools>=0.0.3 # Python bindings for the TM-align algorithm
modelcif>=1.0 # mmcif writing; confirmed to work with 1.0 (the latest release at the time of writing)
# commented out following optional dependencies for release on pypi
# dockq metric for comparing predicted pdbs and ground truth pdbs
# dockq @ git+https://github.com/bjornwallner/DockQ.git@v2.1.1
# pip-compatible minimized version of anarci
# anarci @ git+https://github.com/arogozhnikov/microANARCI@d81823395d0c3532d6e033d80b036b4aa4a4565e
# computing, dl
numpy~=1.21
pandas[parquet,gcp,aws]~=2.1
pandera
numba>=0.59
# polars
einops~=0.8
jaxtyping>=0.2.25 # versions <0.2.25 do not easily support runtime typechecking
beartype>=0.18 # compatible typechecker to use with jaxtyping
# do not use 2.2 because https://github.com/pytorch/pytorch/issues/122385
torch~=2.3.1
transformers~=4.44 # for esm inference

12
ruff.toml Executable file
View File

@@ -0,0 +1,12 @@
# move ruff cache outside of worktree
cache-dir = "/tmp/.ruff_chai_cache"
[lint]
extend-select = ["I"]
# jaxtyping requires disabling the following two errors
# https://docs.kidger.site/jaxtyping/faq/#flake8-or-ruff-are-throwing-an-error
ignore = ["F821", "F722"]
[lint.isort]
known-first-party = ["chai", "chai_lab"]

4
tests/__init__.py Executable file
View File

@@ -0,0 +1,4 @@
# Copyright (c) 2024 Chai Discovery, Inc.
# Licensed under the Apache License, Version 2.0.
# See the LICENSE file for details.

37
tests/example_inputs.py Executable file
View File

@@ -0,0 +1,37 @@
# Copyright (c) 2024 Chai Discovery, Inc.
# Licensed under the Apache License, Version 2.0.
# See the LICENSE file for details.
# Example inputs shared by the parsing tests, grouped by the entity type
# each string is expected to be recognised as.

# Ligands, written as SMILES strings.
example_ligands = [
    "C",
    "O",
    "C(C1C(C(C(C(O1)O)O)O)O)O",
    "[O-]S(=O)(=O)[O-]",
    "CC1=C(C(CCC1)(C)C)/C=C/C(=C/C=C/C(=C/C=O)/C)/C",
    # The next two SMILES contain a backslash (a bond-direction marker), so
    # they are raw strings: in a normal literal "\C" is an invalid escape
    # sequence and triggers a DeprecationWarning/SyntaxWarning.
    r"CCC1=C(c2cc3c(c(c4n3[Mg]56[n+]2c1cc7n5c8c(c9[n+]6c(c4)C(C9CCC(=O)OC/C=C(\C)/CCC[C@H](C)CCC[C@H](C)CCCC(C)C)C)[C@H](C(=O)c8c7C)C(=O)OC)C)C=C)C=O",
    r"C=CC1=C(C)/C2=C/c3c(C)c(CCC(=O)O)c4n3[Fe@TB16]35<-N2=C1/C=c1/c(C)c(C=C)/c(n13)=C/C1=N->5/C(=C\4)C(CCC(=O)O)=C1C",
    # different ions
    "[Mg+2]",
    "[Na+]",
    "[Cl-]",
]

# Protein sequences, including parenthesised multi-letter residue codes.
example_proteins = [
    "AGSHSMRYFSTSVSRPGRGEPRFIAVGYVDDTQFVR",
    "(KCJ)(SEP)(PPN)(B3S)(BAL)(PPN)K(NH2)",
    "XDHPX",
]

# RNA sequences (contain U rather than T).
example_rna = [
    "AGUGGCUA",
    "AAAAAA",
    "AGUC",
]

# DNA sequences (contain T rather than U).
example_dna = [
    "AGTGGCTA",
    "AAAAAA",
    "AGTC",
]

24
tests/test_cif_utils.py Executable file
View File

@@ -0,0 +1,24 @@
# Copyright (c) 2024 Chai Discovery, Inc.
# Licensed under the Apache License, Version 2.0.
# See the LICENSE file for details.
import pytest
from chai_lab.data.io.cif_utils import get_chain_letter
def test_get_chain_letter():
    """Check the index -> chain-letter mapping at its boundaries."""
    # Index 0 is rejected with an assertion error.
    with pytest.raises(AssertionError):
        get_chain_letter(0)

    # For one-letter codes, there are 26 + 26 = 52 codes (upper then lower case).
    # For two-letter codes, there are 52 * 52 codes, so the last one is "zz".
    expected_by_index = {
        1: "A",
        26: "Z",
        27: "a",
        52: "z",
        53: "AA",
        54: "AB",
        52 * 52 + 52: "zz",
    }
    for index, letter in expected_by_index.items():
        assert get_chain_letter(index) == letter

108
tests/test_glycans.py Executable file
View File

@@ -0,0 +1,108 @@
# Copyright (c) 2024 Chai Discovery, Inc.
# Licensed under the Apache License, Version 2.0.
# See the LICENSE file for details.
from collections import Counter
from pathlib import Path
from tempfile import TemporaryDirectory
import pytest
from chai_lab.chai1 import make_all_atom_feature_context
from chai_lab.data.parsing.glycans import _glycan_string_to_sugars_and_bonds
@pytest.mark.parametrize("ccd_code", ["MAN", "99K", "FUC"])
def test_parsing_ccd_codes(ccd_code: str):
    """A lone CCD code should parse into exactly one sugar."""
    sugars, _bonds = _glycan_string_to_sugars_and_bonds(ccd_code)
    assert len(sugars) == 1
def test_complex_parsing():
    """Parse a branched five-sugar glycan and check each of its four bonds."""
    glycan = "MAN(6-1 FUC)(4-1 MAN(6-1 MAN(6-1 MAN)))".replace(" ", "")
    sugars, bonds = _glycan_string_to_sugars_and_bonds(glycan)
    assert len(sugars) == 5

    # Unpacking also enforces that exactly four bonds were produced.
    first, second, third, fourth = bonds

    # (bond, src_sugar_index, dst_sugar_index, src_atom, dst_atom)
    expectations = [
        (first, 0, 1, 6, 1),
        (second, 0, 2, 4, 1),
        (third, 2, 3, 6, 1),
        (fourth, 3, 4, 6, 1),
    ]
    for bond, src_sugar, dst_sugar, src_atom, dst_atom in expectations:
        assert bond.src_sugar_index == src_sugar
        assert bond.dst_sugar_index == dst_sugar
        assert bond.src_atom == src_atom
        assert bond.dst_atom == dst_atom
def test_complex_parsing_2():
    """Parse a nine-sugar glycan and check which sugars each bond connects."""
    glycan = "MAN(4-1 FUC(4-1 MAN)(6-1 FUC(4-1 MAN)))(6-1 MAN(6-1 MAN(4-1 MAN)(6-1 FUC)))".replace(
        " ", ""
    )
    sugars, bonds = _glycan_string_to_sugars_and_bonds(glycan)
    assert len(sugars) == 9

    # (src_sugar_index, dst_sugar_index) for every bond, in the order returned.
    topology = [(0, 1), (1, 2), (1, 3), (3, 4), (0, 5), (5, 6), (6, 7), (6, 8)]
    # strict=True makes a bond-count mismatch an error instead of a silent pass.
    for (src_index, dst_index), bond in zip(topology, bonds, strict=True):
        assert bond.src_sugar_index == src_index
        assert bond.dst_sugar_index == dst_index
def test_glycan_tokenization_with_bond():
    """Test that tokenization works, and that atoms are dropped as expected."""
    fasta_text = ">glycan|foo\nNAG(4-1 NAG)\n"
    with TemporaryDirectory() as scratch:
        scratch_dir = Path(scratch)
        fasta_path = scratch_dir / "input.fasta"
        fasta_path.write_text(fasta_text)
        feature_context = make_all_atom_feature_context(
            fasta_path,
            output_dir=scratch_dir / "out",
            use_esm_embeddings=False,  # Just a test; no need
        )

    structure = feature_context.structure_context

    # Each NAG component is C8 H15 N O6 -> 8 + 1 + 6 = 15 heavy atoms.
    # The bond between them displaces one oxygen, leaving 2 * 15 - 1 = 29 atoms.
    assert structure.atom_exists_mask.sum() == 29
    # All atoms were originally constructed, and the leaving atom was then
    # dropped, so the mask itself still has 2 * 15 = 30 entries.
    assert structure.atom_exists_mask.numel() == 30

    present_elements = structure.atom_ref_element[structure.atom_exists_mask]
    element_counts = Counter(present_elements.tolist())
    assert element_counts[6] == 16  # 6 = Carbon
    assert element_counts[7] == 2  # 7 = Nitrogen
    assert element_counts[8] == 11  # 8 = Oxygen

    # Single bond feature between O and C
    src_atoms, dst_atoms = structure.atom_covalent_bond_indices
    assert src_atoms.numel() == dst_atoms.numel() == 1
    bonded_elements = {
        structure.atom_ref_element[src_atoms].item(),
        structure.atom_ref_element[dst_atoms].item(),
    }
    assert bonded_elements == {8, 6}

106
tests/test_inference_dataset.py Executable file
View File

@@ -0,0 +1,106 @@
# Copyright (c) 2024 Chai Discovery, Inc.
# Licensed under the Apache License, Version 2.0.
# See the LICENSE file for details.
"""
Tests for inference dataset.
"""
import pytest
import torch
from chai_lab.data.dataset.inference_dataset import Input, load_chains_from_raw
from chai_lab.data.dataset.structure.all_atom_residue_tokenizer import (
AllAtomResidueTokenizer,
)
from chai_lab.data.dataset.structure.all_atom_structure_context import (
AllAtomStructureContext,
)
from chai_lab.data.dataset.structure.chain import Chain
from chai_lab.data.parsing.structure.entity_type import EntityType
from chai_lab.data.sources.rdkit import RefConformerGenerator
@pytest.fixture
def tokenizer() -> AllAtomResidueTokenizer:
    """Provide a residue tokenizer built on a fresh RefConformerGenerator."""
    conformer_generator = RefConformerGenerator()
    return AllAtomResidueTokenizer(conformer_generator)
def test_malformed_smiles(tokenizer: AllAtomResidueTokenizer):
    """Malformed SMILES should be dropped."""
    protein = EntityType.PROTEIN.value
    ligand = EntityType.LIGAND.value
    raw_inputs = [
        Input("RKDESES", entity_type=protein, entity_name="foo"),
        # Zn ligand is malformed (should be [Zn+2])
        Input("Zn", entity_type=ligand, entity_name="bar"),
        Input("RKEEE", entity_type=protein, entity_name="baz"),
        Input("EEEEEEEEEEEE", entity_type=protein, entity_name="boz"),
    ]

    loaded = load_chains_from_raw(raw_inputs, identifier="test", tokenizer=tokenizer)

    # Four inputs went in; only three chains are expected back.
    assert len(loaded) == 3
    for loaded_chain in loaded:
        # NOTE this check is only valid because there are no residues that are
        # tokenized per-atom. It ensures that the entity data and the structure
        # context in each chain are paired correctly.
        token_count = loaded_chain.structure_context.num_tokens
        assert token_count == len(loaded_chain.entity_data.full_sequence)
def test_ions_parsing(tokenizer: AllAtomResidueTokenizer):
    """Ions as SMILES strings should carry the correct charge."""
    magnesium = Input(
        "[Mg+2]", entity_type=EntityType.LIGAND.value, entity_name="foo"
    )
    chains = load_chains_from_raw([magnesium], identifier="foo", tokenizer=tokenizer)
    assert len(chains) == 1

    ion_context = chains[0].structure_context
    assert ion_context.num_atoms == 1
    assert ion_context.atom_ref_charge == 2
    assert ion_context.atom_ref_element.item() == 12  # 12 = Magnesium
def test_protein_with_smiles(tokenizer: AllAtomResidueTokenizer):
    """Complex with multiple duplicated protein chains and SMILES ligands."""
    # Based on https://www.rcsb.org/structure/1AFS
    seq = "MDSISLRVALNDGNFIPVLGFGTTVPEKVAKDEVIKATKIAIDNGFRHFDSAYLYEVEEEVGQAIRSKIEDGTVKREDIFYTSKLWSTFHRPELVRTCLEKTLKSTQLDYVDLYIIHFPMALQPGDIFFPRDEHGKLLFETVDICDTWEAMEKCKDAGLAKSIGVSNFNCRQLERILNKPGLKYKPVCNQVECHLYLNQSKMLDYCKSKDIILVSYCTLGSSRDKTWVDQKSPVLLDDPVLCAIAKKYKQTPALVALRYQLQRGVVPLIRSFNAKRIKELTQVFEFQLASEDMKALDGLNRNFRYNNAKYFDDHPNHPFTDEN"
    nap = "NC(=O)c1ccc[n+](c1)[CH]2O[CH](CO[P]([O-])(=O)O[P](O)(=O)OC[CH]3O[CH]([CH](O[P](O)(O)=O)[CH]3O)n4cnc5c(N)ncnc45)[CH](O)[CH]2O"
    tes = "O=C4C=C3C(C2CCC1(C(CCC1O)C2CC3)C)(C)CC4"

    # Every sequence appears twice, under two different entity names.
    specs = [
        (seq, EntityType.PROTEIN, "A"),
        (seq, EntityType.PROTEIN, "B"),
        (nap, EntityType.LIGAND, "C"),
        (nap, EntityType.LIGAND, "D"),
        (tes, EntityType.LIGAND, "E"),
        (tes, EntityType.LIGAND, "F"),
    ]
    inputs = [
        Input(sequence, kind.value, entity_name=name) for sequence, kind, name in specs
    ]

    chains: list[Chain] = load_chains_from_raw(inputs, tokenizer=tokenizer)
    assert len(chains) == len(inputs)

    contexts = [chain.structure_context for chain in chains]
    example = AllAtomStructureContext.merge(contexts)

    # Three distinct entities (one protein, two ligands) across six asym ids.
    assert example.token_entity_id.unique().numel() == 3
    assert example.token_asym_id.unique().numel() == 6

    is_protein = example.token_entity_type == EntityType.PROTEIN.value
    is_ligand = example.token_entity_type == EntityType.LIGAND.value

    # Check protein chains
    assert torch.unique(example.token_entity_id[is_protein]).numel() == 1
    # Two copies of this chain
    assert torch.unique(example.token_sym_id[is_protein]).numel() == 2

    # Check ligand chains
    assert torch.unique(example.token_entity_id[is_ligand]).numel() == 2
    # Two copies of each ligand
    assert torch.unique(example.token_sym_id[is_ligand]).numel() == 2

View File

@@ -0,0 +1,36 @@
# Copyright (c) 2024 Chai Discovery, Inc.
# Licensed under the Apache License, Version 2.0.
# See the LICENSE file for details.
"""
Test for tokenization
"""
import numpy as np
from chai_lab.data.parsing.msas.a3m import tokenize_sequences_to_arrays
from chai_lab.data.residue_constants import residue_types_with_nucleotides_order
def test_tokenization_basic():
    """A single sequence tokenizes to the indices from the residue-type table."""
    query = "RKDES"
    tokens, deletions = tokenize_sequences_to_arrays([query])

    # One row for the one sequence, one column per residue.
    assert tokens.shape == deletions.shape == (1, 5)

    expected_tokens = np.array(
        [residue_types_with_nucleotides_order[residue] for residue in query]
    )
    assert np.all(tokens == expected_tokens)
def test_tokenization_with_insertion():
    """Insertions (lower case) should be ignored."""
    plain = "RKDES"
    # Same sequence with five lower-case characters inserted before the "D".
    with_insertion = "RKrkdesDES"

    tokens, deletions = tokenize_sequences_to_arrays([plain, with_insertion])

    assert tokens.shape == deletions.shape == (2, 5)
    # Both rows tokenize identically once the insertion is skipped.
    assert np.all(tokens[0] == tokens[1])
    # The only non-zero deletion count is 5, in the second row at column 2.
    assert deletions.sum() == 5
    assert deletions[1, 2] == 5

25
tests/test_msa_preprocess.py Executable file
View File

@@ -0,0 +1,25 @@
# Copyright (c) 2024 Chai Discovery, Inc.
# Licensed under the Apache License, Version 2.0.
# See the LICENSE file for details.
import torch
from chai_lab.data.dataset.msas.msa_context import NO_PAIRING_KEY
from chai_lab.data.dataset.msas.preprocess import _UKEY_FOR_QUERY, prepair_ukey
def test_prepair_ukey():
    """Check prepair_ukey output for ascending and for reversed edit distances."""
    keys = torch.tensor([1, 1, 2, 1, NO_PAIRING_KEY, 2, 3])
    edit_dists = torch.arange(len(keys))

    def rows_with_pairing_key() -> set[int]:
        # Row indices whose key is anything other than NO_PAIRING_KEY.
        return {
            row for row, key in enumerate(keys.tolist()) if key != NO_PAIRING_KEY
        }

    paired = prepair_ukey(keys, edit_dists)
    expected_order = [_UKEY_FOR_QUERY, (1, 0), (2, 0), (1, 1), (2, 1), (3, 0)]
    assert list(paired) == expected_order
    assert set(paired.values()) == rows_with_pairing_key()

    # Reverse the edit distances
    reversed_dists = torch.tensor(edit_dists.tolist()[::-1])
    paired = prepair_ukey(keys, reversed_dists)
    expected_order = [_UKEY_FOR_QUERY, (1, 1), (2, 1), (1, 0), (2, 0), (3, 0)]
    assert list(paired) == expected_order
    assert set(paired.values()) == rows_with_pairing_key()

79
tests/test_parsing.py Executable file
View File

@@ -0,0 +1,79 @@
# Copyright (c) 2024 Chai Discovery, Inc.
# Licensed under the Apache License, Version 2.0.
# See the LICENSE file for details.
from pathlib import Path
from tempfile import TemporaryDirectory
from chai_lab.data.parsing.fasta import read_fasta
from chai_lab.data.parsing.input_validation import (
constituents_of_modified_fasta,
identify_potential_entity_types,
)
from chai_lab.data.parsing.structure.entity_type import EntityType
from .example_inputs import example_dna, example_ligands, example_proteins, example_rna
def test_simple_protein_fasta():
    """A plain protein sequence splits into one constituent per residue."""
    parts = constituents_of_modified_fasta("RKDES")
    assert parts is not None
    # Compare whole lists: zip() stops at the shorter input, so an element-wise
    # all(...) over zip would also pass for an empty or truncated result.
    assert list(parts) == ["R", "K", "D", "E", "S"]
def test_modified_protein_fasta():
    """Parenthesised multi-letter codes each count as a single constituent."""
    parts = constituents_of_modified_fasta("(KCJ)(SEP)(PPN)(B3S)(BAL)(PPN)KX(NH2)")
    assert parts is not None
    expected = ["KCJ", "SEP", "PPN", "B3S", "BAL", "PPN", "K", "X", "NH2"]
    # Compare whole lists: zip() stops at the shorter input, so an element-wise
    # all(...) over zip would also pass for an empty or truncated result.
    assert list(parts) == expected
def test_rna_fasta():
    """An RNA sequence splits into one constituent per nucleotide."""
    seq = "ACUGACG"
    parts = constituents_of_modified_fasta(seq)
    assert parts is not None
    # Compare whole lists: zip() stops at the shorter input, so an element-wise
    # all(...) over zip would also pass for an empty or truncated result.
    assert list(parts) == list(seq)
def test_dna_fasta():
    """A DNA sequence splits into one constituent per nucleotide."""
    seq = "ACGACTAGCAT"
    parts = constituents_of_modified_fasta(seq)
    assert parts is not None
    # Compare whole lists: zip() stops at the shorter input, so an element-wise
    # all(...) over zip would also pass for an empty or truncated result.
    assert list(parts) == list(seq)
def test_parsing():
    """Every example input must list its own entity type among the candidates."""
    examples_by_type = [
        (EntityType.LIGAND, example_ligands),
        (EntityType.PROTEIN, example_proteins),
        (EntityType.DNA, example_dna),
        (EntityType.RNA, example_rna),
    ]
    for expected_type, examples in examples_by_type:
        for example in examples:
            # Membership, not equality: a string may match several types.
            assert expected_type in identify_potential_entity_types(example)
def test_fasta_parsing():
    """A two-record FASTA file yields both headers and sequences in order."""
    test_string = """>foo\nRKDES\n>bar\nKEDESRRR"""
    with TemporaryDirectory() as scratch:
        fasta_path = Path(scratch) / "test.fasta"
        fasta_path.write_text(test_string)
        records = read_fasta(fasta_path)

    assert len(records) == 2
    first, second = records[0], records[1]
    assert first.header == "foo"
    assert first.sequence == "RKDES"
    assert second.header == "bar"
    assert second.sequence == "KEDESRRR"
def test_smiles_parsing():
    """A FASTA file holding a single SMILES entry is read as one record."""
    fasta_text = ">smiles\nCc1cc2nc3c(=O)[nH]c(=O)nc-3n(C[C@H](O)[C@H](O)[C@H](O)CO)c2cc1C"
    with TemporaryDirectory() as scratch:
        fasta_path = Path(scratch) / "test.fasta"
        fasta_path.write_text(fasta_text)
        records = read_fasta(fasta_path)

    assert len(records) == 1

24
tests/test_rdkit.py Executable file
View File

@@ -0,0 +1,24 @@
# Copyright (c) 2024 Chai Discovery, Inc.
# Licensed under the Apache License, Version 2.0.
# See the LICENSE file for details.
from chai_lab.data.sources.rdkit import RefConformerGenerator
def test_ref_conformer_from_smiles():
    """Test ref conformer generation from SMILES."""
    smiles = "Cc1cc2nc3c(=O)[nH]c(=O)nc-3n(C[C@H](O)[C@H](O)[C@H](O)CO)c2cc1C"
    generator = RefConformerGenerator()
    conformer = generator.generate(smiles)

    # Atom names must be unique: as many distinct names as there are atoms.
    distinct_names = set(conformer.atom_names)
    assert len(distinct_names) == conformer.num_atoms
def test_ref_conformer_glycan_ccd():
    """Ref conformer from CCD code for a sugar ring."""
    generator = RefConformerGenerator()
    conformer = generator.get("MAN")
    assert conformer is not None

    # Atom names must be unique: as many distinct names as there are atoms.
    distinct_names = set(conformer.atom_names)
    assert len(distinct_names) == conformer.num_atoms