Initial commit: Chai-1 protein structure prediction pipeline for WES
- Nextflow pipeline using chai1 Docker image from Harbor - S3-based input/output paths (s3://omic/eureka/chai-lab/) - GPU-accelerated protein folding with MSA support Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
output/
|
||||||
|
work/
|
||||||
|
.nextflow/
|
||||||
|
.nextflow.log*
|
||||||
56
Dockerfile
Executable file
56
Dockerfile
Executable file
@@ -0,0 +1,56 @@
|
|||||||
|
# Use NVIDIA CUDA base image with Ubuntu 22.04
|
||||||
|
FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04
|
||||||
|
|
||||||
|
# Set environment variables
|
||||||
|
ENV DEBIAN_FRONTEND=noninteractive \
|
||||||
|
LANG=C.UTF-8 \
|
||||||
|
LC_ALL=C.UTF-8 \
|
||||||
|
PYTHONUNBUFFERED=TRUE \
|
||||||
|
PYTHONFAULTHANDLER=1 \
|
||||||
|
PYTHONPYCACHEPREFIX='/tmp/.chai_pycache' \
|
||||||
|
MYPY_CACHE_DIR='/tmp/.chai_mypy_cache'
|
||||||
|
|
||||||
|
# System packages: the Python 3.10 interpreter and headers, pip, a compiler
# toolchain, CA certificates, and the fetch tools (curl, git, wget).
# The apt package lists are deleted in the same layer that created them.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        curl \
        git \
        python3-pip \
        python3.10 \
        python3.10-dev \
        wget \
    && rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# Set working directory
|
||||||
|
WORKDIR /workspace
|
||||||
|
|
||||||
|
# Upgrade pip
|
||||||
|
RUN pip3 install --no-cache-dir --upgrade pip setuptools wheel
|
||||||
|
|
||||||
|
# Install chai_lab first (this pulls in an older PyTorch), then replace that
# PyTorch with a 2.6+ build from PyPI.
# The three steps share a single RUN on purpose: files removed in a later
# layer still occupy space in the earlier one, so uninstalling in the same
# layer keeps the superseded PyTorch wheels out of the final image.
# The "torch>=2.6" specifier makes pip enforce the minimum version that the
# verification step later in this file asserts.
RUN pip3 install --no-cache-dir chai_lab==0.5.2 \
    && pip3 uninstall -y torch torchvision torchaudio \
    && pip3 install --no-cache-dir "torch>=2.6" torchvision torchaudio
|
||||||
|
|
||||||
|
# Upgrade transformers to ensure compatibility
|
||||||
|
RUN pip3 install --no-cache-dir --upgrade "transformers>=4.30.0"
|
||||||
|
|
||||||
|
# Verify all installations
|
||||||
|
RUN python3 -c "import torch; v=torch.__version__.split('+')[0]; print(f'PyTorch: {v}'); major,minor=map(int,v.split('.')[:2]); assert (major==2 and minor>=6) or major>2, f'PyTorch {v} is too old, need 2.6+'" && \
|
||||||
|
python3 -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')" && \
|
||||||
|
python3 -c "from transformers import EsmModel; print('transformers: OK')" && \
|
||||||
|
python3 -c "import typer; print('typer: OK')" && \
|
||||||
|
python3 -c "import chai_lab; print('chai_lab: OK')" && \
|
||||||
|
chai --help
|
||||||
|
|
||||||
|
# Add entry point script
|
||||||
|
COPY entrypoint.sh /workspace/
|
||||||
|
RUN chmod +x /workspace/entrypoint.sh
|
||||||
|
|
||||||
|
# Set entry point
|
||||||
|
ENTRYPOINT ["/workspace/entrypoint.sh"]
|
||||||
83
Dockerfile.chailab
Executable file
83
Dockerfile.chailab
Executable file
@@ -0,0 +1,83 @@
|
|||||||
|
FROM ubuntu:22.04 AS chailab-baseimage
|
||||||
|
|
||||||
|
ENV \
|
||||||
|
LANG=C.UTF-8 \
|
||||||
|
LC_ALL=C.UTF-8 \
|
||||||
|
# config for apt
|
||||||
|
DEBIAN_FRONTEND=noninteractive \
|
||||||
|
# default editor for git cli
|
||||||
|
EDITOR=vim \
|
||||||
|
# keep (large) mypy cache outside of working tree
|
||||||
|
MYPY_CACHE_DIR='/tmp/.chai_lab_mypy_cache' \
|
||||||
|
# always flush output from python
|
||||||
|
PYTHONUNBUFFERED=TRUE \
|
||||||
|
# enable fault handler (print tracebacks even after segfault or NCCL errors).
|
||||||
|
PYTHONFAULTHANDLER=1 \
|
||||||
|
# keep __pycache__ out of working tree
|
||||||
|
PYTHONPYCACHEPREFIX='/tmp/.chai_lab_pycache'
|
||||||
|
|
||||||
|
|
||||||
|
RUN --mount=type=cache,target=/var/cache/apt \
|
||||||
|
apt-get -qq update \
|
||||||
|
&& apt-get -qq install -y \
|
||||||
|
# common things
|
||||||
|
gnupg ca-certificates wget git curl aria2 lsb-release tzdata \
|
||||||
|
rsync sudo tree htop tmux unzip \
|
||||||
|
clang \
|
||||||
|
# for direct ssh into container
|
||||||
|
openssh-server socat \
|
||||||
|
# provides `fuser` command
|
||||||
|
psmisc \
|
||||||
|
# RDMA/InfiniBand
|
||||||
|
libibverbs1 librdmacm1 \
|
||||||
|
# text editors, needed by git cli
|
||||||
|
nano vim \
|
||||||
|
build-essential libstdc++6 \
|
||||||
|
# python
|
||||||
|
python3.10 python3.10-dev \
|
||||||
|
# (run continues)
|
||||||
|
# stop git from complaining about dubious ownership.
|
||||||
|
&& git config --global --add safe.directory "*" \
|
||||||
|
#
|
||||||
|
# cuda softlinking is needed in podman, but not docker
|
||||||
|
&& ln -s /lib/x86_64-linux-gnu/libcuda.so.1 /lib/x86_64-linux-gnu/libcuda.so \
|
||||||
|
&& ldconfig /lib/x86_64-linux-gnu/ \
|
||||||
|
# setup timezone, to $TZ, ubuntu-specific
|
||||||
|
# && ln -fs /usr/share/zoneinfo/$TZ /etc/localtime \
|
||||||
|
&& dpkg-reconfigure --frontend noninteractive tzdata \
|
||||||
|
# change default shell to bash (has no effect during building)
|
||||||
|
&& chsh -s /bin/bash
|
||||||
|
|
||||||
|
|
||||||
|
ENV \
|
||||||
|
# expose CUDA libraries. Now that we don't build anything this is likely redundant
|
||||||
|
LD_LIBRARY_PATH="/usr/local/cuda/lib64/stubs/:${LD_LIBRARY_PATH:-}" \
|
||||||
|
# Set uv timeout to larger value to account for slow download time of nvidia-cudnn-cu12
|
||||||
|
UV_HTTP_TIMEOUT=1000 \
|
||||||
|
# where virtual env will be installed
|
||||||
|
VIRTUAL_ENV=/opt/venv
|
||||||
|
|
||||||
|
# Install dependencies in virtualenv
|
||||||
|
COPY ./requirements.in /tmp/requirements.in
|
||||||
|
# from https://pythonspeed.com/articles/activate-virtualenv-dockerfile/
|
||||||
|
# a trick to have virtualenv "always activated"
|
||||||
|
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
|
||||||
|
|
||||||
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
|
# Install uv
|
||||||
|
# Download the pinned installer to a file before running it: with the default
# /bin/sh there is no pipefail, so `curl | sh` would hide a failed download.
curl -LsSf https://astral.sh/uv/0.5.4/install.sh -o /tmp/uv-install.sh \
&& sh /tmp/uv-install.sh \
&& rm -f /tmp/uv-install.sh \
|
||||||
|
&& . $HOME/.local/bin/env \
|
||||||
|
&& uv venv --no-python-downloads $VIRTUAL_ENV \
|
||||||
|
# this is sh, not bash, so . not source
|
||||||
|
&& . $VIRTUAL_ENV/bin/activate \
|
||||||
|
&& uv pip install uv pip -r /tmp/requirements.in
|
||||||
|
|
||||||
|
|
||||||
|
# making sure envvars are set in all shells
|
||||||
|
RUN echo "PATH=\"$PATH\"" >> /etc/environment \
|
||||||
|
&& echo "LANG=\"$LANG\"" >> /etc/environment \
|
||||||
|
&& echo "LC_ALL=\"$LC_ALL\"" >> /etc/environment \
|
||||||
|
&& echo "LD_LIBRARY_PATH=\"$LD_LIBRARY_PATH\"" >> /etc/environment \
|
||||||
|
&& echo "EDITOR=\"$EDITOR\"" >> /etc/environment
|
||||||
|
|
||||||
|
# no startup command.
|
||||||
202
LICENSE
Executable file
202
LICENSE
Executable file
@@ -0,0 +1,202 @@
|
|||||||
|
|
||||||
|
Apache License
|
||||||
|
Version 2.0, January 2004
|
||||||
|
http://www.apache.org/licenses/
|
||||||
|
|
||||||
|
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||||
|
|
||||||
|
1. Definitions.
|
||||||
|
|
||||||
|
"License" shall mean the terms and conditions for use, reproduction,
|
||||||
|
and distribution as defined by Sections 1 through 9 of this document.
|
||||||
|
|
||||||
|
"Licensor" shall mean the copyright owner or entity authorized by
|
||||||
|
the copyright owner that is granting the License.
|
||||||
|
|
||||||
|
"Legal Entity" shall mean the union of the acting entity and all
|
||||||
|
other entities that control, are controlled by, or are under common
|
||||||
|
control with that entity. For the purposes of this definition,
|
||||||
|
"control" means (i) the power, direct or indirect, to cause the
|
||||||
|
direction or management of such entity, whether by contract or
|
||||||
|
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||||
|
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||||
|
|
||||||
|
"You" (or "Your") shall mean an individual or Legal Entity
|
||||||
|
exercising permissions granted by this License.
|
||||||
|
|
||||||
|
"Source" form shall mean the preferred form for making modifications,
|
||||||
|
including but not limited to software source code, documentation
|
||||||
|
source, and configuration files.
|
||||||
|
|
||||||
|
"Object" form shall mean any form resulting from mechanical
|
||||||
|
transformation or translation of a Source form, including but
|
||||||
|
not limited to compiled object code, generated documentation,
|
||||||
|
and conversions to other media types.
|
||||||
|
|
||||||
|
"Work" shall mean the work of authorship, whether in Source or
|
||||||
|
Object form, made available under the License, as indicated by a
|
||||||
|
copyright notice that is included in or attached to the work
|
||||||
|
(an example is provided in the Appendix below).
|
||||||
|
|
||||||
|
"Derivative Works" shall mean any work, whether in Source or Object
|
||||||
|
form, that is based on (or derived from) the Work and for which the
|
||||||
|
editorial revisions, annotations, elaborations, or other modifications
|
||||||
|
represent, as a whole, an original work of authorship. For the purposes
|
||||||
|
of this License, Derivative Works shall not include works that remain
|
||||||
|
separable from, or merely link (or bind by name) to the interfaces of,
|
||||||
|
the Work and Derivative Works thereof.
|
||||||
|
|
||||||
|
"Contribution" shall mean any work of authorship, including
|
||||||
|
the original version of the Work and any modifications or additions
|
||||||
|
to that Work or Derivative Works thereof, that is intentionally
|
||||||
|
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||||
|
or by an individual or Legal Entity authorized to submit on behalf of
|
||||||
|
the copyright owner. For the purposes of this definition, "submitted"
|
||||||
|
means any form of electronic, verbal, or written communication sent
|
||||||
|
to the Licensor or its representatives, including but not limited to
|
||||||
|
communication on electronic mailing lists, source code control systems,
|
||||||
|
and issue tracking systems that are managed by, or on behalf of, the
|
||||||
|
Licensor for the purpose of discussing and improving the Work, but
|
||||||
|
excluding communication that is conspicuously marked or otherwise
|
||||||
|
designated in writing by the copyright owner as "Not a Contribution."
|
||||||
|
|
||||||
|
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||||
|
on behalf of whom a Contribution has been received by Licensor and
|
||||||
|
subsequently incorporated within the Work.
|
||||||
|
|
||||||
|
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||||
|
this License, each Contributor hereby grants to You a perpetual,
|
||||||
|
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||||
|
copyright license to reproduce, prepare Derivative Works of,
|
||||||
|
publicly display, publicly perform, sublicense, and distribute the
|
||||||
|
Work and such Derivative Works in Source or Object form.
|
||||||
|
|
||||||
|
3. Grant of Patent License. Subject to the terms and conditions of
|
||||||
|
this License, each Contributor hereby grants to You a perpetual,
|
||||||
|
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||||
|
(except as stated in this section) patent license to make, have made,
|
||||||
|
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||||
|
where such license applies only to those patent claims licensable
|
||||||
|
by such Contributor that are necessarily infringed by their
|
||||||
|
Contribution(s) alone or by combination of their Contribution(s)
|
||||||
|
with the Work to which such Contribution(s) was submitted. If You
|
||||||
|
institute patent litigation against any entity (including a
|
||||||
|
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||||
|
or a Contribution incorporated within the Work constitutes direct
|
||||||
|
or contributory patent infringement, then any patent licenses
|
||||||
|
granted to You under this License for that Work shall terminate
|
||||||
|
as of the date such litigation is filed.
|
||||||
|
|
||||||
|
4. Redistribution. You may reproduce and distribute copies of the
|
||||||
|
Work or Derivative Works thereof in any medium, with or without
|
||||||
|
modifications, and in Source or Object form, provided that You
|
||||||
|
meet the following conditions:
|
||||||
|
|
||||||
|
(a) You must give any other recipients of the Work or
|
||||||
|
Derivative Works a copy of this License; and
|
||||||
|
|
||||||
|
(b) You must cause any modified files to carry prominent notices
|
||||||
|
stating that You changed the files; and
|
||||||
|
|
||||||
|
(c) You must retain, in the Source form of any Derivative Works
|
||||||
|
that You distribute, all copyright, patent, trademark, and
|
||||||
|
attribution notices from the Source form of the Work,
|
||||||
|
excluding those notices that do not pertain to any part of
|
||||||
|
the Derivative Works; and
|
||||||
|
|
||||||
|
(d) If the Work includes a "NOTICE" text file as part of its
|
||||||
|
distribution, then any Derivative Works that You distribute must
|
||||||
|
include a readable copy of the attribution notices contained
|
||||||
|
within such NOTICE file, excluding those notices that do not
|
||||||
|
pertain to any part of the Derivative Works, in at least one
|
||||||
|
of the following places: within a NOTICE text file distributed
|
||||||
|
as part of the Derivative Works; within the Source form or
|
||||||
|
documentation, if provided along with the Derivative Works; or,
|
||||||
|
within a display generated by the Derivative Works, if and
|
||||||
|
wherever such third-party notices normally appear. The contents
|
||||||
|
of the NOTICE file are for informational purposes only and
|
||||||
|
do not modify the License. You may add Your own attribution
|
||||||
|
notices within Derivative Works that You distribute, alongside
|
||||||
|
or as an addendum to the NOTICE text from the Work, provided
|
||||||
|
that such additional attribution notices cannot be construed
|
||||||
|
as modifying the License.
|
||||||
|
|
||||||
|
You may add Your own copyright statement to Your modifications and
|
||||||
|
may provide additional or different license terms and conditions
|
||||||
|
for use, reproduction, or distribution of Your modifications, or
|
||||||
|
for any such Derivative Works as a whole, provided Your use,
|
||||||
|
reproduction, and distribution of the Work otherwise complies with
|
||||||
|
the conditions stated in this License.
|
||||||
|
|
||||||
|
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||||
|
any Contribution intentionally submitted for inclusion in the Work
|
||||||
|
by You to the Licensor shall be under the terms and conditions of
|
||||||
|
this License, without any additional terms or conditions.
|
||||||
|
Notwithstanding the above, nothing herein shall supersede or modify
|
||||||
|
the terms of any separate license agreement you may have executed
|
||||||
|
with Licensor regarding such Contributions.
|
||||||
|
|
||||||
|
6. Trademarks. This License does not grant permission to use the trade
|
||||||
|
names, trademarks, service marks, or product names of the Licensor,
|
||||||
|
except as required for reasonable and customary use in describing the
|
||||||
|
origin of the Work and reproducing the content of the NOTICE file.
|
||||||
|
|
||||||
|
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||||
|
agreed to in writing, Licensor provides the Work (and each
|
||||||
|
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||||
|
implied, including, without limitation, any warranties or conditions
|
||||||
|
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||||
|
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||||
|
appropriateness of using or redistributing the Work and assume any
|
||||||
|
risks associated with Your exercise of permissions under this License.
|
||||||
|
|
||||||
|
8. Limitation of Liability. In no event and under no legal theory,
|
||||||
|
whether in tort (including negligence), contract, or otherwise,
|
||||||
|
unless required by applicable law (such as deliberate and grossly
|
||||||
|
negligent acts) or agreed to in writing, shall any Contributor be
|
||||||
|
liable to You for damages, including any direct, indirect, special,
|
||||||
|
incidental, or consequential damages of any character arising as a
|
||||||
|
result of this License or out of the use or inability to use the
|
||||||
|
Work (including but not limited to damages for loss of goodwill,
|
||||||
|
work stoppage, computer failure or malfunction, or any and all
|
||||||
|
other commercial damages or losses), even if such Contributor
|
||||||
|
has been advised of the possibility of such damages.
|
||||||
|
|
||||||
|
9. Accepting Warranty or Additional Liability. While redistributing
|
||||||
|
the Work or Derivative Works thereof, You may choose to offer,
|
||||||
|
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||||
|
or other liability obligations and/or rights consistent with this
|
||||||
|
License. However, in accepting such obligations, You may act only
|
||||||
|
on Your own behalf and on Your sole responsibility, not on behalf
|
||||||
|
of any other Contributor, and only if You agree to indemnify,
|
||||||
|
defend, and hold each Contributor harmless for any liability
|
||||||
|
incurred by, or claims asserted against, such Contributor by reason
|
||||||
|
of your accepting any such warranty or additional liability.
|
||||||
|
|
||||||
|
END OF TERMS AND CONDITIONS
|
||||||
|
|
||||||
|
APPENDIX: How to apply the Apache License to your work.
|
||||||
|
|
||||||
|
To apply the Apache License to your work, attach the following
|
||||||
|
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||||
|
replaced with your own identifying information. (Don't include
|
||||||
|
the brackets!) The text should be enclosed in the appropriate
|
||||||
|
comment syntax for the file format. We also recommend that a
|
||||||
|
file or class name and description of purpose be included on the
|
||||||
|
same "printed page" as the copyright notice for easier
|
||||||
|
identification within third-party archives.
|
||||||
|
|
||||||
|
Copyright 2024 Chai Discovery
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
169
README.md
Executable file
169
README.md
Executable file
@@ -0,0 +1,169 @@
|
|||||||
|
# Chai-1
|
||||||
|
|
||||||
|
Chai-1 is a multi-modal foundation model for molecular structure prediction that performs at the state-of-the-art across a variety of benchmarks. Chai-1 enables unified prediction of proteins, small molecules, DNA, RNA, glycosylations, and more.
|
||||||
|
|
||||||
|
<p align="center">
|
||||||
|
<img src='https://github.com/chaidiscovery/chai-lab/blob/main/assets/performance_barplot.png' >
|
||||||
|
</p>
|
||||||
|
|
||||||
|
For more information on the model's performance and capabilities, see our [technical report](https://www.biorxiv.org/content/10.1101/2024.10.10.615955).
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
```shell
|
||||||
|
# version on pypi:
|
||||||
|
pip install chai_lab==0.5.2
|
||||||
|
|
||||||
|
# newest available version (updates daily to test features that weren't released yet):
|
||||||
|
pip install git+https://github.com/chaidiscovery/chai-lab.git
|
||||||
|
```
|
||||||
|
|
||||||
|
This Python package requires Linux, and a GPU with CUDA and bfloat16 support. We recommend using an A100 80GB or H100 80GB or L40S 48GB chip, but A10 and A30 will work for smaller complexes. Users have also reported success with consumer-grade RTX 4090.
|
||||||
|
|
||||||
|
## Running the model
|
||||||
|
|
||||||
|
### Command line inference
|
||||||
|
|
||||||
|
You can fold a FASTA file containing all the sequences (including modified residues, nucleotides, and ligands as SMILES strings) in a complex of interest by calling:
|
||||||
|
```shell
|
||||||
|
chai fold input.fasta output_folder
|
||||||
|
```
|
||||||
|
|
||||||
|
By default, the model generates five sample predictions, and uses embeddings without MSAs or templates. For additional information about how to supply MSAs and restraints to the model, see the documentation below, or run `chai fold --help`.
|
||||||
|
|
||||||
|
For example, to run the model with MSAs (which we recommend for improved performance), pass the `--use-msa-server` flag:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
chai fold --use-msa-server input.fasta output_folder
|
||||||
|
```
|
||||||
|
|
||||||
|
If you are hosting your own ColabFold server, additionally pass the `--msa-server-url` flag with your server:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
chai fold --use-msa-server --msa-server-url "https://api.internalcolabserver.com" input.fasta output_folder
|
||||||
|
```
|
||||||
|
|
||||||
|
We also provide additional utility functions for tasks such as MSA file format conversion; see `chai --help` for details.
|
||||||
|
|
||||||
|
### Pythonic inference
|
||||||
|
|
||||||
|
The main entrypoint into the Chai-1 folding code is through the `chai_lab.chai1.run_inference` function. The following script demonstrates how to provide inputs to the model, and obtain a list of PDB files for downstream analysis:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
python examples/predict_structure.py
|
||||||
|
```
|
||||||
|
|
||||||
|
To get the best performance, we recommend running the model with MSAs. The following script demonstrates how to provide MSAs to the model.
|
||||||
|
|
||||||
|
```shell
|
||||||
|
python examples/msas/predict_with_msas.py
|
||||||
|
```
|
||||||
|
|
||||||
|
For further instructions, see `"How can MSAs be provided to Chai-1?"` below.
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>Where are downloaded weights stored?</summary>
|
||||||
|
<p markdown="1">
|
||||||
|
By default, weights are automatically downloaded and stored in <package_root>/downloads (usually that's within site-packages).
|
||||||
|
In cases where you want to control the download location (e.g. on a mounted drive in Docker), you can use the CHAI_DOWNLOADS_DIR envvar to control the download location. For example:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
CHAI_DOWNLOADS_DIR=/tmp/downloads python ./examples/predict_structure.py
|
||||||
|
```
|
||||||
|
</p>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>How can MSAs be provided to Chai-1?</summary>
|
||||||
|
<p markdown="1">
|
||||||
|
|
||||||
|
Chai-1 supports MSAs provided as an `aligned.pqt` file. This file format is similar to an `a3m` file, but has additional columns that provide metadata like the source database and sequence pairing keys. We provide code to convert `a3m` files to `aligned.pqt` files. For more information on how to provide MSAs to Chai-1, see [this documentation](examples/msas/README.md).
|
||||||
|
|
||||||
|
For user convenience, we also support automatic MSA generation via the ColabFold [MMseqs2](https://github.com/soedinglab/MMseqs2) server via the `--use-msa-server` flag. As detailed in the ColabFold [repository](https://github.com/sokrypton/ColabFold), please keep in mind that this is a shared resource. Note that the results reported in our preprint and the webserver use a different MSA search strategy than MMseqs2, though we expect results to be broadly similar.
|
||||||
|
|
||||||
|
</p>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>How can I customize the inputs to the model further?</summary>
|
||||||
|
<p markdown="1">
|
||||||
|
|
||||||
|
For more advanced use cases, we also expose the `chai_lab.chai1.run_folding_on_context`, which allows users to construct an `AllAtomFeatureContext` manually. This allows users to specify their own templates, MSAs, embeddings, and constraints, including support for specifying covalent bonds (for example, for specifying branched ligands). We currently provide examples of how to construct an embeddings context, an MSA context, restraint contexts, and covalent bonds. We will be releasing helper methods to build template contexts soon.
|
||||||
|
|
||||||
|
</p>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
## ⚡ Try it online
|
||||||
|
|
||||||
|
We provide a [web server](https://lab.chaidiscovery.com) so you can test the Chai-1 model right from your browser, without any setup.
|
||||||
|
|
||||||
|
<p align="center">
|
||||||
|
<img src='assets/chailab_online_screenshot.png' height=400 >
|
||||||
|
</p>
|
||||||
|
|
||||||
|
## Using experimental restraints
|
||||||
|
Chai-1 uniquely offers the ability to fold complexes with user-specified "restraints" as inputs. These restraints specify inter-chain contacts or covalent bonds at various resolutions that are used to guide Chai-1 in folding the complex. See [restraints documentation](examples/restraints/README.md) and [covalent bond documentation](examples/covalent_bonds/README.md) for details.
|
||||||
|
|
||||||
|
<p align="center">
|
||||||
|
<img src='assets/chailab_restraints_screenshot.png' height=400 >
|
||||||
|
</p>
|
||||||
|
|
||||||
|
## 💬 Feedback
|
||||||
|
|
||||||
|
Found a 🐞? Please report it in GitHub [issues](https://github.com/chaidiscovery/chai-lab/issues).
|
||||||
|
|
||||||
|
We welcome community testing and feedback. To share observations about the model's performance, please reach out via [GitHub discussions](https://github.com/chaidiscovery/chai-lab/discussions), or [via email](mailto:feedback@chaidiscovery.com).
|
||||||
|
|
||||||
|
## 🛠️ Development
|
||||||
|
|
||||||
|
We use [devcontainers](https://code.visualstudio.com/docs/devcontainers/containers) in development, which helps us ensure we work in identical environments. We recommend working inside a devcontainer if you want to make a contribution to this repository.
|
||||||
|
|
||||||
|
Devcontainers work on local Linux setup, and on remote machines over an SSH connection.
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
API is quite stable, but we recommend pinning the version in your requirements, i.e.:
|
||||||
|
|
||||||
|
```
|
||||||
|
chai_lab==0.5.2
|
||||||
|
```
|
||||||
|
|
||||||
|
## Citations
|
||||||
|
|
||||||
|
If you find Chai-1 useful in your research or use any structures produced by the model, we ask that you cite our technical report:
|
||||||
|
|
||||||
|
```
|
||||||
|
@article{Chai-1-Technical-Report,
|
||||||
|
title = {Chai-1: Decoding the molecular interactions of life},
|
||||||
|
author = {{Chai Discovery}},
|
||||||
|
year = 2024,
|
||||||
|
journal = {bioRxiv},
|
||||||
|
publisher = {Cold Spring Harbor Laboratory},
|
||||||
|
doi = {10.1101/2024.10.10.615955},
|
||||||
|
url = {https://www.biorxiv.org/content/early/2024/10/11/2024.10.10.615955},
|
||||||
|
elocation-id = {2024.10.10.615955},
|
||||||
|
eprint = {https://www.biorxiv.org/content/early/2024/10/11/2024.10.10.615955.full.pdf}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
You can also access this information by running `chai citation`.
|
||||||
|
|
||||||
|
Additionally, if you use the automatic MMseqs2 MSA generation described above, please also cite:
|
||||||
|
|
||||||
|
```
|
||||||
|
@article{mirdita2022colabfold,
|
||||||
|
title={ColabFold: making protein folding accessible to all},
|
||||||
|
author={Mirdita, Milot and Sch{\"u}tze, Konstantin and Moriwaki, Yoshitaka and Heo, Lim and Ovchinnikov, Sergey and Steinegger, Martin},
|
||||||
|
journal={Nature methods},
|
||||||
|
year={2022},
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## Licence
|
||||||
|
|
||||||
|
Chai-1 is released under an Apache 2.0 License (both code and model weights), which means it can be used for both academic and commercial purposes, including for drug discovery.
|
||||||
|
|
||||||
|
See [LICENSE](LICENSE).
|
||||||
|
|
||||||
|
To discuss partnership and access to new internal capabilities, reach us [via email](mailto:partnerships@chaidiscovery.com).
|
||||||
10
entrypoint.sh
Executable file
10
entrypoint.sh
Executable file
@@ -0,0 +1,10 @@
|
|||||||
|
#!/bin/bash
# Container entrypoint: refuse to start when the NVIDIA tooling is missing,
# otherwise hand control to the command passed to the container.

# `command -v` only proves that the nvidia-smi binary is on PATH; it is used
# here as a proxy for "a GPU runtime is attached to this container".
if ! command -v nvidia-smi &> /dev/null; then
    # Diagnostics go to stderr so they are not mixed into captured stdout.
    echo "Error: NVIDIA GPU is required but not found" >&2
    exit 1
fi

# Replace this shell with the requested command (all arguments passed through
# unchanged) so that the command runs in place of the wrapper.
exec "$@"
|
||||||
14
input/.nextflow.log
Executable file
14
input/.nextflow.log
Executable file
@@ -0,0 +1,14 @@
|
|||||||
|
Jan-27 12:03:24.006 [main] DEBUG nextflow.cli.Launcher - $> nextflow run main.nf
|
||||||
|
Jan-27 12:03:24.041 [main] DEBUG nextflow.cli.CmdRun - N E X T F L O W ~ version 24.10.3
|
||||||
|
Jan-27 12:03:24.052 [main] DEBUG nextflow.plugin.PluginsFacade - Setting up plugin manager > mode=prod; embedded=false; plugins-dir=/root/.nextflow/plugins; core-plugins: nf-amazon@2.9.2,nf-azure@1.10.2,nf-cloudcache@0.4.2,nf-codecommit@0.2.2,nf-console@1.1.4,nf-google@1.15.3,nf-tower@1.9.3,nf-wave@1.7.4
|
||||||
|
Jan-27 12:03:24.067 [main] INFO o.pf4j.DefaultPluginStatusProvider - Enabled plugins: []
|
||||||
|
Jan-27 12:03:24.067 [main] INFO o.pf4j.DefaultPluginStatusProvider - Disabled plugins: []
|
||||||
|
Jan-27 12:03:24.069 [main] INFO org.pf4j.DefaultPluginManager - PF4J version 3.12.0 in 'deployment' mode
|
||||||
|
Jan-27 12:03:24.074 [main] INFO org.pf4j.AbstractPluginManager - No plugins
|
||||||
|
Jan-27 12:03:24.083 [main] DEBUG nextflow.scm.ProviderConfig - Using SCM config path: /root/.nextflow/scm
|
||||||
|
Jan-27 12:03:24.089 [main] DEBUG nextflow.cli.Launcher - Operation aborted
|
||||||
|
nextflow.exception.AbortOperationException: Cannot find script file: main.nf
|
||||||
|
at nextflow.cli.CmdRun.getScriptFile(CmdRun.groovy:536)
|
||||||
|
at nextflow.cli.CmdRun.run(CmdRun.groovy:325)
|
||||||
|
at nextflow.cli.Launcher.run(Launcher.groovy:503)
|
||||||
|
at nextflow.cli.Launcher.main(Launcher.groovy:658)
|
||||||
4
input/growth_hormone_complex.fasta
Executable file
4
input/growth_hormone_complex.fasta
Executable file
@@ -0,0 +1,4 @@
|
|||||||
|
>protein|name=growth-hormone
|
||||||
|
FPTIPLSRLFDNAMLRAHRLHQLAFDTYQEFEEAYIPKEQKYSFLQNPQTSLCFSESIPTPSNREETQQKSNLELLRISLLLIQSWLEPVQFLRSVFANSLVYGASDSNVYDLLKDLEEGIQTLMGRLEDGSPRTGQIFKQTYSKFDTNSHNDDALLKNYGLLYCFRKDMDKVETFLRIVQCRSVEGSCGF
|
||||||
|
>protein|name=growth-hormone-receptor
|
||||||
|
FSGSEATPGPLIFKWNHHSVFFDGYTSGGLQRFVHLHFGVSNKQLISICRKRANSKEPSSPIVPVPVGGQLLVDCSFRKLSGEGLHTYYYAAGQEEKTSDRSHRHGPGVGSCFRKTFEDGVYQCTARNEGYAYGHSITKSHRTSHQVCSRDGVPVLTENQAHLPEDFKEFTLRLKQKRQLLERGSPAMQDTFPAPSPETTVQEITSQHPGGTESPTVLRVKTEKSHQVYAGLSKYFHYAGQRGLRVLYLHKGESLARGTVTVPVKRDRGVLADRMVEAVDVQRWVGYLRNVYLTGQK
|
||||||
12
input/growth_hormone_restraints.txt
Executable file
12
input/growth_hormone_restraints.txt
Executable file
@@ -0,0 +1,12 @@
|
|||||||
|
# Restraints for growth hormone complex based on known binding interface
|
||||||
|
# Format: chain1 resid1 chain2 resid2 distance_lower distance_upper confidence
|
||||||
|
# Key interface contacts between growth hormone and its receptor
|
||||||
|
A 14 B 43 4.0 8.0 0.8
|
||||||
|
A 167 B 57 4.0 8.0 0.8
|
||||||
|
A 171 B 62 4.0 8.0 0.8
|
||||||
|
A 175 B 102 4.0 8.0 0.8
|
||||||
|
A 178 B 166 4.0 8.0 0.8
|
||||||
|
|
||||||
|
# Additional stabilizing contacts
|
||||||
|
A 65 B 150 4.0 9.0 0.7
|
||||||
|
A 164 B 191 4.0 9.0 0.7
|
||||||
6
input/insulin_complex.fasta
Executable file
6
input/insulin_complex.fasta
Executable file
@@ -0,0 +1,6 @@
|
|||||||
|
>protein|name=insulin-a-chain
|
||||||
|
GIVEQCCTSICSLYQLENYCN
|
||||||
|
>protein|name=insulin-b-chain
|
||||||
|
FVNQHLCGSHLVEALYLVCGERGFFYTPKT
|
||||||
|
>protein|name=insulin-receptor-l1
|
||||||
|
PQAFVNWLRGGSQQVEVFVSDLPKLRNLLQGEELLGRGSFGVVYEGNARDIIKGEAETRVAVKTVNESASLRERIEFLNEASVMKGFTCHHVVRLLGVVSKGQPTLVVMELMAHGDLKSYLRSLRPEAENNPGRPPPTLQEMIQMAAEIADGMAYLNAKKFVHRDLAARNCMVAHDFTVKIGDFGMTRDIYETDYYRKGGKGLLPVRWMAPESLKDGVFTTSSDMWSFGVVLWEITSLAEQPYQGLSNEQVLKFVMDGGYLDQPDNCPERVTDLMRMCWQFNPKMRPTFLEIVNLLKDDLHPSFPEVSFFHSEENKAPESEELEMEFEDMENVPLDRSSHCQREEAGGRDGGSSLGFKRSYEEHIPYTHMNGGKKNGRILTLPRSNPS
|
||||||
10
input/insulin_restraints.txt
Executable file
10
input/insulin_restraints.txt
Executable file
@@ -0,0 +1,10 @@
|
|||||||
|
# Restraints for insulin complex based on known interaction sites
|
||||||
|
# Format: chain1 resid1 chain2 resid2 distance_lower distance_upper confidence
|
||||||
|
# Insulin A chain to B chain contacts (disulfide bonds)
|
||||||
|
A 7 B 7 3.0 5.0 0.9
|
||||||
|
A 20 B 19 3.0 5.0 0.9
|
||||||
|
|
||||||
|
# Insulin (A+B) to receptor contacts
|
||||||
|
A 12 C 155 4.0 8.0 0.8
|
||||||
|
B 24 C 210 4.0 8.0 0.8
|
||||||
|
B 25 C 215 4.0 8.0 0.8
|
||||||
45
main.nf
Executable file
45
main.nf
Executable file
@@ -0,0 +1,45 @@
|
|||||||
|
#!/usr/bin/env nextflow
nextflow.enable.dsl=2

// Pipeline parameters; each one can be overridden on the command line
// (for example `--num_samples 10`).
params.input_dir = 's3://omic/eureka/chai-lab/input'
params.outdir = 's3://omic/eureka/chai-lab/output'
params.use_msa = true
params.msa_server = 'https://api.colabfold.com'
params.num_samples = 5

/*
 * CHAI1: run `chai fold` on a single FASTA file.
 *
 * Input : one FASTA file.
 * Output: a directory named after the FASTA file with its last extension
 *         removed (emitted as `output_dir`) and copied to params.outdir.
 *
 * The output directory name is computed once, from `fasta.baseName`, and used
 * both in the `output:` declaration and in the script. Deriving the two from
 * different rules (first-dot vs. last-extension stripping) makes the task fail
 * with a missing-output error for names such as `a.b.fasta`.
 */
process CHAI1 {
    container 'harbor.cluster.omic.ai/omic/chai1:latest'
    publishDir params.outdir, mode: 'copy'
    // Copy (rather than link) the staged input into the task directory.
    stageInMode 'copy'
    // Run at most one CHAI1 task at a time.
    maxForks 1

    input:
    path fasta

    output:
    path "${fasta.baseName}", emit: output_dir

    script:
    """
    OUTPUT_DIR="${fasta.baseName}"
    mkdir -p "\$OUTPUT_DIR"

    # Construct MSA parameters
    MSA_OPTIONS=""
    if ${params.use_msa}; then
        MSA_OPTIONS="--use-msa-server --msa-server-url ${params.msa_server}"
    fi

    # Run CHAI1 (MSA_OPTIONS is intentionally unquoted so it splits into
    # separate arguments, or into nothing when MSA is disabled)
    chai fold \\
        \$MSA_OPTIONS \\
        --num-diffn-samples ${params.num_samples} \\
        "${fasta}" \\
        "\$OUTPUT_DIR"
    """
}

workflow {
    // One CHAI1 task per FASTA file found directly under params.input_dir.
    // Abort with a clear message instead of finishing silently with no work
    // when the glob matches nothing.
    fasta_ch = Channel
        .fromPath(params.input_dir + '/*.fasta')
        .ifEmpty { error "No FASTA files (*.fasta) found in ${params.input_dir}" }
    CHAI1(fasta_ch)
}
|
||||||
11
nextflow.config
Executable file
11
nextflow.config
Executable file
@@ -0,0 +1,11 @@
|
|||||||
|
// Container runtime: run every process inside its Docker container.
docker.enabled = true
// 'auto' asks Nextflow to create a temporary directory for each container run.
docker.temp = 'auto'

// S3 client settings: custom endpoint addressed with path-style URLs.
aws.client.endpoint = 'https://s3.cluster.omic.ai'
aws.client.s3PathStyleAccess = true
|
||||||
83
params.json
Executable file
83
params.json
Executable file
@@ -0,0 +1,83 @@
|
|||||||
|
{
|
||||||
|
"params": {
|
||||||
|
"input_dir": {
|
||||||
|
"type": "folder",
|
||||||
|
"description": "Directory containing FASTA files and optional restraints",
|
||||||
|
"default": "s3://omic/eureka/chai-lab/input",
|
||||||
|
"required": true,
|
||||||
|
"pipeline_io": "input",
|
||||||
|
"var_name": "params.input_dir",
|
||||||
|
"examples": [
|
||||||
|
"s3://omic/eureka/chai-lab/input"
|
||||||
|
],
|
||||||
|
"pattern": ".*",
|
||||||
|
"enum": [],
|
||||||
|
"validation": {},
|
||||||
|
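"notes": "Directory containing the input FASTA files; every file matching *.fasta is folded (the bundled examples use a *_complex.fasta suffix). Restraints files (*_restraints.txt) may sit alongside them but are not read by the current pipeline."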
|
||||||
|
},
|
||||||
|
"outdir": {
|
||||||
|
"type": "folder",
|
||||||
|
"description": "Directory for chai1 prediction results",
|
||||||
|
"default": "s3://omic/eureka/chai-lab/output",
|
||||||
|
"required": true,
|
||||||
|
"pipeline_io": "output",
|
||||||
|
"var_name": "params.outdir",
|
||||||
|
"examples": [
|
||||||
|
"s3://omic/eureka/chai-lab/output"
|
||||||
|
],
|
||||||
|
"pattern": ".*",
|
||||||
|
"enum": [],
|
||||||
|
"validation": {},
|
||||||
|
"notes": "Directory where prediction results and log files will be stored. Will be created if it doesn't exist."
|
||||||
|
},
|
||||||
|
"use_msa": {
|
||||||
|
"type": "boolean",
|
||||||
|
"description": "Enable/disable MSA server usage",
|
||||||
|
"default": true,
|
||||||
|
"required": false,
|
||||||
|
"pipeline_io": "parameter",
|
||||||
|
"var_name": "params.use_msa",
|
||||||
|
"examples": [
|
||||||
|
true,
|
||||||
|
false
|
||||||
|
],
|
||||||
|
"enum": [true, false],
|
||||||
|
"validation": {},
|
||||||
|
"notes": "Whether to use MSA server for improved predictions"
|
||||||
|
},
|
||||||
|
"msa_server": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "MSA server URL",
|
||||||
|
"default": "https://api.colabfold.com",
|
||||||
|
"required": false,
|
||||||
|
"pipeline_io": "parameter",
|
||||||
|
"var_name": "params.msa_server",
|
||||||
|
"examples": [
|
||||||
|
"https://api.colabfold.com"
|
||||||
|
],
|
||||||
|
"pattern": "^https?://.*",
|
||||||
|
"enum": [],
|
||||||
|
"validation": {},
|
||||||
|
"notes": "URL of the MSA server to use when use_msa is enabled"
|
||||||
|
},
|
||||||
|
"num_samples": {
|
||||||
|
"type": "integer",
|
||||||
|
"description": "Number of diffusion samples to generate",
|
||||||
|
"default": 5,
|
||||||
|
"required": false,
|
||||||
|
"pipeline_io": "parameter",
|
||||||
|
"var_name": "params.num_samples",
|
||||||
|
"examples": [
|
||||||
|
5,
|
||||||
|
10
|
||||||
|
],
|
||||||
|
"pattern": "^[1-9]\\d*$",
|
||||||
|
"enum": [],
|
||||||
|
"validation": {
|
||||||
|
"min": 1,
|
||||||
|
"max": 50
|
||||||
|
},
|
||||||
|
"notes": "Number of structure samples to generate using diffusion"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
74
pyproject.toml
Executable file
74
pyproject.toml
Executable file
@@ -0,0 +1,74 @@
|
|||||||
|
# important: install in editable mode
|
||||||
|
[build-system]
|
||||||
|
requires = [
|
||||||
|
"hatchling>=1.20", # build backend
|
||||||
|
"hatch-requirements-txt", # plugin, to parse requirements.txt
|
||||||
|
]
|
||||||
|
build-backend = "hatchling.build"
|
||||||
|
|
||||||
|
|
||||||
|
[project]
|
||||||
|
name = "chai_lab"
|
||||||
|
description = "Chai Discovery tools for AI + protein research."
|
||||||
|
requires-python = ">=3.10"
|
||||||
|
authors = [{ name = "Chai Discovery" }]
|
||||||
|
# see both defined below
|
||||||
|
dynamic = ["version", "dependencies"]
|
||||||
|
|
||||||
|
[tool.hatch.version]
|
||||||
|
path = "chai_lab/__init__.py"
|
||||||
|
[tool.hatch.metadata.hooks.requirements_txt]
|
||||||
|
files = ["requirements.in"]
|
||||||
|
[tool.hatch.metadata]
|
||||||
|
allow-direct-references = true
|
||||||
|
|
||||||
|
[tool.mypy]
|
||||||
|
check_untyped_defs = true
|
||||||
|
|
||||||
|
# Ignore missing imports for packages with missing type stubs
|
||||||
|
[[tool.mypy.overrides]]
|
||||||
|
module = [
|
||||||
|
"anarci.*",
|
||||||
|
"fsspec.*",
|
||||||
|
"google.*",
|
||||||
|
"joblib.*",
|
||||||
|
"needletail.*",
|
||||||
|
"numba.*",
|
||||||
|
"pyximport.*",
|
||||||
|
"rdkit.*",
|
||||||
|
"scipy.*",
|
||||||
|
"seaborn.*",
|
||||||
|
"sh.*",
|
||||||
|
"tmtools.*",
|
||||||
|
"botocore.*",
|
||||||
|
"s3fs.*",
|
||||||
|
"biotite.*",
|
||||||
|
"DockQ.*",
|
||||||
|
"boto3.*",
|
||||||
|
"transformers.*",
|
||||||
|
"modelcif.*",
|
||||||
|
"ihm.*",
|
||||||
|
]
|
||||||
|
ignore_missing_imports = true
|
||||||
|
|
||||||
|
[tool.pytest.ini_options]
|
||||||
|
cache_dir = "/tmp/.common_pytest_cache"
|
||||||
|
|
||||||
|
|
||||||
|
[tool.hatch.build.targets.sdist]
|
||||||
|
exclude = [
|
||||||
|
"/.devcontainer",
|
||||||
|
"/.github",
|
||||||
|
"/.idea",
|
||||||
|
"/.vscode",
|
||||||
|
"/.pytest_cache",
|
||||||
|
"/assets",
|
||||||
|
"/downloads",
|
||||||
|
"/outputs",
|
||||||
|
]
|
||||||
|
|
||||||
|
[tool.hatch.build.targets.wheel]
|
||||||
|
# should use packages from sdist section
|
||||||
|
|
||||||
|
[project.scripts]
|
||||||
|
chai = "chai_lab.main:cli"
|
||||||
51
requirements.in
Executable file
51
requirements.in
Executable file
@@ -0,0 +1,51 @@
|
|||||||
|
# dev-deps, still placed in the same requirements file
|
||||||
|
ruff==0.6.3 # in sync with pre-commit-hook
|
||||||
|
mypy
|
||||||
|
pytest
|
||||||
|
pre-commit
|
||||||
|
|
||||||
|
# types/stubs are required by mypy
|
||||||
|
pandas-stubs
|
||||||
|
types-pyyaml
|
||||||
|
types-tqdm
|
||||||
|
typing-extensions
|
||||||
|
types-requests
|
||||||
|
|
||||||
|
# CLI, administrator tools
|
||||||
|
typer~=0.12 # CLI generator
|
||||||
|
# pydantic~=2.5 # serialization/deserialization of configs
|
||||||
|
|
||||||
|
# notebooks, plotting
|
||||||
|
ipykernel~=6.27 # needed by vs code to run notebooks in devcontainer
|
||||||
|
# seaborn
|
||||||
|
matplotlib
|
||||||
|
|
||||||
|
# misc
|
||||||
|
tqdm~=4.66
|
||||||
|
|
||||||
|
# data import/export, application-specific
|
||||||
|
gemmi~=0.6.3 # pdb/mmcif parsing
|
||||||
|
rdkit==2023.9.5 # parsing of ligands. 2023.9.6 has broken type stubs
|
||||||
|
biopython>=1.83 # parsing, data access
|
||||||
|
antipickle==0.2.0 # save/load heterogeneous python structures
|
||||||
|
tmtools>=0.0.3 # Python bindings for the TM-align algorithm
|
||||||
|
modelcif>=1.0 # mmcif writing, confirmed to work currently latest 1.0
|
||||||
|
|
||||||
|
# commented out following optional dependencies for release on pypi
|
||||||
|
# dockq metric for comparing predicted pdbs and ground truth pdbs
|
||||||
|
# dockq @ git+https://github.com/bjornwallner/DockQ.git@v2.1.1
|
||||||
|
# pip-compatible minimized version of anarci
|
||||||
|
# anarci @ git+https://github.com/arogozhnikov/microANARCI@d81823395d0c3532d6e033d80b036b4aa4a4565e
|
||||||
|
|
||||||
|
# computing, dl
|
||||||
|
numpy~=1.21
|
||||||
|
pandas[parquet,gcp,aws]~=2.1
|
||||||
|
pandera
|
||||||
|
numba>=0.59
|
||||||
|
# polars
|
||||||
|
einops~=0.8
|
||||||
|
jaxtyping>=0.2.25 # versions <0.2.25 do not easily support runtime typechecking
|
||||||
|
beartype>=0.18 # compatible typechecker to use with jaxtyping
|
||||||
|
# do not use 2.2 because https://github.com/pytorch/pytorch/issues/122385
|
||||||
|
torch~=2.3.1
|
||||||
|
transformers~=4.44 # for esm inference
|
||||||
12
ruff.toml
Executable file
12
ruff.toml
Executable file
@@ -0,0 +1,12 @@
|
|||||||
|
# move ruff cache outside of worktree
|
||||||
|
cache-dir = "/tmp/.ruff_chai_cache"
|
||||||
|
|
||||||
|
|
||||||
|
[lint]
|
||||||
|
extend-select = ["I"]
|
||||||
|
# jaxtyping requires disabling the following two errors
|
||||||
|
# https://docs.kidger.site/jaxtyping/faq/#flake8-or-ruff-are-throwing-an-error
|
||||||
|
ignore = ["F821", "F722"]
|
||||||
|
|
||||||
|
[lint.isort]
|
||||||
|
known-first-party = ["chai", "chai_lab"]
|
||||||
4
tests/__init__.py
Executable file
4
tests/__init__.py
Executable file
@@ -0,0 +1,4 @@
|
|||||||
|
# Copyright (c) 2024 Chai Discovery, Inc.
|
||||||
|
# Licensed under the Apache License, Version 2.0.
|
||||||
|
# See the LICENSE file for details.
|
||||||
|
|
||||||
37
tests/example_inputs.py
Executable file
37
tests/example_inputs.py
Executable file
@@ -0,0 +1,37 @@
|
|||||||
|
# Copyright (c) 2024 Chai Discovery, Inc.
|
||||||
|
|
||||||
|
# Licensed under the Apache License, Version 2.0.
|
||||||
|
# See the LICENSE file for details.
|
||||||
|
|
||||||
|
# Ligands written as SMILES strings.
# Entries containing a backslash (a SMILES bond-direction marker) must be raw
# strings so Python does not treat the backslash as an escape character.
example_ligands = [
    "C",
    "O",
    "C(C1C(C(C(C(O1)O)O)O)O)O",
    "[O-]S(=O)(=O)[O-]",
    "CC1=C(C(CCC1)(C)C)/C=C/C(=C/C=C/C(=C/C=O)/C)/C",
    r"CCC1=C(c2cc3c(c(c4n3[Mg]56[n+]2c1cc7n5c8c(c9[n+]6c(c4)C(C9CCC(=O)OC/C=C(\C)/CCC[C@H](C)CCC[C@H](C)CCCC(C)C)C)[C@H](C(=O)c8c7C)C(=O)OC)C)C=C)C=O",
    r"C=CC1=C(C)/C2=C/c3c(C)c(CCC(=O)O)c4n3[Fe@TB16]35<-N2=C1/C=c1/c(C)c(C=C)/c(n13)=C/C1=N->5/C(=C\4)C(CCC(=O)O)=C1C",
    # different ions
    "[Mg+2]",
    "[Na+]",
    "[Cl-]",
]

# Protein sequences, including one written with parenthesised multi-character
# residue codes and one containing the placeholder residue "X".
example_proteins = [
    "AGSHSMRYFSTSVSRPGRGEPRFIAVGYVDDTQFVR",
    "(KCJ)(SEP)(PPN)(B3S)(BAL)(PPN)K(NH2)",
    "XDHPX",
]

# RNA sequences (alphabet A/C/G/U).
example_rna = [
    "AGUGGCUA",
    "AAAAAA",
    "AGUC",
]

# DNA sequences (alphabet A/C/G/T).
example_dna = [
    "AGTGGCTA",
    "AAAAAA",
    "AGTC",
]
|
||||||
24
tests/test_cif_utils.py
Executable file
24
tests/test_cif_utils.py
Executable file
@@ -0,0 +1,24 @@
|
|||||||
|
# Copyright (c) 2024 Chai Discovery, Inc.
|
||||||
|
# Licensed under the Apache License, Version 2.0.
|
||||||
|
# See the LICENSE file for details.
|
||||||
|
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from chai_lab.data.io.cif_utils import get_chain_letter
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_chain_letter():
    """Check the index-to-chain-name mapping at its boundaries."""
    # Index 0 is rejected through an assertion inside the function.
    with pytest.raises(AssertionError):
        get_chain_letter(0)

    # One-letter names: upper case first, then lower case.
    one_letter_cases = [(1, "A"), (26, "Z"), (27, "a"), (52, "z")]
    for index, letter in one_letter_cases:
        assert get_chain_letter(index) == letter

    # Two-letter names start right after the 52 one-letter ones.
    two_letter_cases = [(53, "AA"), (54, "AB")]
    for index, letters in two_letter_cases:
        assert get_chain_letter(index) == letters

    # 26 + 26 = 52 one-letter codes plus 52 * 52 two-letter codes,
    # so the last two-letter code sits at index 52 * 52 + 52.
    last_two_letter_index = 52 * 52 + 52
    assert get_chain_letter(last_two_letter_index) == "zz"
|
||||||
108
tests/test_glycans.py
Executable file
108
tests/test_glycans.py
Executable file
@@ -0,0 +1,108 @@
|
|||||||
|
# Copyright (c) 2024 Chai Discovery, Inc.
|
||||||
|
# Licensed under the Apache License, Version 2.0.
|
||||||
|
# See the LICENSE file for details.
|
||||||
|
from collections import Counter
|
||||||
|
from pathlib import Path
|
||||||
|
from tempfile import TemporaryDirectory
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from chai_lab.chai1 import make_all_atom_feature_context
|
||||||
|
from chai_lab.data.parsing.glycans import _glycan_string_to_sugars_and_bonds
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("ccd_code", ["MAN", "99K", "FUC"])
def test_parsing_ccd_codes(ccd_code: str):
    """A lone CCD code must parse into exactly one sugar."""
    sugars, _bonds = _glycan_string_to_sugars_and_bonds(ccd_code)
    assert len(sugars) == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_complex_parsing():
    """A branched five-sugar glycan yields four bonds with the expected endpoints."""
    glycan = "MAN(6-1 FUC)(4-1 MAN(6-1 MAN(6-1 MAN)))".replace(" ", "")
    sugars, bonds = _glycan_string_to_sugars_and_bonds(glycan)
    assert len(sugars) == 5

    # Unpacking requires exactly four bonds.
    first, second, third, fourth = bonds

    # (bond, source sugar, destination sugar, source atom, destination atom)
    expectations = [
        (first, 0, 1, 6, 1),
        (second, 0, 2, 4, 1),
        (third, 2, 3, 6, 1),
        (fourth, 3, 4, 6, 1),
    ]
    for bond, src_sugar, dst_sugar, src_atom, dst_atom in expectations:
        assert bond.src_sugar_index == src_sugar
        assert bond.dst_sugar_index == dst_sugar
        assert bond.src_atom == src_atom
        assert bond.dst_atom == dst_atom
|
||||||
|
|
||||||
|
|
||||||
|
def test_complex_parsing_2():
    """A nine-sugar glycan with nested branches links the expected sugar pairs."""
    spaced = "MAN(4-1 FUC(4-1 MAN)(6-1 FUC(4-1 MAN)))(6-1 MAN(6-1 MAN(4-1 MAN)(6-1 FUC)))"
    glycan = spaced.replace(" ", "")
    sugars, bonds = _glycan_string_to_sugars_and_bonds(glycan)
    assert len(sugars) == 9

    # (source sugar index, destination sugar index), in the order bonds are returned.
    expected_bonds = [(0, 1), (1, 2), (1, 3), (3, 4), (0, 5), (5, 6), (6, 7), (6, 8)]
    # strict=True makes a bond-count mismatch an error instead of a silent truncation.
    for pair, bond in zip(expected_bonds, bonds, strict=True):
        wanted_src, wanted_dst = pair
        assert bond.src_sugar_index == wanted_src
        assert bond.dst_sugar_index == wanted_dst
|
||||||
|
|
||||||
|
|
||||||
|
def test_glycan_tokenization_with_bond():
    """Tokenize a two-sugar glycan and check atom counts, elements and the bond feature."""
    glycan = ">glycan|foo\nNAG(4-1 NAG)\n"
    with TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir)
        fasta_file = workdir / "input.fasta"
        fasta_file.write_text(glycan)

        feature_context = make_all_atom_feature_context(
            fasta_file,
            output_dir=workdir / "out",
            use_esm_embeddings=False,  # Just a test; no need
        )

    structure = feature_context.structure_context

    # Each NAG component is C8 H15 N O6 -> 8 + 1 + 6 = 15 heavy atoms
    # The bond between them displaces one oxygen, leaving 2 * 15 - 1 = 29 atoms
    assert structure.atom_exists_mask.sum() == 29
    # We originally constructed all atoms and dropped the atoms that leave
    assert structure.atom_exists_mask.numel() == 30

    present_elements = structure.atom_ref_element[structure.atom_exists_mask]
    elements = Counter(present_elements.tolist())
    assert elements[6] == 16  # 6 = Carbon
    assert elements[7] == 2  # 7 = Nitrogen
    assert elements[8] == 11  # 8 = Oxygen

    # Single bond feature between O and C
    left, right = structure.atom_covalent_bond_indices
    assert left.numel() == right.numel() == 1
    bond_elements = {
        structure.atom_ref_element[left].item(),
        structure.atom_ref_element[right].item(),
    }
    assert bond_elements == {8, 6}
|
||||||
106
tests/test_inference_dataset.py
Executable file
106
tests/test_inference_dataset.py
Executable file
@@ -0,0 +1,106 @@
|
|||||||
|
# Copyright (c) 2024 Chai Discovery, Inc.
|
||||||
|
# Licensed under the Apache License, Version 2.0.
|
||||||
|
# See the LICENSE file for details.
|
||||||
|
|
||||||
|
"""
|
||||||
|
Tests for inference dataset.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from chai_lab.data.dataset.inference_dataset import Input, load_chains_from_raw
|
||||||
|
from chai_lab.data.dataset.structure.all_atom_residue_tokenizer import (
|
||||||
|
AllAtomResidueTokenizer,
|
||||||
|
)
|
||||||
|
from chai_lab.data.dataset.structure.all_atom_structure_context import (
|
||||||
|
AllAtomStructureContext,
|
||||||
|
)
|
||||||
|
from chai_lab.data.dataset.structure.chain import Chain
|
||||||
|
from chai_lab.data.parsing.structure.entity_type import EntityType
|
||||||
|
from chai_lab.data.sources.rdkit import RefConformerGenerator
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def tokenizer() -> AllAtomResidueTokenizer:
    """Provide a residue tokenizer built on a fresh RefConformerGenerator."""
    conformer_generator = RefConformerGenerator()
    return AllAtomResidueTokenizer(conformer_generator)
|
||||||
|
|
||||||
|
|
||||||
|
def test_malformed_smiles(tokenizer: AllAtomResidueTokenizer):
    """An input with a malformed SMILES string is dropped; the others are kept."""
    protein = EntityType.PROTEIN.value
    ligand = EntityType.LIGAND.value
    # Zn ligand is malformed (should be [Zn+2])
    inputs = [
        Input("RKDESES", entity_type=protein, entity_name="foo"),
        Input("Zn", entity_type=ligand, entity_name="bar"),
        Input("RKEEE", entity_type=protein, entity_name="baz"),
        Input("EEEEEEEEEEEE", entity_type=protein, entity_name="boz"),
    ]

    chains = load_chains_from_raw(inputs, identifier="test", tokenizer=tokenizer)

    # Four inputs, one dropped.
    assert len(chains) == 3
    for chain in chains:
        # NOTE this check is only valid because there are no residues that are tokenized per-atom
        # Ensures that the entity data and the structure context in each chain are paired correctly
        sequence_length = len(chain.entity_data.full_sequence)
        assert chain.structure_context.num_tokens == sequence_length
|
||||||
|
|
||||||
|
|
||||||
|
def test_ions_parsing(tokenizer: AllAtomResidueTokenizer):
    """An ion given as a SMILES string becomes a single atom with the right charge and element."""
    magnesium = Input("[Mg+2]", entity_type=EntityType.LIGAND.value, entity_name="foo")
    chains = load_chains_from_raw([magnesium], identifier="foo", tokenizer=tokenizer)
    assert len(chains) == 1

    context = chains[0].structure_context
    assert context.num_atoms == 1
    assert context.atom_ref_charge == 2
    assert context.atom_ref_element.item() == 12  # element number asserted for "[Mg+2]"
|
||||||
|
|
||||||
|
|
||||||
|
def test_protein_with_smiles(tokenizer: AllAtomResidueTokenizer):
    """Duplicated protein chains and duplicated SMILES ligands get consistent entity/sym/asym ids."""
    # Based on https://www.rcsb.org/structure/1AFS
    seq = "MDSISLRVALNDGNFIPVLGFGTTVPEKVAKDEVIKATKIAIDNGFRHFDSAYLYEVEEEVGQAIRSKIEDGTVKREDIFYTSKLWSTFHRPELVRTCLEKTLKSTQLDYVDLYIIHFPMALQPGDIFFPRDEHGKLLFETVDICDTWEAMEKCKDAGLAKSIGVSNFNCRQLERILNKPGLKYKPVCNQVECHLYLNQSKMLDYCKSKDIILVSYCTLGSSRDKTWVDQKSPVLLDDPVLCAIAKKYKQTPALVALRYQLQRGVVPLIRSFNAKRIKELTQVFEFQLASEDMKALDGLNRNFRYNNAKYFDDHPNHPFTDEN"
    nap = "NC(=O)c1ccc[n+](c1)[CH]2O[CH](CO[P]([O-])(=O)O[P](O)(=O)OC[CH]3O[CH]([CH](O[P](O)(O)=O)[CH]3O)n4cnc5c(N)ncnc45)[CH](O)[CH]2O"
    tes = "O=C4C=C3C(C2CCC1(C(CCC1O)C2CC3)C)(C)CC4"

    protein_type = EntityType.PROTEIN.value
    ligand_type = EntityType.LIGAND.value
    # (sequence or SMILES, entity type, entity name): two copies of each molecule.
    specs = [
        (seq, protein_type, "A"),
        (seq, protein_type, "B"),
        (nap, ligand_type, "C"),
        (nap, ligand_type, "D"),
        (tes, ligand_type, "E"),
        (tes, ligand_type, "F"),
    ]
    inputs = [Input(text, kind, entity_name=name) for text, kind, name in specs]

    chains: list[Chain] = load_chains_from_raw(inputs, tokenizer=tokenizer)
    assert len(chains) == len(inputs)

    contexts = [chain.structure_context for chain in chains]
    example = AllAtomStructureContext.merge(contexts)

    # Three distinct entities (1 protein, 2 ligands) spread over six chains
    assert example.token_entity_id.unique().numel() == 3
    assert example.token_asym_id.unique().numel() == 6

    # Check protein chains
    is_protein = example.token_entity_type == EntityType.PROTEIN.value
    assert torch.unique(example.token_entity_id[is_protein]).numel() == 1
    assert torch.unique(example.token_sym_id[is_protein]).numel() == 2  # Two copies of this chain

    # Check ligand chains
    is_ligand = example.token_entity_type == EntityType.LIGAND.value
    assert torch.unique(example.token_entity_id[is_ligand]).numel() == 2
    assert torch.unique(example.token_sym_id[is_ligand]).numel() == 2  # Two copies of each ligand
|
||||||
36
tests/test_msa_a3m_tokenization.py
Executable file
36
tests/test_msa_a3m_tokenization.py
Executable file
@@ -0,0 +1,36 @@
|
|||||||
|
# Copyright (c) 2024 Chai Discovery, Inc.
|
||||||
|
# Licensed under the Apache License, Version 2.0.
|
||||||
|
# See the LICENSE file for details.
|
||||||
|
"""
|
||||||
|
Test for tokenization
|
||||||
|
"""
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from chai_lab.data.parsing.msas.a3m import tokenize_sequences_to_arrays
|
||||||
|
from chai_lab.data.residue_constants import residue_types_with_nucleotides_order
|
||||||
|
|
||||||
|
|
||||||
|
def test_tokenization_basic():
    """A single plain sequence tokenizes to one row of residue-type indices."""
    sequence = "RKDES"

    tokens, deletions = tokenize_sequences_to_arrays([sequence])

    assert tokens.shape == deletions.shape == (1, 5)
    expected = np.array([residue_types_with_nucleotides_order[res] for res in sequence])
    assert np.all(tokens == expected)
|
||||||
|
|
||||||
|
|
||||||
|
def test_tokenization_with_insertion():
    """Lower-case insertions add no tokens; they only show up in the deletion counts."""
    reference = "RKDES"
    with_insertion = "RKrkdesDES"

    tokens, deletions = tokenize_sequences_to_arrays([reference, with_insertion])

    # Both rows have the reference length and identical tokens.
    assert tokens.shape == deletions.shape == (2, 5)
    assert np.all(tokens[0] == tokens[1])
    # All five inserted characters are counted at row 1, column 2, and nowhere else.
    assert deletions.sum() == 5
    assert deletions[1, 2] == 5
|
||||||
25
tests/test_msa_preprocess.py
Executable file
25
tests/test_msa_preprocess.py
Executable file
@@ -0,0 +1,25 @@
|
|||||||
|
# Copyright (c) 2024 Chai Discovery, Inc.
|
||||||
|
# Licensed under the Apache License, Version 2.0.
|
||||||
|
# See the LICENSE file for details.
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from chai_lab.data.dataset.msas.msa_context import NO_PAIRING_KEY
|
||||||
|
from chai_lab.data.dataset.msas.preprocess import _UKEY_FOR_QUERY, prepair_ukey
|
||||||
|
|
||||||
|
|
||||||
|
def test_prepair_ukey():
    """prepair_ukey keys rows by (key, rank) and leaves out rows marked NO_PAIRING_KEY."""
    keys = torch.tensor([1, 1, 2, 1, NO_PAIRING_KEY, 2, 3])
    edit_dists = torch.arange(len(keys))

    # Row indices that take part in pairing: every row except the unpaired ones.
    pairable_rows = {
        row for row, key in enumerate(keys.tolist()) if key != NO_PAIRING_KEY
    }

    paired = prepair_ukey(keys, edit_dists)
    assert list(paired) == [_UKEY_FOR_QUERY, (1, 0), (2, 0), (1, 1), (2, 1), (3, 0)]
    assert set(paired.values()) == pairable_rows

    # Reverse the edit distances
    reversed_dists = torch.tensor(edit_dists.tolist()[::-1])
    paired = prepair_ukey(keys, reversed_dists)
    assert list(paired) == [_UKEY_FOR_QUERY, (1, 1), (2, 1), (1, 0), (2, 0), (3, 0)]
    assert set(paired.values()) == pairable_rows
|
||||||
79
tests/test_parsing.py
Executable file
79
tests/test_parsing.py
Executable file
@@ -0,0 +1,79 @@
|
|||||||
|
# Copyright (c) 2024 Chai Discovery, Inc.
|
||||||
|
# Licensed under the Apache License, Version 2.0.
|
||||||
|
# See the LICENSE file for details.
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from tempfile import TemporaryDirectory
|
||||||
|
|
||||||
|
from chai_lab.data.parsing.fasta import read_fasta
|
||||||
|
from chai_lab.data.parsing.input_validation import (
|
||||||
|
constituents_of_modified_fasta,
|
||||||
|
identify_potential_entity_types,
|
||||||
|
)
|
||||||
|
from chai_lab.data.parsing.structure.entity_type import EntityType
|
||||||
|
|
||||||
|
from .example_inputs import example_dna, example_ligands, example_proteins, example_rna
|
||||||
|
|
||||||
|
|
||||||
|
def test_simple_protein_fasta():
    """A plain protein sequence splits into one constituent per residue letter."""
    parts = constituents_of_modified_fasta("RKDES")
    assert parts is not None
    # Compare whole lists: a zip-based check stops at the shorter side and would
    # accept an empty or truncated result.
    assert list(parts) == ["R", "K", "D", "E", "S"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_modified_protein_fasta():
    """Parenthesised multi-character codes and single letters each become one constituent."""
    parts = constituents_of_modified_fasta("(KCJ)(SEP)(PPN)(B3S)(BAL)(PPN)KX(NH2)")
    assert parts is not None
    expected = ["KCJ", "SEP", "PPN", "B3S", "BAL", "PPN", "K", "X", "NH2"]
    # Compare whole lists: a zip-based check stops at the shorter side and would
    # accept an empty or truncated result.
    assert list(parts) == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_rna_fasta():
    """An RNA sequence splits into one constituent per nucleotide letter."""
    seq = "ACUGACG"
    parts = constituents_of_modified_fasta(seq)
    assert parts is not None
    # Compare whole lists: a zip-based check stops at the shorter side and would
    # accept an empty or truncated result.
    assert list(parts) == list(seq)
|
||||||
|
|
||||||
|
|
||||||
|
def test_dna_fasta():
    """A DNA sequence splits into one constituent per nucleotide letter."""
    seq = "ACGACTAGCAT"
    parts = constituents_of_modified_fasta(seq)
    assert parts is not None
    # Compare whole lists: a zip-based check stops at the shorter side and would
    # accept an empty or truncated result.
    assert list(parts) == list(seq)
|
||||||
|
|
||||||
|
|
||||||
|
def test_parsing():
    """Every example input lists its own entity type among the candidates."""
    cases = [
        (example_ligands, EntityType.LIGAND),
        (example_proteins, EntityType.PROTEIN),
        (example_dna, EntityType.DNA),
        (example_rna, EntityType.RNA),
    ]
    for examples, expected_type in cases:
        for example in examples:
            assert expected_type in identify_potential_entity_types(example)
|
||||||
|
|
||||||
|
|
||||||
|
def test_fasta_parsing():
    """read_fasta returns one record per header, in file order."""
    test_string = ">foo\nRKDES\n>bar\nKEDESRRR"
    with TemporaryDirectory() as tmpdir:
        fa_file = Path(tmpdir) / "test.fasta"
        fa_file.write_text(test_string)
        records = read_fasta(fa_file)

    assert len(records) == 2
    first, second = records[0], records[1]
    assert first.header == "foo"
    assert first.sequence == "RKDES"
    assert second.header == "bar"
    assert second.sequence == "KEDESRRR"
|
||||||
|
|
||||||
|
|
||||||
|
def test_smiles_parsing():
    """A FASTA file whose only record body is a SMILES string yields one record."""
    smiles = ">smiles\nCc1cc2nc3c(=O)[nH]c(=O)nc-3n(C[C@H](O)[C@H](O)[C@H](O)CO)c2cc1C"
    with TemporaryDirectory() as tmpdir:
        target = Path(tmpdir) / "test.fasta"
        target.write_text(smiles)
        records = read_fasta(target)
    record_count = len(records)
    assert record_count == 1
|
||||||
24
tests/test_rdkit.py
Executable file
24
tests/test_rdkit.py
Executable file
@@ -0,0 +1,24 @@
|
|||||||
|
# Copyright (c) 2024 Chai Discovery, Inc.
|
||||||
|
# Licensed under the Apache License, Version 2.0.
|
||||||
|
# See the LICENSE file for details.
|
||||||
|
|
||||||
|
from chai_lab.data.sources.rdkit import RefConformerGenerator
|
||||||
|
|
||||||
|
|
||||||
|
def test_ref_conformer_from_smiles():
    """A conformer generated from SMILES has a unique name for every atom."""
    smiles = "Cc1cc2nc3c(=O)[nH]c(=O)nc-3n(C[C@H](O)[C@H](O)[C@H](O)CO)c2cc1C"

    conformer = RefConformerGenerator().generate(smiles)

    unique_names = set(conformer.atom_names)
    assert len(unique_names) == conformer.num_atoms
|
||||||
|
|
||||||
|
|
||||||
|
def test_ref_conformer_glycan_ccd():
    """Looking up the CCD code "MAN" returns a conformer with unique atom names."""
    generator = RefConformerGenerator()

    conformer = generator.get("MAN")
    assert conformer is not None

    unique_names = set(conformer.atom_names)
    assert len(unique_names) == conformer.num_atoms
|
||||||
Reference in New Issue
Block a user