Initial commit: Chai-1 protein structure prediction pipeline for WES

- Nextflow pipeline using chai1 Docker image from Harbor - S3-based input/output paths (s3://omic/eureka/chai-lab/) - GPU-accelerated protein folding with MSA support Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-16 12:55:08 +01:00
commit f971fd0e21
26 changed files with 1289 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,4 @@
 output/
 work/
 .nextflow/
 .nextflow.log*
--- a/56
+++ b/56
@@ -0,0 +1,56 @@
 # Image for running the `chai` CLI (chai_lab 0.5.2) on an NVIDIA CUDA base (Ubuntu 22.04)
 FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04
 # Locale settings, unbuffered Python output, fault-handler tracebacks, and
 # bytecode / mypy caches redirected to /tmp so they stay out of the workspace
 ENV DEBIAN_FRONTEND=noninteractive \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8 \
    PYTHONUNBUFFERED=TRUE \
    PYTHONFAULTHANDLER=1 \
    PYTHONPYCACHEPREFIX='/tmp/.chai_pycache' \
    MYPY_CACHE_DIR='/tmp/.chai_mypy_cache'
 # Install system dependencies (sorted alphabetically for easier diffs);
 # the apt lists are removed in the same layer so they do not bloat the image
 RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    curl \
    git \
    python3-pip \
    python3.10 \
    python3.10-dev \
    wget \
    && rm -rf /var/lib/apt/lists/*
 # Set working directory
 WORKDIR /workspace
 # Upgrade pip
 RUN pip3 install --no-cache-dir --upgrade pip setuptools wheel
 # Install chai_lab (which pulls in an older PyTorch), then replace that PyTorch
 # with 2.6+ from main PyPI (has CUDA support built-in).
 # All three steps share ONE layer: files removed in a later layer still occupy
 # space in the image, so uninstalling in a separate RUN would keep the old wheels.
 # The explicit "torch>=2.6" bound matches the version assertion made below.
 RUN pip3 install --no-cache-dir chai_lab==0.5.2 \
    && pip3 uninstall -y torch torchvision torchaudio \
    && pip3 install --no-cache-dir "torch>=2.6" torchvision torchaudio
 # Upgrade transformers to ensure compatibility
 RUN pip3 install --no-cache-dir --upgrade "transformers>=4.30.0"
 # Verify all installations; the build fails here if PyTorch is older than 2.6,
 # if any of the imports break, or if the `chai` CLI cannot start.
 # (The CUDA availability line is informational only and is not asserted.)
 RUN python3 -c "import torch; v=torch.__version__.split('+')[0]; print(f'PyTorch: {v}'); major,minor=map(int,v.split('.')[:2]); assert (major==2 and minor>=6) or major>2, f'PyTorch {v} is too old, need 2.6+'" && \
    python3 -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')" && \
    python3 -c "from transformers import EsmModel; print('transformers: OK')" && \
    python3 -c "import typer; print('typer: OK')" && \
    python3 -c "import chai_lab; print('chai_lab: OK')" && \
    chai --help
 # Add entry point script
 COPY entrypoint.sh /workspace/
 RUN chmod +x /workspace/entrypoint.sh
 # Set entry point (exec form, so the script receives the container arguments)
 ENTRYPOINT ["/workspace/entrypoint.sh"]
--- a/Dockerfile.chailab
+++ b/Dockerfile.chailab
@@ -0,0 +1,83 @@
 # Base image for chai-lab work: Ubuntu 22.04 plus common tooling and a uv-managed
 # virtualenv (at $VIRTUAL_ENV) holding the packages from requirements.in.
 FROM ubuntu:22.04 AS chailab-baseimage
 ENV \
  LANG=C.UTF-8 \
  LC_ALL=C.UTF-8 \
  # config for apt
  DEBIAN_FRONTEND=noninteractive \
  # default editor for git cli
  EDITOR=vim \
  # keep (large) mypy cache outside of working tree
  MYPY_CACHE_DIR='/tmp/.chai_lab_mypy_cache' \
  # always flush output from python
  PYTHONUNBUFFERED=TRUE \
  # enable fault handler (print tracebacks even after segfault or NCCL errors).
  PYTHONFAULTHANDLER=1 \
  # keep __pycache__ out of working tree
  PYTHONPYCACHEPREFIX='/tmp/.chai_lab_pycache'
 RUN --mount=type=cache,target=/var/cache/apt \
  apt-get -qq update \
  && apt-get -qq install -y \
  # common things
  gnupg ca-certificates wget git curl aria2 lsb-release tzdata \
  rsync sudo tree htop tmux unzip \
  clang \
  # for direct ssh into container
  openssh-server socat \
  # provides `fuser` command
  psmisc \
  # RDMA/InfiniBand
  libibverbs1 librdmacm1 \
  # text editors, needed by git cli
  nano vim \
  build-essential libstdc++6 \
  # python
  python3.10 python3.10-dev \
  # (run continues)
  # stop git from complaining about dubious ownership.
  && git config --global --add safe.directory "*" \
  #
  # cuda softlinking is needed in podman, but not docker
  && ln -s /lib/x86_64-linux-gnu/libcuda.so.1 /lib/x86_64-linux-gnu/libcuda.so \
  && ldconfig /lib/x86_64-linux-gnu/ \
  # setup timezone, to $TZ, ubuntu-specific
  # && ln -fs /usr/share/zoneinfo/$TZ /etc/localtime \
  && dpkg-reconfigure --frontend noninteractive tzdata \
  # change default shell to bash (has no effect during building)
  && chsh -s /bin/bash
 ENV \
  # expose CUDA libraries. Now that we don't build anything this is likely redundant
  LD_LIBRARY_PATH="/usr/local/cuda/lib64/stubs/:${LD_LIBRARY_PATH:-}" \
  # Set uv timeout to larger value to account for slow download time of nvidia-cudnn-cu12
  UV_HTTP_TIMEOUT=1000 \
  # where virtual env will be installed
  VIRTUAL_ENV=/opt/venv
 # Install dependencies in virtualenv
 COPY ./requirements.in /tmp/requirements.in
 # from https://pythonspeed.com/articles/activate-virtualenv-dockerfile/
 # a trick to have virtualenv "always activated"
 ENV PATH="$VIRTUAL_ENV/bin:$PATH"
 RUN --mount=type=cache,target=/root/.cache/uv \
  # Install uv. The installer is downloaded to a file first instead of being
  # piped into sh: the default /bin/sh has no pipefail, so a failed or
  # truncated download in `curl | sh` would not reliably stop the build.
  curl -LsSf https://astral.sh/uv/0.5.4/install.sh -o /tmp/uv-install.sh \
  && sh /tmp/uv-install.sh \
  && rm -f /tmp/uv-install.sh \
  && . $HOME/.local/bin/env \
  && uv venv --no-python-downloads $VIRTUAL_ENV \
  # this is sh, not bash, so . not source
  && . $VIRTUAL_ENV/bin/activate \
  && uv pip install uv pip -r /tmp/requirements.in
 # making sure envvars are set in all shells
 RUN echo "PATH=\"$PATH\"" >> /etc/environment \
  && echo "LANG=\"$LANG\"" >> /etc/environment \
  && echo "LC_ALL=\"$LC_ALL\"" >> /etc/environment \
  && echo "LD_LIBRARY_PATH=\"$LD_LIBRARY_PATH\"" >> /etc/environment \
  && echo "EDITOR=\"$EDITOR\"" >> /etc/environment
 # no startup command. 
--- a/202
+++ b/202
@@ -0,0 +1,202 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/
   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
   1. Definitions.
      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.
      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.
      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.
      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.
      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.
      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.
      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).
      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.
      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."
      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.
   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.
   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.
   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:
      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and
      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and
      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and
      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.
      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.
   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.
   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.
   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.
   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.
   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.
   END OF TERMS AND CONDITIONS
   APPENDIX: How to apply the Apache License to your work.
      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.
   Copyright 2024 Chai Discovery
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
       http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
--- a/README.md
+++ b/README.md
@@ -0,0 +1,169 @@
 # Chai-1
 Chai-1 is a multi-modal foundation model for molecular structure prediction that performs at the state-of-the-art across a variety of benchmarks. Chai-1 enables unified prediction of proteins, small molecules, DNA, RNA, glycosylations, and more.
 <p align="center">
    <img src='https://github.com/chaidiscovery/chai-lab/blob/main/assets/performance_barplot.png' >
 </p>
 For more information on the model's performance and capabilities, see our [technical report](https://www.biorxiv.org/content/10.1101/2024.10.10.615955).
 ## Installation
 ```shell
 # version on pypi:
 pip install chai_lab==0.5.2
 # newest available version (updates daily to test features that weren't released yet):
 pip install git+https://github.com/chaidiscovery/chai-lab.git
 ```
 This Python package requires Linux, and a GPU with CUDA and bfloat16 support. We recommend using an A100 80GB or H100 80GB or L40S 48GB chip, but A10 and A30 will work for smaller complexes. Users have also reported success with consumer-grade RTX 4090.
 ## Running the model
 ### Command line inference
 You can fold a FASTA file containing all the sequences (including modified residues, nucleotides, and ligands as SMILES strings) in a complex of interest by calling:
 ```shell
 chai fold input.fasta output_folder
 ```
 By default, the model generates five sample predictions, and uses embeddings without MSAs or templates. For additional information about how to supply MSAs and restraints to the model, see the documentation below, or run `chai fold --help`.
 For example, to run the model with MSAs (which we recommend for improved performance), pass the `--use-msa-server` flag:
 ```shell
 chai fold --use-msa-server input.fasta output_folder
 ```
 If you are hosting your own ColabFold server, additionally pass the `--msa-server-url` flag with your server's URL:
 ```shell
 chai fold --use-msa-server --msa-server-url "https://api.internalcolabserver.com" input.fasta output_folder
 ```
 We also provide additional utility functions for tasks such as MSA file format conversion; see `chai --help` for details.
 ### Pythonic inference
 The main entrypoint into the Chai-1 folding code is through the `chai_lab.chai1.run_inference` function. The following script demonstrates how to provide inputs to the model, and obtain a list of PDB files for downstream analysis:
 ```shell
 python examples/predict_structure.py
 ```
 To get the best performance, we recommend running the model with MSAs. The following script demonstrates how to provide MSAs to the model.
 ```shell
 python examples/msas/predict_with_msas.py
 ```
 For further instructions, see `"How can MSAs be provided to Chai-1?"` below.
 <details>
 <summary>Where are downloaded weights stored?</summary>
 <p markdown="1">
 By default, weights are automatically downloaded and stored in `<package_root>/downloads` (usually that's within site-packages).
 In cases where you want to control the download location (e.g. on a mounted drive in Docker), you can use the CHAI_DOWNLOADS_DIR envvar to control the download location. For example:
 ```bash
 CHAI_DOWNLOADS_DIR=/tmp/downloads python ./examples/predict_structure.py 
 ```
 </p>
 </details>
 <details>
 <summary>How can MSAs be provided to Chai-1?</summary>
 <p markdown="1">
 Chai-1 supports MSAs provided as an `aligned.pqt` file. This file format is similar to an `a3m` file, but has additional columns that provide metadata like the source database and sequence pairing keys. We provide code to convert `a3m` files to `aligned.pqt` files. For more information on how to provide MSAs to Chai-1, see [this documentation](examples/msas/README.md).
 For user convenience, we also support automatic MSA generation via the ColabFold [MMseqs2](https://github.com/soedinglab/MMseqs2) server via the `--use-msa-server` flag. As detailed in the ColabFold [repository](https://github.com/sokrypton/ColabFold), please keep in mind that this is a shared resource. Note that the results reported in our preprint and the webserver use a different MSA search strategy than MMseqs2, though we expect results to be broadly similar.
 </p>
 </details>
 <details>
 <summary>How can I customize the inputs to the model further?</summary>
 <p markdown="1">
 For more advanced use cases, we also expose the `chai_lab.chai1.run_folding_on_context`, which allows users to construct an `AllAtomFeatureContext` manually. This allows users to specify their own templates, MSAs, embeddings, and constraints, including support for specifying covalent bonds (for example, for specifying branched ligands). We currently provide examples of how to construct an embeddings context, an MSA context, restraint contexts, and covalent bonds. We will be releasing helper methods to build template contexts soon.
 </p>
 </details>
 ## ⚡ Try it online
 We provide a [web server](https://lab.chaidiscovery.com) so you can test the Chai-1 model right from your browser, without any setup.
 <p align="center">
    <img src='assets/chailab_online_screenshot.png' height=400 >
 </p>
 ## Using experimental restraints
 Chai-1 uniquely offers the ability to fold complexes with user-specified "restraints" as inputs. These restraints specify inter-chain contacts or covalent bonds at various resolutions that are used to guide Chai-1 in folding the complex. See [restraints documentation](examples/restraints/README.md) and [covalent bond documentation](examples/covalent_bonds/README.md) for details.
 <p align="center">
    <img src='assets/chailab_restraints_screenshot.png' height=400 >
 </p>
 ## 💬 Feedback
 Found a 🐞? Please report it in GitHub [issues](https://github.com/chaidiscovery/chai-lab/issues).
 We welcome community testing and feedback. To share observations about the model's performance, please reach out via [GitHub discussions](https://github.com/chaidiscovery/chai-lab/discussions), or [via email](mailto:feedback@chaidiscovery.com).
 ## 🛠️ Development
 We use [devcontainers](https://code.visualstudio.com/docs/devcontainers/containers) in development, which helps us ensure we work in identical environments. We recommend working inside a devcontainer if you want to make a contribution to this repository.
 Devcontainers work on local Linux setup, and on remote machines over an SSH connection.
 ## Status
 API is quite stable, but we recommend pinning the version in your requirements, i.e.:
 ```
 chai_lab==0.5.2
 ```
 ## Citations
 If you find Chai-1 useful in your research or use any structures produced by the model, we ask that you cite our technical report:
 ```
@article{Chai-1-Technical-Report,
 	title        = {Chai-1: Decoding the molecular interactions of life},
 	author       = {{Chai Discovery}},
 	year         = 2024,
 	journal      = {bioRxiv},
 	publisher    = {Cold Spring Harbor Laboratory},
 	doi          = {10.1101/2024.10.10.615955},
 	url          = {https://www.biorxiv.org/content/early/2024/10/11/2024.10.10.615955},
 	elocation-id = {2024.10.10.615955},
 	eprint       = {https://www.biorxiv.org/content/early/2024/10/11/2024.10.10.615955.full.pdf}
 }
 ```
 You can also access this information by running `chai citation`.
 Additionally, if you use the automatic MMseqs2 MSA generation described above, please also cite:
 ```
@article{mirdita2022colabfold,
  title={ColabFold: making protein folding accessible to all},
  author={Mirdita, Milot and Sch{\"u}tze, Konstantin and Moriwaki, Yoshitaka and Heo, Lim and Ovchinnikov, Sergey and Steinegger, Martin},
  journal={Nature methods},
  year={2022},
 }
 ```
 ## Licence 
 Chai-1 is released under an Apache 2.0 License (both code and model weights), which means it can be used for both academic and commercial purposes, including for drug discovery.
 See [LICENSE](LICENSE).
 To discuss partnership and access to new internal capabilities, reach us [via email](mailto:partnerships@chaidiscovery.com).
--- a/entrypoint.sh
+++ b/entrypoint.sh
@@ -0,0 +1,10 @@
 #!/bin/bash
 # Container entrypoint: verify the NVIDIA tooling is present, then run the
 # command that was passed to the container.
 # NOTE: this only checks that the `nvidia-smi` binary is on PATH; it does not
 # query whether a GPU device is actually usable.
 if ! command -v nvidia-smi > /dev/null 2>&1; then
    # Diagnostics go to stderr so they are not mixed into captured stdout
    echo "Error: NVIDIA GPU is required but not found" >&2
    exit 1
 fi
 # Replace this shell with the requested command, forwarding all arguments unchanged
 exec "$@"
--- a/input/.nextflow.log
+++ b/input/.nextflow.log
@@ -0,0 +1,14 @@
 Jan-27 12:03:24.006 [main] DEBUG nextflow.cli.Launcher - $> nextflow run main.nf
 Jan-27 12:03:24.041 [main] DEBUG nextflow.cli.CmdRun - N E X T F L O W  ~  version 24.10.3
 Jan-27 12:03:24.052 [main] DEBUG nextflow.plugin.PluginsFacade - Setting up plugin manager > mode=prod; embedded=false; plugins-dir=/root/.nextflow/plugins; core-plugins: nf-amazon@2.9.2,nf-azure@1.10.2,nf-cloudcache@0.4.2,nf-codecommit@0.2.2,nf-console@1.1.4,nf-google@1.15.3,nf-tower@1.9.3,nf-wave@1.7.4
 Jan-27 12:03:24.067 [main] INFO  o.pf4j.DefaultPluginStatusProvider - Enabled plugins: []
 Jan-27 12:03:24.067 [main] INFO  o.pf4j.DefaultPluginStatusProvider - Disabled plugins: []
 Jan-27 12:03:24.069 [main] INFO  org.pf4j.DefaultPluginManager - PF4J version 3.12.0 in 'deployment' mode
 Jan-27 12:03:24.074 [main] INFO  org.pf4j.AbstractPluginManager - No plugins
 Jan-27 12:03:24.083 [main] DEBUG nextflow.scm.ProviderConfig - Using SCM config path: /root/.nextflow/scm
 Jan-27 12:03:24.089 [main] DEBUG nextflow.cli.Launcher - Operation aborted
 nextflow.exception.AbortOperationException: Cannot find script file: main.nf
 	at nextflow.cli.CmdRun.getScriptFile(CmdRun.groovy:536)
 	at nextflow.cli.CmdRun.run(CmdRun.groovy:325)
 	at nextflow.cli.Launcher.run(Launcher.groovy:503)
 	at nextflow.cli.Launcher.main(Launcher.groovy:658)
--- a/input/growth_hormone_complex.fasta
+++ b/input/growth_hormone_complex.fasta
@@ -0,0 +1,4 @@
 >protein|name=growth-hormone
 FPTIPLSRLFDNAMLRAHRLHQLAFDTYQEFEEAYIPKEQKYSFLQNPQTSLCFSESIPTPSNREETQQKSNLELLRISLLLIQSWLEPVQFLRSVFANSLVYGASDSNVYDLLKDLEEGIQTLMGRLEDGSPRTGQIFKQTYSKFDTNSHNDDALLKNYGLLYCFRKDMDKVETFLRIVQCRSVEGSCGF
 >protein|name=growth-hormone-receptor
 FSGSEATPGPLIFKWNHHSVFFDGYTSGGLQRFVHLHFGVSNKQLISICRKRANSKEPSSPIVPVPVGGQLLVDCSFRKLSGEGLHTYYYAAGQEEKTSDRSHRHGPGVGSCFRKTFEDGVYQCTARNEGYAYGHSITKSHRTSHQVCSRDGVPVLTENQAHLPEDFKEFTLRLKQKRQLLERGSPAMQDTFPAPSPETTVQEITSQHPGGTESPTVLRVKTEKSHQVYAGLSKYFHYAGQRGLRVLYLHKGESLARGTVTVPVKRDRGVLADRMVEAVDVQRWVGYLRNVYLTGQK
--- a/input/growth_hormone_restraints.txt
+++ b/input/growth_hormone_restraints.txt
@@ -0,0 +1,12 @@
 # Restraints for growth hormone complex based on known binding interface
 # Format: chain1 resid1 chain2 resid2 distance_lower distance_upper confidence
 # Key interface contacts between growth hormone and its receptor
 A 14 B 43 4.0 8.0 0.8
 A 167 B 57 4.0 8.0 0.8
 A 171 B 62 4.0 8.0 0.8
 A 175 B 102 4.0 8.0 0.8
 A 178 B 166 4.0 8.0 0.8
 # Additional stabilizing contacts
 A 65 B 150 4.0 9.0 0.7
 A 164 B 191 4.0 9.0 0.7
--- a/input/insulin_complex.fasta
+++ b/input/insulin_complex.fasta
@@ -0,0 +1,6 @@
 >protein|name=insulin-a-chain
 GIVEQCCTSICSLYQLENYCN
 >protein|name=insulin-b-chain
 FVNQHLCGSHLVEALYLVCGERGFFYTPKT
 >protein|name=insulin-receptor-l1
 PQAFVNWLRGGSQQVEVFVSDLPKLRNLLQGEELLGRGSFGVVYEGNARDIIKGEAETRVAVKTVNESASLRERIEFLNEASVMKGFTCHHVVRLLGVVSKGQPTLVVMELMAHGDLKSYLRSLRPEAENNPGRPPPTLQEMIQMAAEIADGMAYLNAKKFVHRDLAARNCMVAHDFTVKIGDFGMTRDIYETDYYRKGGKGLLPVRWMAPESLKDGVFTTSSDMWSFGVVLWEITSLAEQPYQGLSNEQVLKFVMDGGYLDQPDNCPERVTDLMRMCWQFNPKMRPTFLEIVNLLKDDLHPSFPEVSFFHSEENKAPESEELEMEFEDMENVPLDRSSHCQREEAGGRDGGSSLGFKRSYEEHIPYTHMNGGKKNGRILTLPRSNPS
--- a/input/insulin_restraints.txt
+++ b/input/insulin_restraints.txt
@@ -0,0 +1,10 @@
 # Restraints for insulin complex based on known interaction sites
 # Format: chain1 resid1 chain2 resid2 distance_lower distance_upper confidence
 # Insulin A chain to B chain contacts (disulfide bonds)
 A 7 B 7 3.0 5.0 0.9
 A 20 B 19 3.0 5.0 0.9
 # Insulin (A+B) to receptor contacts
 A 12 C 155 4.0 8.0 0.8
 B 24 C 210 4.0 8.0 0.8
 B 25 C 215 4.0 8.0 0.8
--- a/main.nf
+++ b/main.nf
@@ -0,0 +1,45 @@
 #!/usr/bin/env nextflow
 nextflow.enable.dsl=2
 // Pipeline parameters and their default values.
 // Folder scanned by the workflow for `*.fasta` input files
 params.input_dir = 's3://omic/eureka/chai-lab/input'
 // Folder the CHAI1 process publishes its result directories to
 params.outdir = 's3://omic/eureka/chai-lab/output'
 // When true, `chai fold` is called with --use-msa-server
 params.use_msa = true
 // Value passed to --msa-server-url (only used when use_msa is true)
 params.msa_server = 'https://api.colabfold.com'
 // Value passed to --num-diffn-samples
 params.num_samples = 5
 // Runs `chai fold` on a single FASTA file inside the chai1 container.
 //   input : one FASTA file (staged by copy)
 //   output: a directory named after the FASTA file without its `.fasta`
 //           extension, published to params.outdir
 // maxForks 1 means only one CHAI1 task runs at a time.
 process CHAI1 {
    container 'harbor.cluster.omic.ai/omic/chai1:latest'
    publishDir params.outdir, mode: 'copy'
    stageInMode 'copy'
    maxForks 1
    input:
        path fasta
    output:
        // baseName strips only the last extension, exactly like the
        // `basename <file> .fasta` call in the script. simpleName (used before)
        // strips every extension, so an input such as `a.b.fasta` declared
        // `a` while the script created `a.b`, failing with a missing output.
        path "${fasta.baseName}", emit: output_dir
    script:
        """
        OUTPUT_DIR=\$(basename ${fasta} .fasta)
        mkdir -p "\$OUTPUT_DIR"
        # Construct MSA parameters
        MSA_OPTIONS=""
        if ${params.use_msa}; then
            MSA_OPTIONS="--use-msa-server --msa-server-url ${params.msa_server}"
        fi
        # Run CHAI1 (MSA_OPTIONS is intentionally unquoted so it splits into separate flags)
        chai fold \\
            \$MSA_OPTIONS \\
            --num-diffn-samples ${params.num_samples} \\
            ${fasta} \\
            "\$OUTPUT_DIR"
        """
 }
 // Entry workflow: one CHAI1 task per FASTA file found directly under params.input_dir.
 workflow {
    def fasta_glob = params.input_dir + '/*.fasta'
    CHAI1(Channel.fromPath(fasta_glob))
 }
--- a/nextflow.config
+++ b/nextflow.config
@@ -0,0 +1,11 @@
 // Container runtime: execute processes with Docker.
 docker.enabled = true
 docker.temp = 'auto'
 // S3 client: custom endpoint, addressed with path-style bucket URLs.
 aws.client.endpoint = 'https://s3.cluster.omic.ai'
 aws.client.s3PathStyleAccess = true
--- a/params.json
+++ b/params.json
@@ -0,0 +1,83 @@
 {
    "params": {
        "input_dir": {
            "type": "folder",
            "description": "Directory containing FASTA files and optional restraints",
            "default": "s3://omic/eureka/chai-lab/input",
            "required": true,
            "pipeline_io": "input",
            "var_name": "params.input_dir",
            "examples": [
                "s3://omic/eureka/chai-lab/input"
            ],
            "pattern": ".*",
            "enum": [],
            "validation": {},
            "notes": "Directory containing FASTA files (with *_complex.fasta suffix) and optional restraints files (with *_restraints.txt suffix)"
        },
        "outdir": {
            "type": "folder",
            "description": "Directory for chai1 prediction results",
            "default": "s3://omic/eureka/chai-lab/output",
            "required": true,
            "pipeline_io": "output",
            "var_name": "params.outdir",
            "examples": [
                "s3://omic/eureka/chai-lab/output"
            ],
            "pattern": ".*",
            "enum": [],
            "validation": {},
            "notes": "Directory where prediction results and log files will be stored. Will be created if it doesn't exist."
        },
        "use_msa": {
            "type": "boolean",
            "description": "Enable/disable MSA server usage",
            "default": true,
            "required": false,
            "pipeline_io": "parameter",
            "var_name": "params.use_msa",
            "examples": [
                true,
                false
            ],
            "enum": [true, false],
            "validation": {},
            "notes": "Whether to use MSA server for improved predictions"
        },
        "msa_server": {
            "type": "string",
            "description": "MSA server URL",
            "default": "https://api.colabfold.com",
            "required": false,
            "pipeline_io": "parameter",
            "var_name": "params.msa_server",
            "examples": [
                "https://api.colabfold.com"
            ],
            "pattern": "^https?://.*",
            "enum": [],
            "validation": {},
            "notes": "URL of the MSA server to use when use_msa is enabled"
        },
        "num_samples": {
            "type": "integer",
            "description": "Number of diffusion samples to generate",
            "default": 5,
            "required": false,
            "pipeline_io": "parameter",
            "var_name": "params.num_samples",
            "examples": [
                5,
                10
            ],
            "pattern": "^[1-9]\\d*$",
            "enum": [],
            "validation": {
                "min": 1,
                "max": 50
            },
            "notes": "Number of structure samples to generate using diffusion"
        }
    }
 }
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -0,0 +1,74 @@
 # important: install in editable mode
 [build-system]
 requires = [
    "hatchling>=1.20",        # build backend
    "hatch-requirements-txt", # plugin, to parse requirements.txt
 ]
 build-backend = "hatchling.build"
 [project]
 name = "chai_lab"
 description = "Chai Discovery tools for AI + protein research."
 requires-python = ">=3.10"
 authors = [{ name = "Chai Discovery" }]
 # both are defined below: version via [tool.hatch.version], dependencies via the requirements_txt metadata hook
 dynamic = ["version", "dependencies"]
 [tool.hatch.version]
 path = "chai_lab/__init__.py"
 [tool.hatch.metadata.hooks.requirements_txt]
 files = ["requirements.in"]
 [tool.hatch.metadata]
 allow-direct-references = true
 [tool.mypy]
 check_untyped_defs = true
 # Ignore missing imports for packages with missing type stubs
 [[tool.mypy.overrides]]
 module = [
    "anarci.*",
    "fsspec.*",
    "google.*",
    "joblib.*",
    "needletail.*",
    "numba.*",
    "pyximport.*",
    "rdkit.*",
    "scipy.*",
    "seaborn.*",
    "sh.*",
    "tmtools.*",
    "botocore.*",
    "s3fs.*",
    "biotite.*",
    "DockQ.*",
    "boto3.*",
    "transformers.*",
    "modelcif.*",
    "ihm.*",
 ]
 ignore_missing_imports = true
 [tool.pytest.ini_options]
 cache_dir = "/tmp/.common_pytest_cache"
 [tool.hatch.build.targets.sdist]
 exclude = [
    "/.devcontainer",
    "/.github",
    "/.idea",
    "/.vscode",
    "/.pytest_cache",
    "/assets",
    "/downloads",
    "/outputs",
 ]
 [tool.hatch.build.targets.wheel]
 # should use packages from sdist section
 [project.scripts]
 chai = "chai_lab.main:cli"
--- a/requirements.in
+++ b/requirements.in
@@ -0,0 +1,51 @@
 # dev-deps, still placed in the same requirements file
 ruff==0.6.3 # in sync with pre-commit-hook
 mypy
 pytest
 pre-commit
 # types/stubs are required by mypy
 pandas-stubs
 types-pyyaml
 types-tqdm
 typing-extensions
 types-requests
 # CLI, administrator tools
 typer~=0.12        # CLI generator
 # pydantic~=2.5      # serialization/deserialization of configs
 # notebooks, plotting
 ipykernel~=6.27    # needed by vs code to run notebooks in devcontainer
 # seaborn
 matplotlib
 # misc
 tqdm~=4.66
 # data import/export, application-specific
 gemmi~=0.6.3       # pdb/mmcif parsing
 rdkit==2023.9.5    # parsing of ligands. 2023.9.6 has broken type stubs
 biopython>=1.83    # parsing, data access
 antipickle==0.2.0  # save/load heterogeneous python structures
 tmtools>=0.0.3     # Python bindings for the TM-align algorithm
 modelcif>=1.0      # mmcif writing; confirmed to work with 1.0, the latest release at the time of writing
 # The following optional dependencies are commented out for the PyPI release
 # dockq metric for comparing predicted pdbs and ground truth pdbs
 # dockq @ git+https://github.com/bjornwallner/DockQ.git@v2.1.1
 # pip-compatible minimized version of anarci
 # anarci @ git+https://github.com/arogozhnikov/microANARCI@d81823395d0c3532d6e033d80b036b4aa4a4565e
 # computing, dl
 numpy~=1.21
 pandas[parquet,gcp,aws]~=2.1 
 pandera
 numba>=0.59
 # polars              
 einops~=0.8
 jaxtyping>=0.2.25   # versions <0.2.25 do not easily support runtime typechecking
 beartype>=0.18      # compatible typechecker to use with jaxtyping
 # do not use 2.2 because https://github.com/pytorch/pytorch/issues/122385
 torch~=2.3.1
 transformers~=4.44  # for esm inference
--- a/ruff.toml
+++ b/ruff.toml
@@ -0,0 +1,12 @@
 # move ruff cache outside of worktree
 cache-dir = "/tmp/.ruff_chai_cache"
 [lint]
 extend-select = ["I"]
 # jaxtyping requires disabling the following two errors
 # https://docs.kidger.site/jaxtyping/faq/#flake8-or-ruff-are-throwing-an-error
 ignore = ["F821", "F722"]
 [lint.isort]
 known-first-party = ["chai", "chai_lab"]
--- a/tests/init.py
+++ b/tests/init.py
@@ -0,0 +1,4 @@
 # Copyright (c) 2024 Chai Discovery, Inc.
 # Licensed under the Apache License, Version 2.0.
 # See the LICENSE file for details.
--- a/tests/example_inputs.py
+++ b/tests/example_inputs.py
@@ -0,0 +1,37 @@
 # Copyright (c) 2024 Chai Discovery, Inc.
 # Licensed under the Apache License, Version 2.0.
 # See the LICENSE file for details.
 # Ligand inputs written as SMILES strings, from single atoms up to
 # metal-containing macrocycles, followed by a few bare ions.
 example_ligands = [
    "C",
    "O",
    "C(C1C(C(C(C(O1)O)O)O)O)O",
    "[O-]S(=O)(=O)[O-]",
    "CC1=C(C(CCC1)(C)C)/C=C/C(=C/C=C/C(=C/C=O)/C)/C",
    # Raw string: this SMILES contains a backslash bond marker ("\C"), which is
    # an invalid escape sequence in a normal string literal.
    r"CCC1=C(c2cc3c(c(c4n3[Mg]56[n+]2c1cc7n5c8c(c9[n+]6c(c4)C(C9CCC(=O)OC/C=C(\C)/CCC[C@H](C)CCC[C@H](C)CCCC(C)C)C)[C@H](C(=O)c8c7C)C(=O)OC)C)C=C)C=O",
    r"C=CC1=C(C)/C2=C/c3c(C)c(CCC(=O)O)c4n3[Fe@TB16]35<-N2=C1/C=c1/c(C)c(C=C)/c(n13)=C/C1=N->5/C(=C\4)C(CCC(=O)O)=C1C",
    # different ions
    "[Mg+2]",
    "[Na+]",
    "[Cl-]",
 ]
 # Protein inputs: a plain one-letter sequence, a sequence written with
 # parenthesised multi-character residue codes, and a short one containing "X".
 example_proteins = [
    "AGSHSMRYFSTSVSRPGRGEPRFIAVGYVDDTQFVR",
    "(KCJ)(SEP)(PPN)(B3S)(BAL)(PPN)K(NH2)",
    "XDHPX",
 ]
 # RNA inputs over the A/G/U/C alphabet. Note that "AAAAAA" also appears in
 # example_dna, so it is ambiguous between the two.
 example_rna = [
    "AGUGGCUA",
    "AAAAAA",
    "AGUC",
 ]
 # DNA inputs over the A/G/T/C alphabet. Note that "AAAAAA" also appears in
 # example_rna, so it is ambiguous between the two.
 example_dna = [
    "AGTGGCTA",
    "AAAAAA",
    "AGTC",
 ]
--- a/tests/test_cif_utils.py
+++ b/tests/test_cif_utils.py
@@ -0,0 +1,24 @@
 # Copyright (c) 2024 Chai Discovery, Inc.
 # Licensed under the Apache License, Version 2.0.
 # See the LICENSE file for details.
 import pytest
 from chai_lab.data.io.cif_utils import get_chain_letter
 def test_get_chain_letter():
    """Chain indices start at 1 and map to A-Z, then a-z, then two-letter codes."""
    # Index 0 is rejected with an assertion.
    with pytest.raises(AssertionError):
        get_chain_letter(0)
    # There are 26 + 26 = 52 one-letter codes, followed by 52 * 52 two-letter
    # codes; the final entry is the last two-letter code.
    expected_by_index = {
        1: "A",
        26: "Z",
        27: "a",
        52: "z",
        53: "AA",
        54: "AB",
        52 * 52 + 52: "zz",
    }
    for index, letter in expected_by_index.items():
        assert get_chain_letter(index) == letter
--- a/tests/test_glycans.py
+++ b/tests/test_glycans.py
@@ -0,0 +1,108 @@
 # Copyright (c) 2024 Chai Discovery, Inc.
 # Licensed under the Apache License, Version 2.0.
 # See the LICENSE file for details.
 from collections import Counter
 from pathlib import Path
 from tempfile import TemporaryDirectory
 import pytest
 from chai_lab.chai1 import make_all_atom_feature_context
 from chai_lab.data.parsing.glycans import _glycan_string_to_sugars_and_bonds
@pytest.mark.parametrize("ccd_code", ["MAN", "99K", "FUC"])
 def test_parsing_ccd_codes(ccd_code: str):
    """Test that various single CCD codes are parsed correctly."""
    res, _ = _glycan_string_to_sugars_and_bonds(ccd_code)
    assert len(res) == 1
 def test_complex_parsing():
    """Branched glycan: five sugars joined by four bonds, checked in parse order."""
    glycan = "MAN(6-1 FUC)(4-1 MAN(6-1 MAN(6-1 MAN)))".replace(" ", "")
    sugars, bonds = _glycan_string_to_sugars_and_bonds(glycan)
    assert len(sugars) == 5
    # Unpacking requires exactly four bonds.
    first, second, third, fourth = bonds
    # Each entry: bond, (src_sugar_index, dst_sugar_index, src_atom, dst_atom)
    expectations = [
        (first, (0, 1, 6, 1)),
        (second, (0, 2, 4, 1)),
        (third, (2, 3, 6, 1)),
        (fourth, (3, 4, 6, 1)),
    ]
    for bond, (src_sugar, dst_sugar, src_atom, dst_atom) in expectations:
        assert bond.src_sugar_index == src_sugar
        assert bond.dst_sugar_index == dst_sugar
        assert bond.src_atom == src_atom
        assert bond.dst_atom == dst_atom
 def test_complex_parsing_2():
    """Deeper branching: nine sugars, with each bond checked by its sugar indices."""
    spaced_glycan = "MAN(4-1 FUC(4-1 MAN)(6-1 FUC(4-1 MAN)))(6-1 MAN(6-1 MAN(4-1 MAN)(6-1 FUC)))"
    glycan = spaced_glycan.replace(" ", "")
    sugars, bonds = _glycan_string_to_sugars_and_bonds(glycan)
    assert len(sugars) == 9
    # (src_sugar_index, dst_sugar_index) for every bond, in the order returned
    expected_pairs = [(0, 1), (1, 2), (1, 3), (3, 4), (0, 5), (5, 6), (6, 7), (6, 8)]
    # strict zip: a missing or extra bond raises instead of being ignored
    for bond, (src_index, dst_index) in zip(bonds, expected_pairs, strict=True):
        assert bond.src_sugar_index == src_index
        assert bond.dst_sugar_index == dst_index
 def test_glycan_tokenization_with_bond():
    """Test that tokenization works, and that atoms are dropped as expected."""
    # FASTA-style record: a "glycan" entry named "foo" made of two bonded NAG units.
    glycan = ">glycan|foo\nNAG(4-1 NAG)\n"
    # The feature context is built while the temporary directory still exists;
    # the assertions below only inspect the returned object.
    with TemporaryDirectory() as tmpdir:
        tmp_path = Path(tmpdir)
        fasta_file = tmp_path / "input.fasta"
        fasta_file.write_text(glycan)
        output_dir = tmp_path / "out"
        feature_context = make_all_atom_feature_context(
            fasta_file,
            output_dir=output_dir,
            use_esm_embeddings=False,  # Just a test; no need
        )
    # Each NAG component is C8 H15 N O6 -> 8 + 1 + 6 = 15 heavy atoms
    # The bond between them displaces one oxygen, leaving 2 * 15 - 1 = 29 atoms
    assert feature_context.structure_context.atom_exists_mask.sum() == 29
    # All 30 atoms are constructed up front; the leaving atom is then masked
    # out, so the mask still has 30 entries.
    assert feature_context.structure_context.atom_exists_mask.numel() == 30
    # Count the elements of the atoms that remain after masking.
    elements = Counter(
        feature_context.structure_context.atom_ref_element[
            feature_context.structure_context.atom_exists_mask
        ].tolist()
    )
    assert elements[6] == 16  # 6 = Carbon
    assert elements[7] == 2  # 7 = Nitrogen
    assert elements[8] == 11  # 8 = Oxygen
    # Single bond feature between O and C
    left, right = feature_context.structure_context.atom_covalent_bond_indices
    assert left.numel() == right.numel() == 1
    # Compared as a set, so either end of the bond may be the oxygen.
    bond_elements = set(
        [
            feature_context.structure_context.atom_ref_element[left].item(),
            feature_context.structure_context.atom_ref_element[right].item(),
        ]
    )
    assert bond_elements == {8, 6}
--- a/tests/test_inference_dataset.py
+++ b/tests/test_inference_dataset.py
@@ -0,0 +1,106 @@
 # Copyright (c) 2024 Chai Discovery, Inc.
 # Licensed under the Apache License, Version 2.0.
 # See the LICENSE file for details.
 """
 Tests for inference dataset.
 """
 import pytest
 import torch
 from chai_lab.data.dataset.inference_dataset import Input, load_chains_from_raw
 from chai_lab.data.dataset.structure.all_atom_residue_tokenizer import (
    AllAtomResidueTokenizer,
 )
 from chai_lab.data.dataset.structure.all_atom_structure_context import (
    AllAtomStructureContext,
 )
 from chai_lab.data.dataset.structure.chain import Chain
 from chai_lab.data.parsing.structure.entity_type import EntityType
 from chai_lab.data.sources.rdkit import RefConformerGenerator
@pytest.fixture
 def tokenizer() -> AllAtomResidueTokenizer:
    return AllAtomResidueTokenizer(RefConformerGenerator())
 def test_malformed_smiles(tokenizer: AllAtomResidueTokenizer):
    """A ligand with malformed SMILES is dropped; the other chains survive."""
    protein_type = EntityType.PROTEIN.value
    ligand_type = EntityType.LIGAND.value
    # The "Zn" ligand is malformed (the well-formed spelling is [Zn+2]).
    raw_inputs = [
        Input("RKDESES", entity_type=protein_type, entity_name="foo"),
        Input("Zn", entity_type=ligand_type, entity_name="bar"),
        Input("RKEEE", entity_type=protein_type, entity_name="baz"),
        Input("EEEEEEEEEEEE", entity_type=protein_type, entity_name="boz"),
    ]
    chains = load_chains_from_raw(raw_inputs, identifier="test", tokenizer=tokenizer)
    # Four inputs, one dropped.
    assert len(chains) == 3
    for chain in chains:
        # NOTE: comparing token count with sequence length is only valid because
        # none of these residues are tokenized per-atom. It confirms that each
        # chain's entity data and structure context are paired correctly.
        token_count = chain.structure_context.num_tokens
        sequence_length = len(chain.entity_data.full_sequence)
        assert token_count == sequence_length
 def test_ions_parsing(tokenizer: AllAtomResidueTokenizer):
    """An ion given as a SMILES string keeps its charge and element."""
    magnesium = Input("[Mg+2]", entity_type=EntityType.LIGAND.value, entity_name="foo")
    chains = load_chains_from_raw([magnesium], identifier="foo", tokenizer=tokenizer)
    assert len(chains) == 1
    context = chains[0].structure_context
    assert context.num_atoms == 1
    assert context.atom_ref_charge == 2
    assert context.atom_ref_element.item() == 12  # 12 = Magnesium
 def test_protein_with_smiles(tokenizer: AllAtomResidueTokenizer):
    """Complex with multiple duplicated protein chains and SMILES ligands."""
    # Based on https://www.rcsb.org/structure/1AFS
    seq = "MDSISLRVALNDGNFIPVLGFGTTVPEKVAKDEVIKATKIAIDNGFRHFDSAYLYEVEEEVGQAIRSKIEDGTVKREDIFYTSKLWSTFHRPELVRTCLEKTLKSTQLDYVDLYIIHFPMALQPGDIFFPRDEHGKLLFETVDICDTWEAMEKCKDAGLAKSIGVSNFNCRQLERILNKPGLKYKPVCNQVECHLYLNQSKMLDYCKSKDIILVSYCTLGSSRDKTWVDQKSPVLLDDPVLCAIAKKYKQTPALVALRYQLQRGVVPLIRSFNAKRIKELTQVFEFQLASEDMKALDGLNRNFRYNNAKYFDDHPNHPFTDEN"
    nap = "NC(=O)c1ccc[n+](c1)[CH]2O[CH](CO[P]([O-])(=O)O[P](O)(=O)OC[CH]3O[CH]([CH](O[P](O)(O)=O)[CH]3O)n4cnc5c(N)ncnc45)[CH](O)[CH]2O"
    tes = "O=C4C=C3C(C2CCC1(C(CCC1O)C2CC3)C)(C)CC4"
    # Each distinct molecule is supplied twice, giving six input chains.
    inputs = [
        Input(seq, EntityType.PROTEIN.value, entity_name="A"),
        Input(seq, EntityType.PROTEIN.value, entity_name="B"),
        Input(nap, EntityType.LIGAND.value, entity_name="C"),
        Input(nap, EntityType.LIGAND.value, entity_name="D"),
        Input(tes, EntityType.LIGAND.value, entity_name="E"),
        Input(tes, EntityType.LIGAND.value, entity_name="F"),
    ]
    chains: list[Chain] = load_chains_from_raw(inputs, tokenizer=tokenizer)
    assert len(chains) == len(inputs)
    example = AllAtomStructureContext.merge(
        [chain.structure_context for chain in chains]
    )
    # Should be 3 distinct entities (1 protein, 2 ligands) spread over 6 chains
    assert example.token_entity_id.unique().numel() == 3
    assert example.token_asym_id.unique().numel() == 6
    # Check protein chains
    prot_entity_ids = example.token_entity_id[
        example.token_entity_type == EntityType.PROTEIN.value
    ]
    assert torch.unique(prot_entity_ids).numel() == 1
    prot_sym_ids = example.token_sym_id[
        example.token_entity_type == EntityType.PROTEIN.value
    ]
    assert torch.unique(prot_sym_ids).numel() == 2  # Two copies of this chain
    # Check ligand chains
    lig_entity_ids = example.token_entity_id[
        example.token_entity_type == EntityType.LIGAND.value
    ]
    assert torch.unique(lig_entity_ids).numel() == 2
    lig_sym_ids = example.token_sym_id[
        example.token_entity_type == EntityType.LIGAND.value
    ]
    assert torch.unique(lig_sym_ids).numel() == 2  # Two copies of each ligand
--- a/tests/test_msa_a3m_tokenization.py
+++ b/tests/test_msa_a3m_tokenization.py
@@ -0,0 +1,36 @@
 # Copyright (c) 2024 Chai Discovery, Inc.
 # Licensed under the Apache License, Version 2.0.
 # See the LICENSE file for details.
 """
 Test for tokenization
 """
 import numpy as np
 from chai_lab.data.parsing.msas.a3m import tokenize_sequences_to_arrays
 from chai_lab.data.residue_constants import residue_types_with_nucleotides_order
 def test_tokenization_basic():
    """A single plain sequence tokenizes to one row of residue indices."""
    query = "RKDES"
    tokens, deletions = tokenize_sequences_to_arrays([query])
    assert tokens.shape == deletions.shape == (1, 5)
    # Expected indices come straight from the residue-order lookup table.
    expected = np.array([residue_types_with_nucleotides_order[aa] for aa in query])
    assert np.all(tokens == expected)
 def test_tokenization_with_insertion():
    """Lower-case insertions are not tokenized; they are counted as deletions."""
    plain = "RKDES"
    with_insertion = "RKrkdesDES"
    tokens, deletions = tokenize_sequences_to_arrays([plain, with_insertion])
    assert tokens.shape == deletions.shape == (2, 5)
    plain_row, inserted_row = tokens[0], tokens[1]
    assert np.all(plain_row == inserted_row)
    # The five lower-case letters are recorded at row 1, column 2, and nowhere else.
    assert deletions.sum() == 5
    assert deletions[1, 2] == 5
--- a/tests/test_msa_preprocess.py
+++ b/tests/test_msa_preprocess.py
@@ -0,0 +1,25 @@
 # Copyright (c) 2024 Chai Discovery, Inc.
 # Licensed under the Apache License, Version 2.0.
 # See the LICENSE file for details.
 import torch
 from chai_lab.data.dataset.msas.msa_context import NO_PAIRING_KEY
 from chai_lab.data.dataset.msas.preprocess import _UKEY_FOR_QUERY, prepair_ukey
 def test_prepair_ukey():
    """Check the keys and row indices from prepair_ukey for forward and reversed edit distances."""
    keys = torch.tensor([1, 1, 2, 1, NO_PAIRING_KEY, 2, 3])
    edit_dists = torch.arange(len(keys))

    def rows_with_pairing_key():
        # Indices of every row whose key is not NO_PAIRING_KEY.
        return {row for row, key in enumerate(keys.tolist()) if key != NO_PAIRING_KEY}

    paired = prepair_ukey(keys, edit_dists)
    forward_order = [_UKEY_FOR_QUERY, (1, 0), (2, 0), (1, 1), (2, 1), (3, 0)]
    assert list(paired) == forward_order
    assert set(paired.values()) == rows_with_pairing_key()
    # Reverse the edit distances
    reversed_dists = torch.tensor(edit_dists.tolist()[::-1])
    paired = prepair_ukey(keys, reversed_dists)
    reversed_order = [_UKEY_FOR_QUERY, (1, 1), (2, 1), (1, 0), (2, 0), (3, 0)]
    assert list(paired) == reversed_order
    assert set(paired.values()) == rows_with_pairing_key()
--- a/tests/test_parsing.py
+++ b/tests/test_parsing.py
@@ -0,0 +1,79 @@
 # Copyright (c) 2024 Chai Discovery, Inc.
 # Licensed under the Apache License, Version 2.0.
 # See the LICENSE file for details.
 from pathlib import Path
 from tempfile import TemporaryDirectory
 from chai_lab.data.parsing.fasta import read_fasta
 from chai_lab.data.parsing.input_validation import (
    constituents_of_modified_fasta,
    identify_potential_entity_types,
 )
 from chai_lab.data.parsing.structure.entity_type import EntityType
 from .example_inputs import example_dna, example_ligands, example_proteins, example_rna
 def test_simple_protein_fasta():
    """A plain protein sequence splits into one constituent per letter."""
    parts = constituents_of_modified_fasta("RKDES")
    assert parts is not None
    expected = ["R", "K", "D", "E", "S"]
    # strict=True: a bare zip() stops at the shorter input, so a truncated or
    # empty result would otherwise pass this check vacuously.
    assert all(x == y for x, y in zip(parts, expected, strict=True))
 def test_modified_protein_fasta():
    """Parenthesised multi-character codes and single letters are split into separate constituents."""
    parts = constituents_of_modified_fasta("(KCJ)(SEP)(PPN)(B3S)(BAL)(PPN)KX(NH2)")
    assert parts is not None
    expected = ["KCJ", "SEP", "PPN", "B3S", "BAL", "PPN", "K", "X", "NH2"]
    # strict=True: a bare zip() stops at the shorter input, so a truncated or
    # empty result would otherwise pass this check vacuously.
    assert all(x == y for x, y in zip(parts, expected, strict=True))
 def test_rna_fasta():
    """An RNA sequence splits into one constituent per nucleotide letter."""
    seq = "ACUGACG"
    parts = constituents_of_modified_fasta(seq)
    assert parts is not None
    # strict=True: a bare zip() stops at the shorter input, so a truncated or
    # empty result would otherwise pass this check vacuously.
    assert all(x == y for x, y in zip(parts, seq, strict=True))
 def test_dna_fasta():
    """A DNA sequence splits into one constituent per nucleotide letter."""
    seq = "ACGACTAGCAT"
    parts = constituents_of_modified_fasta(seq)
    assert parts is not None
    # strict=True: a bare zip() stops at the shorter input, so a truncated or
    # empty result would otherwise pass this check vacuously.
    assert all(x == y for x, y in zip(parts, seq, strict=True))
 def test_parsing():
    """Each example input must list its own entity type among the potential types."""
    cases = [
        (example_ligands, EntityType.LIGAND),
        (example_proteins, EntityType.PROTEIN),
        (example_dna, EntityType.DNA),
        (example_rna, EntityType.RNA),
    ]
    for examples, entity_type in cases:
        for example in examples:
            # Membership, not equality: an input may match more than one type.
            assert entity_type in identify_potential_entity_types(example)
 def test_fasta_parsing():
    """A two-record FASTA file is read back with its headers and sequences."""
    fasta_text = """>foo\nRKDES\n>bar\nKEDESRRR"""
    with TemporaryDirectory() as tmpdir:
        fasta_path = Path(tmpdir) / "test.fasta"
        fasta_path.write_text(fasta_text)
        records = read_fasta(fasta_path)
    assert len(records) == 2
    first, second = records[0], records[1]
    assert first.header == "foo"
    assert first.sequence == "RKDES"
    assert second.header == "bar"
    assert second.sequence == "KEDESRRR"
 def test_smiles_parsing():
    """A FASTA record whose body is a SMILES string is read as a single record."""
    fasta_text = ">smiles\nCc1cc2nc3c(=O)[nH]c(=O)nc-3n(C[C@H](O)[C@H](O)[C@H](O)CO)c2cc1C"
    with TemporaryDirectory() as tmpdir:
        fasta_path = Path(tmpdir) / "test.fasta"
        fasta_path.write_text(fasta_text)
        records = read_fasta(fasta_path)
    assert len(records) == 1
--- a/tests/test_rdkit.py
+++ b/tests/test_rdkit.py
@@ -0,0 +1,24 @@
 # Copyright (c) 2024 Chai Discovery, Inc.
 # Licensed under the Apache License, Version 2.0.
 # See the LICENSE file for details.
 from chai_lab.data.sources.rdkit import RefConformerGenerator
 def test_ref_conformer_from_smiles():
    """A conformer generated from SMILES must give every atom a unique name."""
    smiles = "Cc1cc2nc3c(=O)[nH]c(=O)nc-3n(C[C@H](O)[C@H](O)[C@H](O)CO)c2cc1C"
    generator = RefConformerGenerator()
    conformer = generator.generate(smiles)
    unique_names = set(conformer.atom_names)
    assert len(unique_names) == conformer.num_atoms
 def test_ref_conformer_glycan_ccd():
    """Looking up the CCD code "MAN" yields a conformer with unique atom names."""
    generator = RefConformerGenerator()
    conformer = generator.get("MAN")
    assert conformer is not None
    unique_names = set(conformer.atom_names)
    assert len(unique_names) == conformer.num_atoms