Configure PRODIGY pipeline for WES execution with S3 and Harbor
Some checks failed
ci / test (3.10) (push) Has been cancelled
ci / test (3.11) (push) Has been cancelled
ci / test (3.12) (push) Has been cancelled
ci / test (3.13) (push) Has been cancelled
ci / test (3.9) (push) Has been cancelled

This commit is contained in:
2026-03-17 16:38:16 +01:00
commit 19fd443501
38 changed files with 16328 additions and 0 deletions

49
.github/workflows/ci.yml vendored Normal file
View File

@@ -0,0 +1,49 @@
---
# CI: type-check with mypy, then run unit and integration test suites with
# coverage, and upload the combined report to Codacy.
name: ci

on: push  # yamllint disable-line rule:truthy

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
      # Let the remaining Python versions finish even if one fails.
      fail-fast: false
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
      - run: pip install '.[dev]'
      - name: check types
        run: mypy .
      - name: run unittests
        run: >-
          pytest
          -m "not integration"
          --cov
          --cov-report xml:coverage.xml
          --cov-append
          -vv
          --hypothesis-show-statistics
      - name: run integration tests
        run: >-
          pytest
          -m integration
          --cov
          --cov-report xml:coverage.xml
          --cov-append
          -vv
          --hypothesis-show-statistics
      - name: Run codacy-coverage-reporter
        uses: codacy/codacy-coverage-reporter-action@v1
        with:
          project-token: ${{ secrets.CODACY_PROJECT_TOKEN }}
          coverage-reports: coverage.xml

48
.github/workflows/docker-publish.yml vendored Normal file
View File

@@ -0,0 +1,48 @@
#
name: Create and publish a Docker image
on:
push:
# run only against tags
tags:
- "*"
env:
REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}
# There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu.
jobs:
build-and-push-image:
runs-on: ubuntu-latest
# Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job.
permissions:
contents: read
packages: write
steps:
- name: Checkout repository
uses: actions/checkout@v4
# Uses the `docker/login-action` action to log in to the Container registry registry using the account and password that will publish the packages. Once published, the packages are scoped to the account defined here.
- name: Log in to the Container registry
uses: docker/login-action@v3
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
# This step uses [docker/metadata-action](https://github.com/docker/metadata-action#about) to extract tags and labels that will be applied to the specified image. The `id` "meta" allows the output of this step to be referenced in a subsequent step. The `images` value provides the base name for the tags and labels.
- name: Extract metadata (tags, labels) for Docker
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
- name: Build and push
uses: docker/build-push-action@v5
with:
context: .
push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}

34
.github/workflows/publish.yml vendored Normal file
View File

@@ -0,0 +1,34 @@
---
# Build an sdist/wheel and publish to PyPI whenever a GitHub release is
# published. Uses PyPI trusted publishing (OIDC) instead of an API token.
name: publish to pypi

on:  # yamllint disable-line rule:truthy
  release:
    types: [published]

jobs:
  pypi_release:
    name: builds and publishes to pypi
    runs-on: ubuntu-latest
    environment:
      name: pypi
      url: https://pypi.org/p/prodigy-prot
    permissions:
      # Required for OIDC-based trusted publishing.
      id-token: write
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.13"
      - name: install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install --upgrade build
      - name: build
        run: |
          python -m build
      - name: Publish package distributions to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1

27
.github/workflows/stale.yml vendored Normal file
View File

@@ -0,0 +1,27 @@
---
# Nightly housekeeping: mark issues/PRs stale after 30 days of inactivity and
# close them 5 days later. Items labelled "bug" are exempt.
name: "Close stale issues and PRs"

on:  # yamllint disable-line rule:truthy
  schedule:
    - cron: "30 1 * * *"
  workflow_dispatch:

jobs:
  stale:
    runs-on: ubuntu-latest
    permissions:
      contents: write
      issues: write
      pull-requests: write
      actions: write
    steps:
      - uses: actions/stale@v10
        with:
          stale-pr-message: "This PR is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 5 days."
          stale-issue-message: "This issue is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 5 days."
          close-pr-message: "This PR was closed because it has been stalled for 5 days with no activity."
          close-issue-message: "This issue was closed because it has been stalled for 5 days with no activity."
          days-before-stale: 30
          days-before-close: 5
          exempt-issue-labels: "bug"
          exempt-pr-labels: "bug"
          remove-stale-when-updated: true
          operations-per-run: 100

13
.gitignore vendored Normal file
View File

@@ -0,0 +1,13 @@
work/
.nextflow/
.nextflow.log*
*.log.*
results/
__pycache__/
*.pyc
.docker/
.vscode/
.idea/
*.tmp
*.swp
tests/test_data/dataset.tgz

9
.howfairis.yml Normal file
View File

@@ -0,0 +1,9 @@
# Uncomment a line if you want to skip a given category of checks
#skip_repository_checks_reason: <reason for skipping goes here>
#skip_license_checks_reason: <reason for skipping goes here>
#skip_registry_checks_reason: <reason for skipping goes here>
#skip_citation_checks_reason: <reason for skipping goes here>
skip_checklist_checks_reason: "I'm using the Codacy dashboard to guide my development"
ignore_commented_badges: false

47
CITATION.cff Normal file
View File

@@ -0,0 +1,47 @@
# This CITATION.cff file was generated with cffinit.
# Visit https://bit.ly/cffinit to generate yours today!
cff-version: 1.2.0
title: Prodigy
message: >-
  If you use this software, please cite it using the
  metadata from this file.
type: software
authors:
  - given-names: Anna
    family-names: Vangone
    affiliation: Utrecht University
  - given-names: Alexandre
    name-particle: MJJ
    family-names: Bonvin
    affiliation: Utrecht University
  - given-names: Joerg
    family-names: Schaarschmidt
    affiliation: Utrecht University
  - given-names: Rodrigo
    family-names: Vargas Honorato
    affiliation: Utrecht University
  - given-names: Brian
    family-names: Jimenez
    affiliation: Utrecht University
  - given-names: Joao
    family-names: Rodrigues
    affiliation: Utrecht University
identifiers:
  - type: doi
    value: 10.1093/bioinformatics/btw514
    description: DOI of the web service version
  - type: doi
    value: 10.7554/eLife.07454
  - type: doi
    value: 10.1016/j.jmb.2014.04.017
repository-code: 'https://github.com/haddocking/prodigy'
url: 'https://wenmr.science.uu.nl/prodigy'
abstract: >-
  A tool to predict binding affinity values for
  protein-protein complexes from atomic structures.
keywords:
  - binding affinity
  - computational biology
  - protein-protein
license: Apache-2.0

132
CODE_OF_CONDUCT.md Normal file
View File

@@ -0,0 +1,132 @@
# Contributor Covenant Code of Conduct
## Our Pledge
We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
identity and expression, level of experience, education, socio-economic status,
nationality, personal appearance, race, caste, color, religion, or sexual
identity and orientation.
We pledge to act and interact in ways that contribute to an open, welcoming,
diverse, inclusive, and healthy community.
## Our Standards
Examples of behavior that contributes to a positive environment for our
community include:
- Demonstrating empathy and kindness toward other people
- Being respectful of differing opinions, viewpoints, and experiences
- Giving and gracefully accepting constructive feedback
- Accepting responsibility and apologizing to those affected by our mistakes,
and learning from the experience
- Focusing on what is best not just for us as individuals, but for the overall
community
Examples of unacceptable behavior include:
- The use of sexualized language or imagery, and sexual attention or advances of
any kind
- Trolling, insulting or derogatory comments, and personal or political attacks
- Public or private harassment
- Publishing others' private information, such as a physical or email address,
without their explicit permission
- Other conduct which could reasonably be considered inappropriate in a
professional setting
## Enforcement Responsibilities
Community leaders are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate, threatening, offensive,
or harmful.
Community leaders have the right and responsibility to remove, edit, or reject
comments, commits, code, wiki edits, issues, and other contributions that are
not aligned to this Code of Conduct, and will communicate reasons for moderation
decisions when appropriate.
## Scope
This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
Examples of representing our community include using an official e-mail address,
posting via an official social media account, or acting as an appointed
representative at an online or offline event.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement at
`prodigy.bonvinlab@gmail.com`.
All complaints will be reviewed and investigated promptly and fairly.
All community leaders are obligated to respect the privacy and security of the
reporter of any incident.
## Enforcement Guidelines
Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:
### 1. Correction
**Community Impact**: Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.
**Consequence**: A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the
behavior was inappropriate. A public apology may be requested.
### 2. Warning
**Community Impact**: A violation through a single incident or series of
actions.
**Consequence**: A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or permanent
ban.
### 3. Temporary Ban
**Community Impact**: A serious violation of community standards, including
sustained inappropriate behavior.
**Consequence**: A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.
### 4. Permanent Ban
**Community Impact**: Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior, harassment of an
individual, or aggression toward or disparagement of classes of individuals.
**Consequence**: A permanent ban from any sort of public interaction within the
community.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 2.1, available at
[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
Community Impact Guidelines were inspired by
[Mozilla's code of conduct enforcement ladder][Mozilla CoC].
For answers to common questions about this code of conduct, see the FAQ at
[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
[https://www.contributor-covenant.org/translations][translations].
[homepage]: https://www.contributor-covenant.org
[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
[Mozilla CoC]: https://github.com/mozilla/diversity
[FAQ]: https://www.contributor-covenant.org/faq
[translations]: https://www.contributor-covenant.org/translations

17
CONTRIBUTING.md Normal file
View File

@@ -0,0 +1,17 @@
# Contributing to PRODIGY
## Reporting issues
If you find a bug or have a feature request, please report it in the [issue tracker](https://github.com/haddocking/prodigy/issues)
## Contributing code
We welcome contributions to PRODIGY. If you would like to contribute, please fork the repository and make a pull request.
## Development conventions
Please refer to the [development guidelines](DEVELOPMENT.md) for more details.
## Contact
If you have any questions, please contact us at [ask.bioexcel.eu](https://ask.bioexcel.eu)

36
DEVELOPMENT.md Normal file
View File

@@ -0,0 +1,36 @@
# PRODIGY Development
## Installation
We use `poetry` to manage the dependencies and the virtual environment, so you need to install it first; check the [official documentation](https://python-poetry.org/docs/#installation) for more details.
Clone the repository and install the dependencies:
```text
git clone https://github.com/haddocking/prodigy.git && cd prodigy
poetry install
```
## Testing
To run the tests, use the following command:
```text
pytest -m "not integration"
```
## Code style
We use `trunk` as the "all-purpose" linting tool, check its [documentation](https://docs.trunk.io/docs/install).
To check for code style issues, run:
```text
trunk check
```
To automatically fix the issues, run:
```text
trunk fmt
```

40
Dockerfile Normal file
View File

@@ -0,0 +1,40 @@
# Container image for PRODIGY (PROtein binDIng enerGY prediction).
# Installs the published prodigy-prot package from PyPI on top of Python 3.12.
FROM python:3.12
LABEL maintainer="Omic"
LABEL description="PRODIGY - PROtein binDIng enerGY prediction"
LABEL version="2.4.0"
# Set environment variables
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
# Install system dependencies required for freesasa compilation
RUN apt-get update -y && \
apt-get install -y --no-install-recommends \
build-essential \
gcc \
g++ \
make \
procps \
&& rm -rf /var/lib/apt/lists/*
# Upgrade pip
RUN pip install --no-cache-dir --upgrade pip
# Install PRODIGY and its dependencies
# Dependencies: biopython>=1.80, freesasa>=2.2.1, numpy>=2
# Installed in a separate layer before prodigy-prot so dependency changes
# don't invalidate the cached layers unnecessarily.
RUN pip install --no-cache-dir \
"biopython>=1.80" \
"freesasa>=2.2.1" \
"numpy>=2"
# Install PRODIGY
RUN pip install --no-cache-dir prodigy-prot==2.4.0
# Verify installation
# Fails the build early if the prodigy entry point is broken.
RUN prodigy --help
# Set working directory
WORKDIR /data
CMD ["prodigy", "--help"]

190
LICENSE Normal file
View File

@@ -0,0 +1,190 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
Copyright 2015 Anna Vangone, Panagiotis Kastritis, Alexandre Bonvin
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

3
MANIFEST.in Normal file
View File

@@ -0,0 +1,3 @@
include README.md
include src/prodigy_prot/data/naccess.config

341
README.md Normal file
View File

@@ -0,0 +1,341 @@
# PRODIGY Nextflow Pipeline
A Nextflow pipeline for predicting binding affinity of protein-protein complexes using PRODIGY (PROtein binDIng enerGY prediction).
## Overview
PRODIGY is a contact-based method for predicting the binding affinity of protein-protein complexes from their 3D structures. This pipeline containerizes PRODIGY using Docker and orchestrates execution through Nextflow, enabling reproducible, scalable analysis of protein-protein interactions.
### Key Features
- **Automated binding affinity prediction** from PDB/mmCIF structures
- **Batch processing** of multiple protein complexes
- **Docker containerization** for reproducibility
- **Configurable parameters** for distance cutoffs, temperature, and chain selection
- **Optional outputs** including contact lists and PyMOL visualization scripts
## Scientific Background
PRODIGY predicts binding affinity by analyzing intermolecular contacts (ICs) at protein-protein interfaces. The method:
1. Identifies residue-residue contacts within a distance threshold (default: 5.5 Å)
2. Classifies contacts by residue type (charged, polar, apolar)
3. Analyzes the non-interacting surface (NIS) composition
4. Predicts binding free energy (ΔG) and dissociation constant (Kd)
The 5.5 Å distance cutoff was optimized to capture various non-bonded interactions including salt bridges, hydrogen bonds, and hydrophobic contacts.
## Requirements
### Software Dependencies
- [Nextflow](https://www.nextflow.io/) (≥21.04.0)
- [Docker](https://www.docker.com/) (≥20.10) or [Singularity](https://sylabs.io/singularity/) (≥3.0)
### Hardware Requirements
- CPU: 1+ cores per process
- Memory: 4 GB minimum recommended
- Storage: ~2 GB for Docker image
## Installation
### 1. Clone or Download the Pipeline
```bash
# Create pipeline directory
mkdir -p /path/to/prodigy_pipeline
cd /path/to/prodigy_pipeline
# Copy pipeline files (Dockerfile, main.nf, nextflow.config, params.json)
```
### 2. Build the Docker Image
```bash
docker build -t prodigy:latest .
```
### 3. Verify Installation
```bash
# Test Docker image
docker run --rm prodigy:latest prodigy --help
# Test Nextflow
nextflow run main.nf --help
```
## Usage
### Basic Usage
```bash
# Run on a single PDB file
nextflow run main.nf --pdb /path/to/complex.pdb --outdir /path/to/output
# Run on multiple PDB files
nextflow run main.nf --pdb '/path/to/structures/*.pdb' --outdir /path/to/output
```
### With Custom Parameters
```bash
nextflow run main.nf \
--pdb '/path/to/structures/*.pdb' \
--outdir /path/to/output \
--distance_cutoff 5.5 \
--acc_threshold 0.05 \
--temperature 37.0 \
--contact_list true \
--pymol_selection true
```
### Chain Selection for Complex Interfaces
For antibody-antigen complexes or multi-chain proteins:
```bash
# Contacts between chains A and B only
nextflow run main.nf --pdb complex.pdb --selection 'A B'
# Heavy (H) and Light (L) chains as one molecule vs Antigen (A)
nextflow run main.nf --pdb antibody_antigen.pdb --selection 'H,L A'
# Three-way interface calculation
nextflow run main.nf --pdb complex.pdb --selection 'A B C'
```
### Using Singularity
```bash
nextflow run main.nf -profile singularity --pdb /path/to/complex.pdb
```
## Parameters
### Required Parameters
| Parameter | Description | Default |
|-----------|-------------|---------|
| `--pdb` | Path to input PDB/mmCIF file(s). Supports glob patterns. | `s3://omic/eureka/prodigy/input/*.pdb` |
| `--outdir` | Output directory for results | `s3://omic/eureka/prodigy/output` |
### Analysis Parameters
| Parameter | Description | Default | Range |
|-----------|-------------|---------|-------|
| `--distance_cutoff` | Distance threshold (Å) for defining intermolecular contacts | `5.5` | 1.0 - 20.0 |
| `--acc_threshold` | Relative accessibility threshold for surface residue identification | `0.05` | 0.0 - 1.0 |
| `--temperature` | Temperature (°C) for Kd calculation | `25.0` | -273.15 - 100.0 |
| `--selection` | Chain selection for interface calculation | `''` (all chains) | See examples |
### Output Control Parameters
| Parameter | Description | Default |
|-----------|-------------|---------|
| `--contact_list` | Generate detailed contact list file | `false` |
| `--pymol_selection` | Generate PyMOL visualization script | `false` |
| `--quiet` | Output only affinity values (minimal output) | `false` |
## Output Files
### Standard Output
For each input structure `<name>.pdb`, the pipeline generates:
| File | Description |
|------|-------------|
| `<name>_prodigy.txt` | Main results file with binding affinity prediction |
### Optional Output (when enabled)
| File | Description | Parameter |
|------|-------------|-----------|
| `<name>_contacts.txt` | List of all interface contacts | `--contact_list true` |
| `<name>_interface.pml` | PyMOL script for interface visualization | `--pymol_selection true` |
### Example Output
```
[!] Structure contains gaps:
E ILE16 < Fragment 0 > E ALA183
E TYR184 < Fragment 1 > E GLY187
[+] Executing 1 task(s) in total
##########################################
[+] Processing structure 1ppe_model0
[+] No. of intermolecular contacts: 86
[+] No. of charged-charged contacts: 5.0
[+] No. of charged-polar contacts: 10.0
[+] No. of charged-apolar contacts: 27.0
[+] No. of polar-polar contacts: 0.0
[+] No. of apolar-polar contacts: 20.0
[+] No. of apolar-apolar contacts: 24.0
[+] Percentage of apolar NIS residues: 34.10
[+] Percentage of charged NIS residues: 18.50
[++] Predicted binding affinity (kcal.mol-1): -14.7
[++] Predicted dissociation constant (M) at 25.0˚C: 1.6e-11
```
### Output Interpretation
| Metric | Description |
|--------|-------------|
| **Intermolecular contacts** | Total number of residue-residue contacts at interface |
| **Contact types** | Breakdown by residue character (charged/polar/apolar) |
| **NIS residues** | Composition of non-interacting surface |
| **Binding affinity (ΔG)** | Predicted free energy of binding (kcal/mol). More negative = stronger binding |
| **Dissociation constant (Kd)** | Predicted Kd at specified temperature. Lower = tighter binding |
### Binding Affinity Scale
| ΔG (kcal/mol) | Kd (M) | Binding Strength |
|---------------|--------|------------------|
| -6 to -8 | 10⁻⁵ to 10⁻⁶ | Moderate |
| -8 to -10 | 10⁻⁶ to 10⁻⁷ | Strong |
| -10 to -12 | 10⁻⁷ to 10⁻⁹ | Very Strong |
| < -12 | < 10⁻⁹ | Extremely Strong |
## Test Data
Download example protein complexes from the RCSB PDB:
```bash
# Create input directory
mkdir -p /mnt/OmicNAS/private/old/olamide/Prodigy/input
# Download test structures
wget -O /mnt/OmicNAS/private/old/olamide/Prodigy/input/3bzd.pdb https://files.rcsb.org/download/3BZD.pdb
wget -O /mnt/OmicNAS/private/old/olamide/Prodigy/input/2oob.pdb https://files.rcsb.org/download/2OOB.pdb
wget -O /mnt/OmicNAS/private/old/olamide/Prodigy/input/1ppe.pdb https://files.rcsb.org/download/1PPE.pdb
```
### Expected Results
| Structure | Description | Expected ΔG (kcal/mol) |
|-----------|-------------|------------------------|
| 3BZD | Protein-protein complex | -9.4 |
| 2OOB | Protein-protein complex | -6.2 |
| 1PPE | Trypsin-inhibitor complex | -14.7 |
## Pipeline Structure
```
prodigy_pipeline/
├── Dockerfile # Docker image definition
├── main.nf # Nextflow pipeline script
├── nextflow.config # Pipeline configuration
├── params.json # Parameter documentation
└── README.md # This file
```
## Docker Image Details
The Docker image is based on Python 3.12 and includes:
- **prodigy-prot** (v2.4.0) - Main PRODIGY package
- **biopython** (≥1.80) - PDB structure parsing
- **freesasa** (≥2.2.1) - Solvent accessible surface area calculation
- **numpy** (≥2) - Numerical computations
### Building the Image
```bash
docker build -t prodigy:latest .
```
### Running Standalone
```bash
# Run PRODIGY directly
docker run --rm -v /path/to/data:/data prodigy:latest prodigy /data/complex.pdb
# Get help
docker run --rm prodigy:latest prodigy --help
```
## Troubleshooting
### Common Issues
**1. Docker Hub Rate Limit Error**
```
ERROR: toomanyrequests: You have reached your pull rate limit
```
Solution: Log in to Docker Hub with `docker login` or wait and retry.
**2. Structure Contains Gaps Warning**
```
[!] Structure contains gaps
```
This is informational, not an error. PRODIGY handles missing residues automatically.
**3. No Intermolecular Contacts Found**
- Verify the structure contains multiple chains
- Check chain selection parameters
- Ensure chains are in contact (within distance cutoff)
**4. Permission Denied Errors**
```bash
# Run with user permissions
docker run --rm -u $(id -u):$(id -g) -v /path/to/data:/data prodigy:latest prodigy /data/complex.pdb
```
### Getting Help
```bash
# PRODIGY help
docker run --rm prodigy:latest prodigy --help
# Nextflow pipeline help
nextflow run main.nf --help
```
## Citation
If you use this pipeline, please cite the following publications:
### PRODIGY Method
1. **Xue LC, Rodrigues JP, Kastritis PL, Bonvin AM, Vangone A.** (2016)
PRODIGY: a web server for predicting the binding affinity of protein-protein complexes.
*Bioinformatics*, 32(23):3676-3678.
[DOI: 10.1093/bioinformatics/btw514](https://doi.org/10.1093/bioinformatics/btw514)
2. **Vangone A, Bonvin AM.** (2015)
Contacts-based prediction of binding affinity in protein-protein complexes.
*eLife*, 4:e07454.
[DOI: 10.7554/eLife.07454](https://doi.org/10.7554/eLife.07454)
3. **Kastritis PL, Rodrigues JP, Folkers GE, Boelens R, Bonvin AM.** (2014)
Proteins feel more than they see: Fine-tuning of binding affinity by properties of the non-interacting surface.
*Journal of Molecular Biology*, 426(14):2632-2652.
[DOI: 10.1016/j.jmb.2014.04.017](https://doi.org/10.1016/j.jmb.2014.04.017)
### Software Dependencies
- **Nextflow**: Di Tommaso P, et al. (2017) Nextflow enables reproducible computational workflows. *Nature Biotechnology*, 35:316-319.
- **Biopython**: Cock PJ, et al. (2009) Biopython: freely available Python tools for computational molecular biology and bioinformatics. *Bioinformatics*, 25(11):1422-1423.
- **FreeSASA**: Mitternacht S. (2016) FreeSASA: An open source C library for solvent accessible surface area calculations. *F1000Research*, 5:189.
## License
This pipeline is distributed under the Apache License 2.0, consistent with the PRODIGY software license.
## Links
- **PRODIGY Web Server**: [https://wenmr.science.uu.nl/prodigy/](https://wenmr.science.uu.nl/prodigy/)
- **PRODIGY GitHub**: [https://github.com/haddocking/prodigy](https://github.com/haddocking/prodigy)
- **BonvinLab**: [https://www.bonvinlab.org/](https://www.bonvinlab.org/)
- **Nextflow**: [https://www.nextflow.io/](https://www.nextflow.io/)
## Support
For questions about:
- **PRODIGY method**: Contact the BonvinLab team at [ask.bioexcel.eu](https://ask.bioexcel.eu/)
- **This pipeline**: Open an issue in the repository
---
*Pipeline version: 2.4.0 | Last updated: January 2026*

13
examples/3BZD.ic_model Normal file
View File

@@ -0,0 +1,13 @@
[+] Reading structure file: /Users/joao/software/binding_affinity/examples/3BZD.pdb
[+] Parsed structure file 3BZD (2 chains, 343 residues)
[+] No. of intermolecular contacts: 51
[+] No. of charged-charged contacts: 4
[+] No. of charged-polar contacts: 7
[+] No. of charged-apolar contacts: 6
[+] No. of polar-polar contacts: 7
[+] No. of apolar-polar contacts: 15
[+] No. of apolar-apolar contacts: 12
[+] Percentage of apolar NIS residues: 29.48
[+] Percentage of charged NIS residues: 29.48
[++] Predicted binding affinity (kcal.mol-1): -9.373
[++] Predicted dissociation constant (M): 1.333e-07

2754
examples/3BZD.pdb Normal file

File diff suppressed because it is too large Load Diff

4727
examples/3bzd.cif Normal file

File diff suppressed because it is too large Load Diff

74
main.nf Normal file
View File

@@ -0,0 +1,74 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl=2
// Default parameters
params.pdb = 's3://omic/eureka/prodigy/input/*.pdb'
params.outdir = 's3://omic/eureka/prodigy/output'
params.distance_cutoff = 5.5
params.acc_threshold = 0.05
params.temperature = 25.0
params.selection = ''
params.contact_list = false
params.pymol_selection = false
params.quiet = false
// =============================================================================
// Process: PRODIGY
// Predicts binding affinity using intermolecular contacts
// =============================================================================
process PRODIGY {
    container 'harbor.cluster.omic.ai/omic/prodigy:latest'
    publishDir params.outdir, mode: 'copy'
    // Copy (not symlink) the input so prodigy's side-output files
    // (<stem>.ic / <stem>.pml) land inside the task work directory.
    stageInMode 'copy'

    input:
    path pdb

    output:
    path "${pdb.baseName}_prodigy.txt", emit: results
    path "${pdb.baseName}_contacts.txt", optional: true, emit: contacts
    path "${pdb.baseName}_interface.pml", optional: true, emit: pymol

    script:
    """
    # Without pipefail a prodigy failure would be masked by tee's exit status.
    set -o pipefail

    prodigy \\
        ${pdb} \\
        --distance-cutoff ${params.distance_cutoff} \\
        --acc-threshold ${params.acc_threshold} \\
        --temperature ${params.temperature} \\
        ${params.selection ? '--selection ' + params.selection : ''} \\
        ${params.contact_list ? '--contact_list' : ''} \\
        ${params.pymol_selection ? '--pymol_selection' : ''} \\
        ${params.quiet ? '--quiet' : ''} \\
        2>&1 | tee ${pdb.baseName}_prodigy.txt

    # The CLI writes the contact list to <stem>.ic (struct_path.with_suffix(".ic")),
    # not <stem>.contacts; rename it to match the declared output.
    if [ -f "${pdb.baseName}.ic" ]; then
        mv ${pdb.baseName}.ic ${pdb.baseName}_contacts.txt
    fi

    # Rename PyMOL script if generated
    if [ -f "${pdb.baseName}.pml" ]; then
        mv ${pdb.baseName}.pml ${pdb.baseName}_interface.pml
    fi
    """
}
// =============================================================================
// Workflow
// Entry point: fan each staged structure file out to one PRODIGY task.
// =============================================================================
workflow {
    // Validate input: params.pdb must be set (defaults to the S3 glob above).
    if (!params.pdb) {
        error "ERROR: Please provide input PDB file(s) using --pdb parameter"
    }

    // Create input channel; checkIfExists makes an empty glob fail fast.
    pdb_ch = Channel.fromPath(params.pdb, checkIfExists: true)

    // Run PRODIGY once per input structure
    PRODIGY(pdb_ch)
}

71
nextflow.config Normal file
View File

@@ -0,0 +1,71 @@
// =============================================================================
// PRODIGY Nextflow Pipeline Configuration
// Protein binding affinity prediction from structural data
// =============================================================================
// Manifest for Nextflow metadata
manifest {
name = 'PRODIGY-Nextflow'
author = 'Olamide'
homePage = 'https://trs-gitea.cluster.omic.ai/omic/prodigy'
description = 'Nextflow pipeline for PRODIGY - Protein binding affinity prediction based on intermolecular contacts'
mainScript = 'main.nf'
version = '2.4.0'
}
// Global default parameters
params {
pdb = 's3://omic/eureka/prodigy/input/*.pdb'
outdir = 's3://omic/eureka/prodigy/output'
distance_cutoff = 5.5
acc_threshold = 0.05
temperature = 25.0
selection = ''
contact_list = false
pymol_selection = false
quiet = false
}
// Container configurations
docker {
enabled = true
runOptions = '-u $(id -u):$(id -g)'
}
// Process configurations
process {
cpus = 1
memory = '4 GB'
container = 'harbor.cluster.omic.ai/omic/prodigy:latest'
}
// Execution configurations
executor {
$local {
cpus = 4
memory = '8 GB'
}
}
// Profiles for different execution environments
profiles {
standard {
docker.enabled = true
}
k8s {
docker.enabled = true
process.container = 'harbor.cluster.omic.ai/omic/prodigy:latest'
}
    k8s_gpu {
        // NOTE(review): currently identical to the `k8s` profile — no
        // GPU-specific settings (e.g. containerOptions/accelerator) are
        // applied here; confirm whether that is intended.
        docker.enabled = true
        process.container = 'harbor.cluster.omic.ai/omic/prodigy:latest'
    }
singularity {
singularity.enabled = true
singularity.autoMounts = true
docker.enabled = false
}
}

157
params.json Normal file
View File

@@ -0,0 +1,157 @@
{
"params": {
"pdb": {
"type": "file",
"description": "Path to input PDB or mmCIF structure file(s) for binding affinity prediction",
"default": "s3://omic/eureka/prodigy/input/*.pdb",
"required": true,
"pipeline_io": "input",
"var_name": "params.pdb",
"examples": [
"s3://omic/eureka/prodigy/input/3bzd.pdb",
"s3://omic/eureka/prodigy/input/*.pdb"
],
"pattern": ".*\\.(pdb|cif)$",
"enum": [],
"validation": {},
"notes": "Input protein-protein complex structure in PDB or mmCIF format. Can be a single file or glob pattern for batch processing."
},
"outdir": {
"type": "folder",
"description": "Directory for PRODIGY prediction results",
"default": "s3://omic/eureka/prodigy/output",
"required": true,
"pipeline_io": "output",
"var_name": "params.outdir",
"examples": [
"s3://omic/eureka/prodigy/output",
"s3://omic/eureka/prodigy/custom_output"
],
"pattern": ".*",
"enum": [],
"validation": {},
"notes": "Directory where prediction results will be stored. Created if it does not exist."
},
"distance_cutoff": {
"type": "float",
"description": "Distance cutoff (Angstrom) for calculating intermolecular contacts",
"default": 5.5,
"required": false,
"pipeline_io": "parameter",
"var_name": "params.distance_cutoff",
"examples": [
5.5,
4.0,
6.0
],
"pattern": null,
"enum": [],
"validation": {
"min": 1.0,
"max": 20.0
},
"notes": "Default value of 5.5 Angstrom was optimized in Vangone & Bonvin (2015) eLife. This threshold includes different non-bonded interactions including salt bridges."
},
"acc_threshold": {
"type": "float",
"description": "Accessibility threshold for buried surface area (BSA) analysis",
"default": 0.05,
"required": false,
"pipeline_io": "parameter",
"var_name": "params.acc_threshold",
"examples": [
0.05,
0.1
],
"pattern": null,
"enum": [],
"validation": {
"min": 0.0,
"max": 1.0
},
"notes": "Relative accessibility threshold used to identify surface residues for non-interacting surface (NIS) calculations."
},
"temperature": {
"type": "float",
"description": "Temperature (Celsius) for dissociation constant (Kd) prediction",
"default": 25.0,
"required": false,
"pipeline_io": "parameter",
"var_name": "params.temperature",
"examples": [
25.0,
37.0,
4.0
],
"pattern": null,
"enum": [],
"validation": {
"min": -273.15,
"max": 100.0
},
"notes": "Temperature used to convert predicted binding free energy (deltaG) to dissociation constant (Kd)."
},
"selection": {
"type": "string",
"description": "Chain selection for interface calculation",
"default": "",
"required": false,
"pipeline_io": "parameter",
"var_name": "params.selection",
"examples": [
"A B",
"A,B C",
"H,L A"
],
"pattern": null,
"enum": [],
"validation": {},
"notes": "Specify chains to consider for binding affinity calculation. Format: 'A B' calculates contacts between chains A and B. 'A,B C' treats chains A and B as one molecule interacting with chain C. Useful for antibody-antigen complexes where heavy and light chains should be grouped."
},
"contact_list": {
"type": "boolean",
"description": "Output list of intermolecular contacts",
"default": false,
"required": false,
"pipeline_io": "parameter",
"var_name": "params.contact_list",
"examples": [
true,
false
],
"enum": [true, false],
"validation": {},
"notes": "When enabled, outputs a detailed list of all residue-residue contacts at the interface."
},
"pymol_selection": {
"type": "boolean",
"description": "Output PyMOL script to visualize interface",
"default": false,
"required": false,
"pipeline_io": "parameter",
"var_name": "params.pymol_selection",
"examples": [
true,
false
],
"enum": [true, false],
"validation": {},
"notes": "When enabled, generates a PyMOL script (.pml) to highlight interface residues for visualization."
},
"quiet": {
"type": "boolean",
"description": "Output only predicted affinity values",
"default": false,
"required": false,
"pipeline_io": "parameter",
"var_name": "params.quiet",
"examples": [
true,
false
],
"enum": [true, false],
"validation": {},
"notes": "When enabled, outputs only the predicted binding affinity value without detailed analysis. Useful for batch processing and downstream parsing."
}
}
}

45
pyproject.toml Normal file
View File

@@ -0,0 +1,45 @@
[project]
name = "prodigy-prot"
license = "Apache-2.0"
version = "2.4.0"
description = "PROtein binDIng enerGY prediction"
authors = [
{ name = "Anna Vangone" },
{ name = "Joao Rodrigues" },
{ name = "Joerg Schaarschmidt" },
]
maintainers = [{ name = "BonvinLab", email = "bonvinlab.support@uu.nl" }]
readme = "README.md"
classifiers = [
"Development Status :: 5 - Production/Stable",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Topic :: Scientific/Engineering :: Chemistry",
"Topic :: Scientific/Engineering :: Bio-Informatics",
]
dependencies = ["biopython>=1.80", "freesasa>=2.2.1", "numpy>=2"]
[project.optional-dependencies]
dev = ["pytest", "coverage", "hypothesis", "pytest-cov", "mypy"]
[project.scripts]
prodigy = "prodigy_prot.cli:main"
# NOTE(review): the build backend is hatchling (see [build-system] below), so
# this [tool.setuptools] table is ignored at build time. Package selection
# should instead be configured via [tool.hatch.build.targets.wheel]
# (e.g. packages = ["src/prodigy_prot"]).
[tool.setuptools]
include-package-data = true
packages = ["src"]
[tool.pytest.ini_options]
pythonpath = ["src"]
markers = ["integration: marks tests as integration tests"]
[tool.mypy]
disable_error_code = ["import-not-found"]
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

View File

@@ -0,0 +1,3 @@
from pathlib import Path
NACCESS_CONFIG = Path(Path(__file__).parents[0], "data/naccess.config")

199
src/prodigy_prot/cli.py Normal file
View File

@@ -0,0 +1,199 @@
"""
Binding affinity predictor based on Intermolecular Contacts (ICs).
"""
import argparse
import logging
import sys
from argparse import RawTextHelpFormatter
from concurrent.futures import ProcessPoolExecutor, as_completed
from io import StringIO
from pathlib import Path
from Bio.PDB.Model import Model
from prodigy_prot.modules.parsers import parse_structure
from prodigy_prot.modules.prodigy import Prodigy
# setup logging
logging.basicConfig(level=logging.INFO, stream=sys.stdout, format="%(message)s")
log = logging.getLogger("Prodigy")
ap = argparse.ArgumentParser(description=__doc__, formatter_class=RawTextHelpFormatter)
ap.add_argument(
"input_path",
help="Path to either: \n- Structure in PDB or mmCIF format\n- Directory containing structure files",
)
ap.add_argument(
"--distance-cutoff",
type=float,
default=5.5,
help="Distance cutoff to calculate ICs",
)
ap.add_argument(
"--acc-threshold",
type=float,
default=0.05,
help="Accessibility threshold for BSA analysis",
)
ap.add_argument(
"--temperature",
type=float,
default=25.0,
help="Temperature (C) for Kd prediction",
)
ap.add_argument("--contact_list", action="store_true", help="Output a list of contacts")
ap.add_argument(
"--pymol_selection",
action="store_true",
help="Output a script to highlight the interface (pymol)",
)
ap.add_argument(
"-q",
"--quiet",
action="store_true",
help="Outputs only the predicted affinity value",
)
ap.add_argument(
"-s",
"--showall",
action="store_true",
help="Outputs all original prodigy features but BSA (mutually exclusive with `-q`)",
)
ap.add_argument(
"-np",
"--number-of-processors",
type=int,
action="store",
help="Number of processors to use (default: 1)",
default=1,
)
_co_help = """
By default, all intermolecular contacts are taken into consideration,
a molecule being defined as an isolated group of amino acids sharing
a common chain identifier. In specific cases, for example
antibody-antigen complexes, some chains should be considered as a
single molecule.
Use the --selection option to provide collections of chains that should
be considered for the calculation. Separate by a space the chains that
are to be considered _different_ molecules. Use commas to include multiple
chains as part of a single group:
--selection A B => Contacts calculated (only) between chains A and B.
--selection A,B C => Contacts calculated (only) between \
chains A and C; and B and C.
--selection A B C => Contacts calculated (only) between \
chains A and B; B and C; and A and C.
"""
sel_opt = ap.add_argument_group("Selection Options", description=_co_help)
sel_opt.add_argument("--selection", nargs="+", metavar=("A B", "A,B C"))
def main():
    """
    CLI entry point.

    Parses arguments, collects structure files (a single file or every
    .pdb/.cif/.ent in a directory), runs one PRODIGY prediction per model
    (optionally in parallel), and prints the captured outputs in a
    deterministic order. Exits with status 1 on invalid input.
    """
    args = ap.parse_args()
    # --quiet suppresses informational logging; errors are always shown.
    # (The original set this level twice; once is enough.)
    log.setLevel(logging.ERROR if args.quiet else logging.INFO)
    if args.quiet and args.showall:
        log.error("Error: --quiet (-q) and --showall (-s) are mutually exclusive arguments")
        sys.exit(1)
    struct_path = Path(args.input_path)
    input_list = []
    if struct_path.is_file():
        input_list.append(struct_path)
    elif struct_path.is_dir():
        # Only pick up supported structure extensions from the directory.
        for input_f in struct_path.glob("*"):
            if Path(input_f).suffix in [".pdb", ".cif", ".ent"]:
                input_list.append(input_f)
    elif not struct_path.exists():
        log.error(f"File {struct_path} does not exist")
        sys.exit(1)
    else:
        log.error(f"Input path {struct_path} is neither a valid file nor a directory")
        sys.exit(1)

    # Collect one task per (file, model) pair.
    tasks = []
    for input_f in input_list:
        models, _, _ = parse_structure(str(input_f))
        # Use a distinct name here; the original reused `struct_path`,
        # shadowing the top-level input path.
        file_path = Path(input_f)
        for model in models:
            identifier = f"{file_path.stem}_model{model.id}"
            tasks.append((model, identifier, args, file_path))

    total_tasks = len(tasks)
    if total_tasks == 0:
        log.error("No valid structures found")
        sys.exit(1)

    # Never spawn more workers than there are tasks.
    max_workers = min(args.number_of_processors, total_tasks)
    log.info(f"[+] Executing {total_tasks} task(s) in total")
    if max_workers != args.number_of_processors:
        log.info("[+] Adjusting number of processors based on number of tasks")
        log.info(
            f"[+] Using {max_workers} processor(s) instead of {args.number_of_processors}"
        )

    # Execute and collect (identifier, model_id, captured_output) triples.
    results = []
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_model, *task) for task in tasks]
        for future in as_completed(futures):
            try:
                results.append(future.result())
            except Exception as e:
                log.error(f"Error processing model: {e}")

    # Deterministic output order: by identifier, then model id.
    results.sort(key=lambda x: (x[0], x[1]))
    for identifier, _, output in results:
        print(output, end="")
def process_model(model: Model, identifier: str, args: argparse.Namespace, struct_path):
    """
    Run a PRODIGY prediction for a single model.

    Stdout produced during the prediction is captured (so parallel workers
    do not interleave their output) and returned to the parent process.

    :param model: Biopython model to analyse.
    :param identifier: label used in logging/output ("<stem>_model<N>").
    :param args: parsed CLI arguments.
    :param struct_path: path of the originating structure file; side-output
        files (.ic / .pml) are written next to it.
    :return: (identifier, model.id, captured stdout text).
    """
    from contextlib import redirect_stdout

    output_buffer = StringIO()
    # redirect_stdout guarantees sys.stdout is restored even on error,
    # replacing the manual swap/finally dance of the original.
    with redirect_stdout(output_buffer):
        if not args.quiet:
            print("#" * 42)
            print(f"[+] Processing structure {identifier}")
        prodigy = Prodigy(
            model=model,
            name=identifier,
            selection=args.selection,
            temp=args.temperature,
        )
        prodigy.predict(
            distance_cutoff=args.distance_cutoff, acc_threshold=args.acc_threshold
        )
        prodigy.print_prediction(quiet=args.quiet, showall=args.showall)

    # Side-output files are only written after a successful prediction;
    # previously a failure inside the try block could leave `prodigy` unbound.
    # NOTE(review): for multi-model files every model writes to the same
    # <stem>.ic / <stem>.pml path, so later models overwrite earlier ones —
    # confirm this is intended.
    if args.contact_list:
        contact_list_f = struct_path.with_suffix(".ic")
        prodigy.print_contacts(outfile=str(contact_list_f))
    if args.pymol_selection:
        pymol_script_f = struct_path.with_suffix(".pml")
        prodigy.print_pymol_script(outfile=str(pymol_script_f))
    return identifier, model.id, output_buffer.getvalue()
if __name__ == "__main__":
sys.exit(main())

View File

@@ -0,0 +1,256 @@
# Contributed by João Rodrigues
name: NACCESS
types:
C_ALI 1.87 apolar
C_CAR 1.76 apolar
C_NUC 1.80 apolar
N_AMN 1.50 polar
N_AMD 1.65 polar
N_NUC 1.60 polar
O 1.40 polar
S 1.85 apolar
SE 1.80 apolar
P 1.90 apolar
atoms:
ANY C C_CAR
ANY O O
ANY CA C_ALI
ANY N N_AMD
ANY CB C_ALI
ANY OXT O
# nucleic acid
ANY P P
ANY OP1 O
ANY OP2 O
ANY OP3 O
ANY O5' O
ANY O4' O
ANY O3' O
ANY O2' O
ANY C5' C_NUC
ANY C4' C_NUC
ANY C3' C_NUC
ANY C2' C_NUC
ANY C1' C_NUC
ALA CB C_ALI # included so that RSA values will be generated
ARG CG C_ALI
ARG CD C_ALI
ARG NE N_AMD
ARG CZ C_CAR
ARG NH1 N_AMD
ARG NH2 N_AMD
ASN CG C_CAR
ASN OD1 O
ASN ND2 N_AMD
ASP CG C_CAR
ASP OD1 O
ASP OD2 O
CYS SG S
GLN CG C_ALI
GLN CD C_CAR
GLN OE1 O
GLN NE2 N_AMD
GLU CG C_ALI
GLU CD C_CAR
GLU OE1 O
GLU OE2 O
GLY CA C_ALI # included so that RSA values will be generated
HIS CG C_CAR
HIS ND1 N_AMD
HIS CD2 C_CAR
HIS NE2 N_AMD
HIS CE1 C_CAR
ILE CG1 C_ALI
ILE CG2 C_ALI
ILE CD1 C_ALI
LEU CG C_ALI
LEU CD1 C_ALI
LEU CD2 C_ALI
LYS CG C_ALI
LYS CD C_ALI
LYS CE C_ALI
LYS NZ N_AMN
MET CG C_ALI
MET SD S
MET CE C_ALI
PHE CG C_CAR
PHE CD1 C_CAR
PHE CD2 C_CAR
PHE CE1 C_CAR
PHE CE2 C_CAR
PHE CZ C_CAR
PRO CG C_ALI
PRO CD C_ALI
SEC SE SE
SER OG O
THR OG1 O
THR CG2 C_ALI
TRP CG C_CAR
TRP CD1 C_CAR
TRP CD2 C_CAR
TRP NE1 N_AMD
TRP CE2 C_CAR
TRP CE3 C_CAR
TRP CZ2 C_CAR
TRP CZ3 C_CAR
TRP CH2 C_CAR
TYR CG C_CAR
TYR CD1 C_CAR
TYR CD2 C_CAR
TYR CE1 C_CAR
TYR CE2 C_CAR
TYR CZ C_CAR
TYR OH O
VAL CG1 C_ALI
VAL CG2 C_ALI
A N9 N_NUC
A C8 C_NUC
A N7 N_NUC
A C5 C_NUC
A C6 C_NUC
A N6 N_NUC
A N1 N_NUC
A C2 C_NUC
A N3 N_NUC
A C4 C_NUC
C N1 N_NUC
C C2 C_NUC
C O2 O
C N3 N_NUC
C C4 C_NUC
C N4 N_NUC
C C5 C_NUC
C C6 C_NUC
G N9 N_NUC
G C8 C_NUC
G N7 N_NUC
G C5 C_NUC
G C6 C_NUC
G O6 O
G N1 N_NUC
G C2 C_NUC
G N2 N_NUC
G N3 N_NUC
G C4 C_NUC
I N9 N_NUC
I C8 C_NUC
I N7 N_NUC
I C5 C_NUC
I C6 C_NUC
I O6 O
I N1 N_NUC
I C2 C_NUC
I N3 N_NUC
I C4 C_NUC
T N1 N_NUC
T C2 C_NUC
T O2 O
T N3 N_NUC
T C4 C_NUC
T O4 O
T C5 C_NUC
T C7 C_NUC
T C6 C_NUC
U N1 N_NUC
U C2 C_NUC
U O2 O
U N3 N_NUC
U C4 C_NUC
U O4 O
U C5 C_NUC
U C6 C_NUC
DA N9 N_NUC
DA C8 C_NUC
DA N7 N_NUC
DA C5 C_NUC
DA C6 C_NUC
DA N6 N_NUC
DA N1 N_NUC
DA C2 C_NUC
DA N3 N_NUC
DA C4 C_NUC
DC N1 N_NUC
DC C2 C_NUC
DC O2 O
DC N3 N_NUC
DC C4 C_NUC
DC N4 N_NUC
DC C5 C_NUC
DC C6 C_NUC
DG N9 N_NUC
DG C8 C_NUC
DG N7 N_NUC
DG C5 C_NUC
DG C6 C_NUC
DG O6 O
DG N1 N_NUC
DG C2 C_NUC
DG N2 N_NUC
DG N3 N_NUC
DG C4 C_NUC
DI N9 N_NUC
DI C8 C_NUC
DI N7 N_NUC
DI C5 C_NUC
DI C6 C_NUC
DI O6 O
DI N1 N_NUC
DI C2 C_NUC
DI N3 N_NUC
DI C4 C_NUC
DT N1 N_NUC
DT C2 C_NUC
DT O2 O
DT N3 N_NUC
DT C4 C_NUC
DT O4 O
DT C5 C_NUC
DT C7 C_NUC
DT C6 C_NUC
DU N1 N_NUC
DU C2 C_NUC
DU O2 O
DU N3 N_NUC
DU C4 C_NUC
DU O4 O
DU C5 C_NUC
DU C6 C_NUC

View File

View File

@@ -0,0 +1,148 @@
"""
Generic properties of amino acids required for the binding affinity
prediction methods.
"""
aa_character_ic: dict[str, str] = {
"ALA": "A",
"CYS": "A", # ?
"GLU": "C",
"ASP": "C",
"GLY": "A",
"PHE": "A",
"ILE": "A",
"HIS": "C",
"LYS": "C",
"MET": "A",
"LEU": "A",
"ASN": "P",
"GLN": "P",
"PRO": "A",
"SER": "P",
"ARG": "C",
"THR": "P",
"TRP": "A",
"VAL": "A",
"TYR": "A",
}
aa_character_protorp: dict[str, str] = {
"ALA": "A",
"CYS": "P",
"GLU": "C",
"ASP": "C",
"GLY": "A",
"PHE": "A",
"ILE": "A",
"HIS": "P",
"LYS": "C",
"MET": "A",
"LEU": "A",
"ASN": "P",
"GLN": "P",
"PRO": "A",
"SER": "P",
"ARG": "C",
"THR": "P",
"TRP": "P",
"VAL": "A",
"TYR": "P",
}
# Taken from pre-original prodigy code
# B for hydrophoBic
# Y for hydrophiLic
aa_character_hydro: dict[str, str] = {
"ALA": "B", #+
"CYS": "B", #+
"GLU": "L", #+
"ASP": "L", #+
"GLY": "L", # Glycine was B in my initial classification
"PHE": "B", #+
"ILE": "B", #+
"HIS": "L", #+
"LYS": "L", #+
"MET": "B", #+
"LEU": "B", #+
"ASN": "L", #+
"GLN": "L", #+
"PRO": "L", # Proline was B my initial classification
"SER": "L", #+
"ARG": "L", #+
"THR": "L", #+
"TRP": "L", #+
"VAL": "B", #+
"TYR": "L", #+
}
# Scaling factors for relative ASA
# Calculated using extended ALA-X-ALA peptides
# Taken from NACCESS
rel_asa: dict[str, dict[str, float]] = {
"total": {
"ALA": 107.95,
"CYS": 134.28,
"ASP": 140.39,
"GLU": 172.25,
"PHE": 199.48,
"GLY": 80.10,
"HIS": 182.88,
"ILE": 175.12,
"LYS": 200.81,
"LEU": 178.63,
"MET": 194.15,
"ASN": 143.94,
"PRO": 136.13,
"GLN": 178.50,
"ARG": 238.76,
"SER": 116.50,
"THR": 139.27,
"VAL": 151.44,
"TRP": 249.36,
"TYR": 212.76,
},
"bb": {
"ALA": 38.54,
"CYS": 37.53,
"ASP": 37.70,
"GLU": 37.51,
"PHE": 35.37,
"GLY": 47.77,
"HIS": 35.80,
"ILE": 37.16,
"LYS": 37.51,
"LEU": 37.51,
"MET": 37.51,
"ASN": 37.70,
"PRO": 16.23,
"GLN": 37.51,
"ARG": 37.51,
"SER": 38.40,
"THR": 37.57,
"VAL": 37.16,
"TRP": 38.10,
"TYR": 35.38,
},
"sc": {
"ALA": 69.41,
"CYS": 96.75,
"ASP": 102.69,
"GLU": 134.74,
"PHE": 164.11,
"GLY": 32.33,
"HIS": 147.08,
"ILE": 137.96,
"LYS": 163.30,
"LEU": 141.12,
"MET": 156.64,
"ASN": 106.24,
"PRO": 119.90,
"GLN": 140.99,
"ARG": 201.25,
"SER": 78.11,
"THR": 101.70,
"VAL": 114.28,
"TRP": 211.26,
"TYR": 177.38,
},
}

View File

@@ -0,0 +1,71 @@
"""
Functions to execute freesasa and parse its output.
"""
import os
import freesasa
from Bio.PDB.Model import Model
from Bio.PDB.Structure import Structure
from freesasa import Classifier, calc, structureFromBioPDB
from prodigy_prot import NACCESS_CONFIG
from prodigy_prot.modules.aa_properties import rel_asa
freesasa.setVerbosity(freesasa.nowarnings)
def execute_freesasa_api(model: Model) -> tuple[dict, dict]:
    """
    Calls freesasa using its Python API and returns
    per-residue accessibilities.

    :param model: Biopython model to analyse (wrapped in a Structure below).
    :return: (asa_data, rsa_data) where
        asa_data maps (chain, resname, resid, atname) -> absolute SASA, and
        rsa_data maps (chain, resname, resid) -> SASA relative to the
        NACCESS ALA-X-ALA reference value for that residue type.
    :raises Exception: when freesasa cannot classify the input atoms.
    """
    asa_data = {}
    # Keys are (chain label, residue name, residue number); freesasa's
    # residueNumber() returns a string, hence three str components.
    rsa_data: dict[tuple[str, str, str], float] = {}
    # Reference total-ASA per residue type (NACCESS scaling factors).
    _rsa: dict = rel_asa["total"]
    classifier = Classifier(str(NACCESS_CONFIG))
    # NOTE: `structureFromBioPDB` requires a Structure object
    # so here build one from a model
    s = Structure(model.id)
    s.add(model)
    try:
        struct = structureFromBioPDB(
            s,
            classifier,
        )
        result = calc(struct)
    except AssertionError as e:
        # freesasa signals unclassifiable atoms via AssertionError; re-raise
        # with a user-facing explanation.
        error_message = "" + os.linesep
        error_message += "[!] Error when running freesasa:" + os.linesep
        error_message += f"[!] {e}" + os.linesep
        error_message += (
            "[!] Make sure the atom names in your PDB file match"
            " the canonical naming and belong "
            "to default residues" + os.linesep
        )
        print(error_message)
        raise Exception(error_message)
    # iterate over all atoms to get SASA and residue name
    for idx in range(struct.nAtoms()):
        atname = struct.atomName(idx)
        resname = struct.residueName(idx)
        resid = struct.residueNumber(idx)
        chain = struct.chainLabel(idx)
        at_uid = (chain, resname, resid, atname)
        res_uid = (chain, resname, resid)
        asa = result.atomArea(idx)
        asa_data[at_uid] = asa
        # accumulate per-atom areas into the residue total
        rsa_data[res_uid] = rsa_data.get(res_uid, 0) + asa
    # convert total asa to relative asa (divide by the per-residue reference)
    rsa_data.update(
        (res_uid, asa / _rsa[res_uid[1]]) for res_uid, asa in rsa_data.items()
    )
    return asa_data, rsa_data

View File

@@ -0,0 +1,41 @@
"""
Models to predict binding affinity based on molecular properties.
"""
def IC_NIS(
    ic_cc: float,
    ic_ca: float,
    ic_pp: float,
    ic_pa: float,
    p_nis_a: float,
    p_nis_c: float,
) -> float:
    """
    Predicted binding affinity (kcal/mol) from the IC-NIS model: a linear
    combination of contact counts (charged-charged, charged-apolar,
    polar-polar, polar-apolar) and the apolar/charged NIS percentages,
    plus a constant offset.
    """
    # Terms are summed left-to-right, matching the original evaluation order.
    terms = (
        -0.09459 * ic_cc,
        -0.10007 * ic_ca,
        0.19577 * ic_pp,
        -0.22671 * ic_pa,
        0.18681 * p_nis_a,
        0.13810 * p_nis_c,
        -15.9433,
    )
    affinity = 0.0
    for term in terms:
        affinity += term
    return affinity
def NIS(p_nis_c: float, p_nis_p: float, n_int_atoms: float) -> float:
    """
    Predicted binding affinity from the NIS model: a linear combination of
    the polar and charged NIS percentages and the number of interface atoms,
    plus a constant offset.
    """
    # Terms are summed left-to-right, matching the original evaluation order.
    terms = (
        0.0856851248873 * p_nis_p,
        -0.0685254498746 * p_nis_c,
        0.0261591389985 * n_int_atoms,
        3.0124939659498,
    )
    affinity = 0.0
    for term in terms:
        affinity += term
    return affinity

View File

@@ -0,0 +1,187 @@
"""
Functions to read PDB/mmCIF files
"""
import logging
import sys
import typing
import warnings
from pathlib import Path
from typing import Optional, Union
from Bio.PDB.Atom import DisorderedAtom
from Bio.PDB.Chain import Chain
from Bio.PDB.MMCIFParser import MMCIFParser
from Bio.PDB.Model import Model
from Bio.PDB.PDBExceptions import PDBConstructionWarning
from Bio.PDB.PDBParser import PDBParser
from Bio.PDB.Polypeptide import PPBuilder, is_aa
from Bio.PDB.Structure import Structure
warnings.filterwarnings("ignore", category=PDBConstructionWarning)
log = logging.getLogger("Prodigy")
def get_parser(input_f: Path) -> Union[PDBParser, MMCIFParser]:
    """Pick the Biopython parser for a structure file: mmCIF for '.cif',
    the plain PDB parser for anything else."""
    is_mmcif = input_f.suffix == ".cif"
    return MMCIFParser() if is_mmcif else PDBParser()
def ignore(r):
    """True for residues whose hetfield starts with 'W' (water) or 'H'
    (HETATM, e.g. 'H_GLC') — these are stripped during structure cleaning."""
    hetfield_initial = r.id[0][0]
    return hetfield_initial in ("W", "H")
def validate_structure(
    input_strcture_obj: Structure,  # NOTE(review): parameter name has a typo ("strcture"); kept for keyword-arg compatibility
    selection: Optional[list[str]] = None,
    clean: bool = True,
) -> list[Model]:
    """
    Validate and clean every model of a parsed structure IN PLACE.

    Per model: keeps only selected chains (if a selection is given), collapses
    disordered atoms to their selected altloc, drops residues with insertion
    codes, optionally removes waters/HETATMs and hydrogens, and warns about
    chain gaps detected via polypeptide building.

    :param input_strcture_obj: parsed Biopython Structure (mutated in place).
    :param selection: chain groups, e.g. ["A,B", "C"]; every listed chain must
        exist in the structure.
    :param clean: when True, strip waters/HETATMs and hydrogens; raises on
        non-standard amino acids.
    :return: the list of (cleaned) models.
    :raises ValueError: on unknown selected chains or non-standard residues.
    """
    result: list[Model] = []
    for model in [m for m in input_strcture_obj.child_list]:
        # process selected chains
        chains: list[Chain] = list(model.get_chains())
        chain_ids = set([c.id for c in chains])
        if selection:
            sel_chains = []
            # Match selected chain with structure
            for sel in selection:
                for c_str in sel.split(","):
                    sel_chains.append(c_str)
                    if c_str not in chain_ids:
                        raise ValueError(
                            f"Selected chain not present in provided structure: {c_str}"
                        )

            # Remove unselected chains
            def _ignore_helper(x) -> bool:
                return x.id not in sel_chains

            for c in chains:
                if _ignore_helper(c):
                    if c.parent is not None:
                        c.parent.detach_child(c.id)
        # Double occupancy check: keep only the selected altloc of each
        # disordered atom and re-attach it as an ordinary atom.
        for atom in list(model.get_atoms()):
            if atom.is_disordered():
                atom = typing.cast(DisorderedAtom, atom)
                residue = atom.parent
                assert residue is not None
                sel_at = atom.selected_child
                assert sel_at is not None
                sel_at.altloc = " "
                sel_at.disordered_flag = 0
                residue.detach_child(atom.id)
                residue.add(sel_at)
        # Insertion code check: drop residues carrying an insertion code
        # (id[2] != " ") to keep numbering unambiguous.
        for c in chains:
            for residue in c.get_residues():
                if residue.get_id()[2] != " ":
                    c.detach_child(residue.id)
        if clean:
            # Remove HETATMs and solvent
            res_list = list(model.get_residues())
            for res in res_list:
                if ignore(res):
                    chain = res.parent
                    assert chain is not None
                    chain.detach_child(res.id)
                elif not is_aa(res, standard=True):
                    # Anything that is neither water/HETATM nor a standard
                    # amino acid is unsupported.
                    raise ValueError(
                        "Unsupported non-standard amino acid found: {0}".format(
                            res.resname
                        )
                    )
            # Remove Hydrogens
            atom_list = list(model.get_atoms())

            def _ignore(x):
                return x.element == "H"

            for atom in atom_list:
                if _ignore(atom):
                    residue = atom.parent
                    assert residue is not None
                    residue.detach_child(atom.name)
        # Detect gaps and compare with no. of chains: more peptide fragments
        # than chains implies at least one chain is broken.
        pep_builder = PPBuilder()
        peptides = pep_builder.build_peptides(model)
        n_peptides = len(peptides)
        if n_peptides != len(chain_ids):
            message = "[!] Structure contains gaps:\n"
            for i_pp, pp in enumerate(peptides):
                message += (
                    "\t{1.parent.id} {1.resname}{1.id[1]} < Fragment {0} > "
                    "{2.parent.id} {2.resname}{2.id[1]}\n".format(i_pp, pp[0], pp[-1])
                )
            log.warning(message)
        result.append(model)
    return result
def parse_structure(path: str) -> tuple[list[Model], int, int]:
    """
    Parse and validate a structure file.

    :param path: path to a .pdb/.cif/.ent file.
    :return: (validated models, number of chains, number of residues).
    :raises SystemExit: on unsupported extension, parser failure, or a
        structure containing no models.
    :raises ValueError: when the models disagree on chain or residue sets.
    """
    structure_path = Path(path)
    extension = structure_path.suffix
    supported_extensions = [".pdb", ".cif", ".ent"]
    if extension not in supported_extensions:
        log.error(
            f"[!] Structure format '{extension}' is "
            "not supported. Use '.pdb' or '.cif'."
        )
        sys.exit(1)
    parser = get_parser(structure_path)
    try:
        original_structure = parser.get_structure(structure_path.stem, structure_path)
    except Exception as e:
        log.exception(e)
        sys.exit(1)
    assert isinstance(original_structure, Structure)
    models: list[Model] = validate_structure(original_structure)
    # Guard: with no models, the chain/residue set checks below would
    # otherwise raise a bare IndexError.
    if not models:
        log.error("[!] No models found in structure")
        sys.exit(1)
    ## Make sure all models have the same chains
    # Get chain sets for all models
    chain_sets = [set(chain.id for chain in model.get_chains()) for model in models]
    # Check if all sets are identical
    if not all(chain_set == chain_sets[0] for chain_set in chain_sets):
        raise ValueError(
            "Not all models have the same chains. Found chain sets: "
            + ", ".join(str(s) for s in chain_sets)
        )
    # Same consistency requirement for residues.
    res_sets = [set(res.id for res in model.get_residues()) for model in models]
    if not all(res_set == res_sets[0] for res_set in res_sets):
        raise ValueError(
            "Not all models have the same residues. Found residue sets: "
            + ", ".join(str(s) for s in res_sets)
        )
    return (models, len(chain_sets[0]), len(res_sets[0]))

View File

@@ -0,0 +1,301 @@
import sys
from io import TextIOWrapper
from typing import Optional, TextIO, Union
from Bio.PDB.Model import Model
from Bio.PDB.NeighborSearch import NeighborSearch
#from Bio.PDB.Structure import Structure
from prodigy_prot.modules import aa_properties
from prodigy_prot.modules.freesasa_tools import execute_freesasa_api
from prodigy_prot.modules.models import IC_NIS
from prodigy_prot.modules.utils import dg_to_kd
def calculate_ic(
    model: Model, d_cutoff: float = 5.5, selection: Optional[dict[str, int]] = None
) -> list:
    """
    Calculates intermolecular contacts in a parsed struct object.

    :param model: Biopython model to search.
    :param d_cutoff: distance cutoff (presumably Angstrom, per the CLI help)
        within which two residues count as in contact.
    :param selection: optional chain-id -> group-index mapping; when given
        (and non-empty), only contacts between chains of *different* groups
        are kept.
    :return: sorted list of (residue, residue) contact pairs.
    :raises ValueError: when no contacts are found.
    """
    atom_list = list(model.get_atoms())
    # Residue-level ("R") all-vs-all neighbour search within the cutoff.
    ns = NeighborSearch(atom_list)
    all_list = ns.search_all(radius=d_cutoff, level="R")
    assert all_list is not None
    if selection:
        _sd = selection

        def _chain(x):
            # Chain id of a residue (its parent in the hierarchy).
            return x.parent.id

        # Keep pairs whose chains are both selected AND belong to
        # different selection groups (intra-group contacts are ignored).
        ic_list = [
            c
            for c in all_list
            if (_chain(c[0]) in _sd and _chain(c[1]) in _sd)
            and (_sd[_chain(c[0])] != _sd[_chain(c[1])])
        ]
    else:
        # No selection: any residue pair from two different chains counts.
        ic_list = [c for c in all_list if c[0].parent.id != c[1].parent.id]
    if not ic_list:
        raise ValueError("No contacts found for selection")
    ic_list.sort()
    return ic_list
def analyse_contacts(contact_list: list) -> dict[str, float]:
    """
    Tally contacts by the chemical character of the two residues.

    Each pair is classified twice: once with the charged/polar/apolar table
    (keys like "CC", "AP", ...) and once with the hydrophobicity table
    (keys "BB", "BL", "LL"). Pairs with an unknown residue name are skipped.

    :param contact_list: (residue, residue) pairs from `calculate_ic`.
    :return: dict of contact counts per two-letter pair type.
    """
    pair_types = ("AA", "PP", "CC", "AP", "CP", "AC", "LL", "BL", "BB")
    bins: dict[str, float] = dict.fromkeys(pair_types, 0.0)
    # Both classification passes share the same counting logic, so fold
    # over the two character tables instead of duplicating the loop.
    tables = (aa_properties.aa_character_ic, aa_properties.aa_character_hydro)
    for table in tables:
        for res_i, res_j in contact_list:
            char_i = table.get(res_i.resname)
            char_j = table.get(res_j.resname)
            if char_i is None or char_j is None:
                continue
            # Sort the two characters so e.g. "PA" and "AP" share one bin.
            bins["".join(sorted((char_i, char_j)))] += 1
    return bins
def analyse_nis(sasa_dict: dict, acc_threshold: float = 0.05) -> list[float]:
    """
    Returns the percentages of apolar, charged, and polar surface residues,
    according to an accessibility criterion.

    :param sasa_dict: (chain, resname, resid) -> relative accessibility, as
        produced by `execute_freesasa_api`.
    :param acc_threshold: minimum relative accessibility for a residue to
        count as surface.
    :return: [apolar%, charged%, polar%] — the order `predict` unpacks as
        (nis_a, nis_c, nis_p).
    :raises ValueError: when no residue passes the threshold (previously a
        bare ZeroDivisionError).
    """
    _data = aa_properties.aa_character_protorp
    _char_index = {"A": 0, "C": 1, "P": 2}
    count = [0, 0, 0]
    for res, rsa in sasa_dict.items():
        _, resn, _ = res
        if rsa >= acc_threshold:
            aa_index = _char_index.get(_data[resn])
            assert aa_index is not None
            count[aa_index] += 1
    # Hoisted: the original recomputed sum(count) inside the comprehension.
    total = sum(count)
    if total == 0:
        raise ValueError(
            "No surface residues found above accessibility threshold"
        )
    return [100.0 * x / total for x in count]
class Prodigy:
# init parameters
    def __init__(
        self,
        model: Model,
        name: str = "",
        selection: Optional[list[str]] = None,
        temp: float = 25.0,
    ):
        """
        Set up a binding-affinity prediction for one model.

        :param model: Biopython model holding the complex.
        :param name: identifier used when printing results.
        :param selection: chain groups (e.g. ["A,B", "C"]); defaults to one
            group per chain of the model.
        :param temp: temperature used later for the Kd conversion
            (Celsius, per the CLI help).
        """
        self.temp = float(temp)
        # Default selection: every chain in the model forms its own group.
        if selection is None:
            self.selection = [chain.id for chain in model.get_chains()]
        else:
            self.selection = selection
        self.model = model
        self.name = name
        # Result fields below are populated by predict().
        self.ic_network: list = []  # residue-residue contact pairs
        # Contact counts keyed by pair character:
        # A/C/P = apolar/charged/polar, B/L = hydrophoBic/hydrophiLic.
        self.bins: dict[str, float] = {
            "CC": 0.0,
            "CP": 0.0,
            "AC": 0.0,
            "PP": 0.0,
            "AP": 0.0,
            "AA": 0.0,
            "LL": 0.0,
            "BL": 0.0,
            "BB": 0.0
        }
        self.nis_a = 0.0  # % apolar NIS residues
        self.nis_c = 0.0  # % charged NIS residues
        self.nis_p = 0.0  # % polar NIS residues
        self.ba_val = 0.0  # predicted binding affinity (kcal.mol-1)
        self.kd_val = 0.0  # predicted dissociation constant (M)
    def predict(
        self,
        temp: Optional[float] = None,
        distance_cutoff: float = 5.5,
        acc_threshold: float = 0.05,
    ):
        """
        Run the full prediction pipeline: contacts -> contact classification
        -> SASA/NIS analysis -> IC-NIS affinity -> Kd conversion. Results are
        stored on the instance (ic_network, bins, nis_*, ba_val, kd_val).

        :param temp: optional override of the temperature set at construction.
        :param distance_cutoff: contact cutoff passed to `calculate_ic`.
        :param acc_threshold: surface-accessibility cutoff for NIS analysis.
        :raises ValueError: on overlapping selection groups or when no
            contacts are found.
        """
        if temp is not None:
            self.temp = temp
        # Make selection dict (chain id -> group index) from user option
        # or PDB chains; a chain may belong to only one group.
        selection_dict: dict[str, int] = {}
        for igroup, group in enumerate(self.selection):
            chains = group.split(",")
            for chain in chains:
                if chain in selection_dict:
                    errmsg = "Selections must be disjoint sets: " f"{chain} is repeated"
                    raise ValueError(errmsg)
                selection_dict[chain] = igroup
        # Contacts between different selection groups
        self.ic_network = calculate_ic(
            self.model, d_cutoff=distance_cutoff, selection=selection_dict
        )
        self.bins = analyse_contacts(self.ic_network)
        # SASA -> NIS percentages ([apolar, charged, polar])
        _, cmplx_sasa = execute_freesasa_api(self.model)
        self.nis_a, self.nis_c, self.nis_p = analyse_nis(cmplx_sasa, acc_threshold=acc_threshold)
        # Affinity Calculation (IC-NIS model), then convert deltaG to Kd
        # at the configured temperature.
        self.ba_val = IC_NIS(
            self.bins["CC"],
            self.bins["AC"],
            self.bins["PP"],
            self.bins["AP"],
            self.nis_a,
            self.nis_c,
        )
        self.kd_val = dg_to_kd(self.ba_val, self.temp)
def as_dict(self) -> dict:
return_dict = {
"model": self.model.id,
"selection": self.selection,
"temp": self.temp,
"ICs": len(self.ic_network),
"nis_a": self.nis_a,
"nis_c": self.nis_c,
"nis_p": self.nis_p,
"ba_val": self.ba_val,
"kd_val": self.kd_val,
}
return_dict.update(self.bins)
return return_dict
def print_prediction(self, outfile: str = "", quiet: bool = False, showall: bool = False) -> None:
handle: Union[TextIOWrapper, TextIO]
if outfile:
handle = open(outfile, "w")
else:
handle = sys.stdout
if quiet:
handle.write("{0}\t{1:8.3f}\n".format(self.name, self.ba_val))
else:
# Collect output lines in order
lines = []
lines.append(f"[+] No. of intermolecular contacts: {len(self.ic_network)}\n")
lines.append(f"[+] No. of Charged-Charged contacts: {self.bins['CC']}\n")
lines.append(f"[+] No. of Charged-Polar contacts: {self.bins['CP']}\n")
lines.append(f"[+] No. of Charged-Apolar contacts: {self.bins['AC']}\n")
lines.append(f"[+] No. of Polar-Polar contacts: {self.bins['PP']}\n")
lines.append(f"[+] No. of Apolar-Polar contacts: {self.bins['AP']}\n")
lines.append(f"[+] No. of Apolar-Apolar contacts: {self.bins['AA']}\n")
if showall:
lines.append(f"[+] No. of hydrophiLic-hydrophiLic contacts: {self.bins['LL']}\n")
lines.append(f"[+] No. of hydrophoBic-hydrophiLic contacts: {self.bins['BL']}\n")
lines.append(f"[+] No. of hydrophoBic-hydrophoBic contacts: {self.bins['BB']}\n")
lines.append(f"[+] Percentage of Polar NIS residues: {self.nis_p:3.2f}\n")
lines.append(f"[+] Percentage of Apolar NIS residues: {self.nis_a:3.2f}\n")
lines.append(f"[+] Percentage of Charged NIS residues: {self.nis_c:3.2f}\n")
lines.append(f"[++] predicted binding affinity (kcal.mol-1): {self.ba_val:8.1f}\n")
lines.append(f"[++] predicted dissociation constant (M) at {self.temp:.1f}˚C: {self.kd_val:8.1e}\n")
handle.writelines(lines)
if handle is not sys.stdout:
handle.close()
def print_contacts(self, outfile: str = "") -> None:
handle: Union[TextIOWrapper, TextIO]
if outfile:
handle = open(outfile, "w")
else:
handle = sys.stdout
for res1, res2 in self.ic_network:
_fmt_str = (
"{0.resname:>5s} {0.id[1]:5} {0.parent.id:>3s} {1.resname:>5s}"
" {1.id[1]:5} {1.parent.id:>3s}\n"
)
if res1.parent.id not in self.selection[0]:
res1, res2 = res2, res1
handle.write(_fmt_str.format(res1, res2))
if handle is not sys.stdout:
handle.close()
def print_pymol_script(self, outfile: str = "") -> None:
# Writing output PYMOL: pml script
# initialize array with chains and save chain selection string
selection_strings = []
chains: dict[str, set] = {}
for s in self.selection:
selection_strings.append(s.replace(",", "+"))
for c in s.split(","):
chains[c] = set()
# loop over pairs and add interface residues to respective chains
for pair in self.ic_network:
for r in pair:
chains[r.parent.id].add(str(r.id[1]))
# set output stream
handle = open(outfile, "w") if outfile else sys.stdout
# write default setup strings
handle.writelines(
[
"color silver\n",
"as cartoon\n",
"bg_color white\n",
"center\n",
"color lightblue, chain {}\n".format(selection_strings[0]),
"color lightpink, chain {}\n".format(selection_strings[1]),
]
)
# loop over interfaces construct selection strings
# and write interface related commands
for color, iface in [("blue", 1), ("hotpink", 2)]:
p_sel_string = " or ".join(
[
"chain {} and resi {}".format(c, "+".join(chains[c]))
for c in selection_strings[iface - 1].split("+")
]
)
handle.write("select iface{}, {}\n".format(iface, p_sel_string))
handle.write("color {}, iface{}\n".format(color, iface))
handle.write("show sticks, iface{}\n".format(iface))
# close file handle if applicable
if handle is not sys.stdout:
handle.close()

View File

@@ -0,0 +1,25 @@
"""
Assorted utility functions.
"""
import math
import os
def check_path(path: str) -> str:
    """
    Resolve ``path`` to an absolute path and verify it is a readable file.

    :param path: path to an existing, readable regular file.
    :return: the absolute path.
    :raises IOError: if the path does not exist, is not a regular file,
        or is not readable by the current user.
    """
    full_path = os.path.abspath(path)
    # isfile() rejects directories and missing paths; os.access() makes the
    # "readable" promise in the contract actually hold.
    if not os.path.isfile(full_path) or not os.access(full_path, os.R_OK):
        raise IOError("Could not read file: {0}".format(path))
    return full_path
def dg_to_kd(dg: float, temperature: float = 25.0) -> float:
    """
    Convert a binding free energy into a dissociation constant.

    :param dg: free energy of binding (kcal/mol).
    :param temperature: temperature in degrees Celsius.
    :return: the dissociation constant Kd, i.e. exp(dG / RT).
    """
    kelvin = temperature + 273.15
    # Gas constant R in kcal/(mol*K) multiplied by the absolute temperature.
    rt_kcal = 0.0019858775 * kelvin
    return math.exp(dg / rt_kcal)

3
tests/__init__.py Normal file
View File

@@ -0,0 +1,3 @@
from pathlib import Path

# Directory containing the PDB/mmCIF fixtures used across the test suite.
TEST_DATA = Path(__file__).parent / "test_data"

3020
tests/test_data/2oob.cif Normal file

File diff suppressed because it is too large Load Diff

1449
tests/test_data/2oob.pdb Normal file

File diff suppressed because it is too large Load Diff

1460
tests/test_data/dataset.json Normal file

File diff suppressed because it is too large Load Diff

78
tests/test_parsers.py Normal file
View File

@@ -0,0 +1,78 @@
from pathlib import Path
import pytest
from Bio.PDB.MMCIFParser import MMCIFParser
from Bio.PDB.PDBParser import PDBParser
from Bio.PDB.Structure import Structure
from prodigy_prot.modules.parsers import get_parser, parse_structure, validate_structure
from . import TEST_DATA
@pytest.fixture
def input_structure_cif():
    """Path to the 2oob mmCIF test fixture."""
    yield TEST_DATA / "2oob.cif"
@pytest.fixture
def input_structure_pdb() -> Path:
    """Path to the 2oob PDB test fixture."""
    return TEST_DATA / "2oob.pdb"
def test_get_parser_pdb(input_structure_pdb):
    """get_parser must hand back a PDBParser for a .pdb path."""
    result = get_parser(input_structure_pdb)
    assert isinstance(result, PDBParser)
def test_get_parser_cif(input_structure_cif):
    """get_parser must hand back an MMCIFParser for a .cif path."""
    result = get_parser(input_structure_cif)
    assert isinstance(result, MMCIFParser)
def test_validate_structure_pdb(input_structure_pdb):
    """validate_structure on a parsed PDB must return the structure's models."""
    structure = PDBParser().get_structure("test_structure", input_structure_pdb)
    assert isinstance(structure, Structure)
    assert validate_structure(structure) == structure.child_list
def test_validate_structure_cif(input_structure_cif):
    """validate_structure on a parsed mmCIF must return the structure's models."""
    structure = MMCIFParser().get_structure("test_structure", input_structure_cif)
    assert isinstance(structure, Structure)
    assert validate_structure(structure) == structure.child_list
def test_parse_structure_pdb(input_structure_pdb):
    """parse_structure(pdb) must return the models plus chain/residue counts."""
    reference = PDBParser().get_structure(input_structure_pdb.stem, input_structure_pdb)
    assert isinstance(reference, Structure)
    models, n_chains, n_res = parse_structure(input_structure_pdb)
    assert models == reference.child_list
    assert (n_chains, n_res) == (2, 116)
def test_parse_structure_cif(input_structure_cif):
    """parse_structure(cif) must return the models plus chain/residue counts."""
    reference = MMCIFParser().get_structure(input_structure_cif.stem, input_structure_cif)
    assert isinstance(reference, Structure)
    models, n_chains, n_res = parse_structure(input_structure_cif)
    assert models == reference.child_list
    assert (n_chains, n_res) == (2, 116)

239
tests/test_prodigy.py Normal file
View File

@@ -0,0 +1,239 @@
import json
import tarfile
import tempfile
from io import BufferedReader, TextIOWrapper
from os.path import basename, splitext
from pathlib import Path
import pytest
from Bio.PDB.Model import Model
from Bio.PDB.PDBParser import PDBParser
from Bio.PDB.Residue import Residue
from Bio.PDB.Structure import Structure
from prodigy_prot.modules.parsers import validate_structure
from prodigy_prot.modules.prodigy import (
Prodigy,
analyse_contacts,
analyse_nis,
calculate_ic,
)
from . import TEST_DATA
@pytest.fixture
def input_model():
    """First model of the parsed 2oob PDB structure."""
    pdb_path = Path(TEST_DATA, "2oob.pdb")
    structure = PDBParser().get_structure(pdb_path.stem, pdb_path)
    assert isinstance(structure, Structure)
    return structure.child_list[0]
@pytest.fixture
def compressed_dataset_f():
    """Tarball containing the benchmark dataset PDB files."""
    return TEST_DATA / "dataset.tgz"
@pytest.fixture
def expected_dataset_json():
    """JSON file with the expected per-case prediction values."""
    return TEST_DATA / "dataset.json"
@pytest.fixture
def prodigy_class(input_model):
    """Fresh Prodigy instance wrapping the 2oob model."""
    yield Prodigy(input_model)
def test_calculate_ic(input_model):
    """2oob at a 5.5 cutoff must yield 78 contacts, the first being ASN-LYS."""
    contacts = calculate_ic(model=input_model, d_cutoff=5.5)
    assert len(contacts) == 78
    first: tuple[Residue, Residue] = contacts[0]
    assert (first[0].get_resname(), first[1].get_resname()) == ("ASN", "LYS")
def test_calculate_ic_with_selection(input_model):
    """An explicit A/B selection must reproduce the default contact list."""
    contacts = calculate_ic(model=input_model, d_cutoff=5.5, selection={"A": 0, "B": 1})
    assert len(contacts) == 78
    first: tuple[Residue, Residue] = contacts[0]
    assert (first[0].get_resname(), first[1].get_resname()) == ("ASN", "LYS")
def test_analyse_contacts(input_model):
    """A single A931-B6 contact must land in the CP and LL bins only."""
    pair = (input_model["A"][(" ", 931, " ")], input_model["B"][(" ", 6, " ")])
    observed = analyse_contacts([pair])
    expected = {
        "CC": 0.0,
        "CP": 1.0,
        "AC": 0.0,
        "PP": 0.0,
        "AP": 0.0,
        "AA": 0.0,
        "LL": 1.0,
        "BL": 0.0,
        "BB": 0.0,
    }
    assert observed == expected
def test_analyse_nis():
    """A single accessible ARG residue must register as 100% in the middle bin."""
    rel_sasa = {("B", "ARG", "72"): 0.9}
    apolar, polar, charged = analyse_nis(rel_sasa)
    assert (apolar, polar, charged) == (0.0, 100.0, 0.0)
def test_prodigy_predict(prodigy_class):
    """End-to-end prediction on 2oob should land near the reference values."""
    prodigy_class.predict()
    # (attribute, expected value, absolute tolerance); kd_val is the
    # actual prediction of interest.
    for attr, expected, tol in [
        ("nis_a", 35.5, 1.0),
        ("nis_c", 38.0, 1.0),
        ("ba_val", -6.2, 1.0),
        ("kd_val", 2.7e-5, 1e-6),
    ]:
        assert getattr(prodigy_class, attr) == pytest.approx(expected, abs=tol)
def test_prodigy_as_dict(prodigy_class):
    """as_dict must expose the 9 scalar fields plus the 9 contact bins."""
    result = prodigy_class.as_dict()
    assert isinstance(result, dict)
    assert len(result) == 18
def test_prodigy_print_prediction(prodigy_class):
    """print_prediction must write a non-empty report to the target file."""
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        out_path = Path(tmp.name)
    assert out_path.stat().st_size == 0
    prodigy_class.print_prediction(str(out_path))
    assert out_path.stat().st_size != 0
    out_path.unlink()
def test_prodigy_print_prediction_quiet(prodigy_class):
    """Quiet mode must still produce output in the target file."""
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        out_path = Path(tmp.name)
    assert out_path.stat().st_size == 0
    prodigy_class.print_prediction(str(out_path), True)
    assert out_path.stat().st_size != 0
    out_path.unlink()
def test_prodigy_print_contacts(input_model, prodigy_class):
    """print_contacts must write the injected contact pair to the file."""
    pair = (input_model["A"][(" ", 931, " ")], input_model["B"][(" ", 6, " ")])
    prodigy_class.ic_network = [pair]
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        out_path = Path(tmp.name)
    assert out_path.stat().st_size == 0
    prodigy_class.print_contacts(str(out_path))
    assert out_path.stat().st_size != 0
    out_path.unlink()
def test_print_pymol_script(input_model, prodigy_class):
    """print_pymol_script must emit a non-empty .pml file for one contact."""
    pair = (input_model["A"][(" ", 931, " ")], input_model["B"][(" ", 6, " ")])
    prodigy_class.ic_network = [pair]
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        out_path = Path(tmp.name)
    assert out_path.stat().st_size == 0
    prodigy_class.print_pymol_script(str(out_path))
    assert out_path.stat().st_size != 0
    out_path.unlink()
@pytest.mark.integration
def test_dataset_prediction(compressed_dataset_f, expected_dataset_json):
    """
    Integration test: run the full Prodigy pipeline on every PDB in the
    benchmark tarball and compare against the reference values in
    dataset.json. Contact-bin counts must match exactly; nis_a, nis_c and
    ba_val must be within 2% of the expected values.
    """
    # load expected data from json
    with open(expected_dataset_json) as fh:
        expected_data = json.load(fh)
    # load dataset PDBs
    dataset = tarfile.open(compressed_dataset_f)
    parser = PDBParser(QUIET=True)
    # bins that must match the reference exactly
    keys_equal = ["AA", "PP", "CC", "AP", "CP", "AC"]
    # relative deviations collected per tolerance-checked quantity
    diffs = {"ba_val": [], "nis_a": [], "nis_c": []}
    # run prodigy for each dataset in the PDB
    for entry in dataset:
        s_name, s_ext = splitext(basename(entry.name))
        # skip system files in archive
        if not s_name.isalnum() or s_ext != ".pdb":
            continue
        handle = dataset.extractfile(entry)
        # Wrap filehandle to ensure string file handle in Python 3
        handle = TextIOWrapper(BufferedReader(handle))  # type: ignore
        parsed_structure = parser.get_structure(s_name, handle)
        assert isinstance(parsed_structure, Structure)
        models = validate_structure(parsed_structure, selection=["A", "B"])
        # Test for structure object
        # Check if it's a list and all elements are Model objects
        assert isinstance(models, list) and all(
            isinstance(item, Model) for item in models
        )
        # assert isinstance(s, list[Model])
        # run prediction and retrieve result dict
        for m in models:
            prod = Prodigy(m, selection=["A", "B"])
            prod.predict()
            results = prod.as_dict()
            # check for equality of predicted interface residues
            for k in keys_equal:
                observed_value = results[k]
                expected_value = expected_data[s_name][k]
                assert observed_value == pytest.approx(expected_value)
            # check that NIS and binding affinity values are within 2% of
            # expected values and add diffs for summary
            for k in diffs.keys():
                # relative deviation from the reference value
                delta = abs(results[k] / expected_data[s_name][k] - 1)
                # assume a difference of less than 2%
                assert delta == pytest.approx(0, abs=0.02)
                diffs[k].append(delta)

21
tests/test_utils.py Normal file
View File

@@ -0,0 +1,21 @@
import math
import tempfile
from pathlib import Path
from prodigy_prot.modules.utils import check_path, dg_to_kd
def test_check_path():
    """check_path should echo back the (already absolute) temp-file path."""
    tmp = tempfile.NamedTemporaryFile(delete=False)
    assert check_path(tmp.name) == tmp.name
    Path(tmp.name).unlink()
def test_dg_to_kd():
assert math.isclose(dg_to_kd(0.0), 1.0, rel_tol=1e-9)