Configure PRODIGY pipeline for WES execution with S3 and Harbor
This commit is contained in:
49
.github/workflows/ci.yml
vendored
Normal file
49
.github/workflows/ci.yml
vendored
Normal file
@@ -0,0 +1,49 @@
|
||||
name: ci
|
||||
|
||||
on: push
|
||||
|
||||
jobs:
|
||||
test:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
|
||||
fail-fast: false
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
|
||||
- run: pip install '.[dev]'
|
||||
|
||||
- name: check types
|
||||
run: mypy .
|
||||
|
||||
- name: run unittests
|
||||
run: >-
|
||||
pytest
|
||||
-m "not integration"
|
||||
--cov
|
||||
--cov-report xml:coverage.xml
|
||||
--cov-append
|
||||
-vv
|
||||
--hypothesis-show-statistics
|
||||
|
||||
- name: run integration tests
|
||||
run: >-
|
||||
pytest
|
||||
-m integration
|
||||
--cov
|
||||
--cov-report xml:coverage.xml
|
||||
--cov-append
|
||||
-vv
|
||||
--hypothesis-show-statistics
|
||||
|
||||
- name: Run codacy-coverage-reporter
|
||||
uses: codacy/codacy-coverage-reporter-action@v1
|
||||
with:
|
||||
project-token: ${{ secrets.CODACY_PROJECT_TOKEN }}
|
||||
coverage-reports: coverage.xml
|
||||
48
.github/workflows/docker-publish.yml
vendored
Normal file
48
.github/workflows/docker-publish.yml
vendored
Normal file
@@ -0,0 +1,48 @@
|
||||
#
|
||||
name: Create and publish a Docker image
|
||||
|
||||
on:
|
||||
push:
|
||||
# run only against tags
|
||||
tags:
|
||||
- "*"
|
||||
|
||||
env:
|
||||
REGISTRY: ghcr.io
|
||||
IMAGE_NAME: ${{ github.repository }}
|
||||
|
||||
# There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu.
|
||||
jobs:
|
||||
build-and-push-image:
|
||||
runs-on: ubuntu-latest
|
||||
# Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job.
|
||||
permissions:
|
||||
contents: read
|
||||
packages: write
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
|
||||
# Uses the `docker/login-action` action to log in to the Container registry registry using the account and password that will publish the packages. Once published, the packages are scoped to the account defined here.
|
||||
- name: Log in to the Container registry
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ${{ env.REGISTRY }}
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
# This step uses [docker/metadata-action](https://github.com/docker/metadata-action#about) to extract tags and labels that will be applied to the specified image. The `id` "meta" allows the output of this step to be referenced in a subsequent step. The `images` value provides the base name for the tags and labels.
|
||||
- name: Extract metadata (tags, labels) for Docker
|
||||
id: meta
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
|
||||
|
||||
- name: Build and push
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
push: ${{ github.event_name != 'pull_request' }}
|
||||
tags: ${{ steps.meta.outputs.tags }}
|
||||
labels: ${{ steps.meta.outputs.labels }}
|
||||
34
.github/workflows/publish.yml
vendored
Normal file
34
.github/workflows/publish.yml
vendored
Normal file
@@ -0,0 +1,34 @@
|
||||
name: publish to pypi
|
||||
|
||||
on:
|
||||
release:
|
||||
types: [published]
|
||||
|
||||
jobs:
|
||||
pypi_release:
|
||||
name: builds and publishes to pypi
|
||||
runs-on: ubuntu-latest
|
||||
environment:
|
||||
name: pypi
|
||||
url: https://pypi.org/p/prodigy-prot
|
||||
permissions:
|
||||
id-token: write
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.13"
|
||||
|
||||
- name: install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
python -m pip install --upgrade build
|
||||
|
||||
- name: build
|
||||
run: |
|
||||
python -m build
|
||||
|
||||
- name: Publish package distributions to PyPI
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
27
.github/workflows/stale.yml
vendored
Normal file
27
.github/workflows/stale.yml
vendored
Normal file
@@ -0,0 +1,27 @@
|
||||
name: "Close stale issues and PRs"
|
||||
on:
|
||||
schedule:
|
||||
- cron: "30 1 * * *"
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
stale:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: write
|
||||
issues: write
|
||||
pull-requests: write
|
||||
actions: write
|
||||
steps:
|
||||
- uses: actions/stale@v10
|
||||
with:
|
||||
stale-pr-message: "This PR is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 5 days."
|
||||
stale-issue-message: "This issue is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 5 days."
|
||||
close-pr-message: 'This PR was closed because it has been stalled for 5 days with no activity.'
|
||||
close-issue-message: 'This issue was closed because it has been stalled for 5 days with no activity.'
|
||||
days-before-stale: 30
|
||||
days-before-close: 5
|
||||
exempt-issue-labels: "bug"
|
||||
exempt-pr-labels: "bug"
|
||||
remove-stale-when-updated: true
|
||||
operations-per-run: 100
|
||||
13
.gitignore
vendored
Normal file
13
.gitignore
vendored
Normal file
@@ -0,0 +1,13 @@
|
||||
work/
|
||||
.nextflow/
|
||||
.nextflow.log*
|
||||
*.log.*
|
||||
results/
|
||||
__pycache__/
|
||||
*.pyc
|
||||
.docker/
|
||||
.vscode/
|
||||
.idea/
|
||||
*.tmp
|
||||
*.swp
|
||||
tests/test_data/dataset.tgz
|
||||
9
.howfairis.yml
Normal file
9
.howfairis.yml
Normal file
@@ -0,0 +1,9 @@
|
||||
## Uncomment a line if you want to skip a given category of checks
|
||||
|
||||
#skip_repository_checks_reason: <reason for skipping goes here>
|
||||
#skip_license_checks_reason: <reason for skipping goes here>
|
||||
#skip_registry_checks_reason: <reason for skipping goes here>
|
||||
#skip_citation_checks_reason: <reason for skipping goes here>
|
||||
skip_checklist_checks_reason: "I'm using the Codacy dashboard to guide my development"
|
||||
|
||||
ignore_commented_badges: false
|
||||
47
CITATION.cff
Normal file
47
CITATION.cff
Normal file
@@ -0,0 +1,47 @@
|
||||
# This CITATION.cff file was generated with cffinit.
|
||||
# Visit https://bit.ly/cffinit to generate yours today!
|
||||
|
||||
cff-version: 1.2.0
|
||||
title: Prodigy
|
||||
message: >-
|
||||
If you use this software, please cite it using the
|
||||
metadata from this file.
|
||||
type: software
|
||||
authors:
|
||||
- given-names: Anna
|
||||
family-names: Vangone
|
||||
affiliation: Utrecht University
|
||||
- given-names: Alexandre
|
||||
name-particle: MJJ
|
||||
family-names: Bonvin
|
||||
affiliation: Utrecht University
|
||||
- given-names: Joerg
|
||||
family-names: Schaarschmidt
|
||||
affiliation: Utrecht University
|
||||
- given-names: Rodrigo
|
||||
family-names: Vargas Honorato
|
||||
affiliation: Utrecht University
|
||||
- given-names: Brian
|
||||
family-names: Jimenez
|
||||
affiliation: Utrecht University
|
||||
- given-names: Joao
|
||||
family-names: Rodrigues
|
||||
affiliation: Utrecht University
|
||||
identifiers:
|
||||
- type: doi
|
||||
value: 10.1093/bioinformatics/btw514
|
||||
description: DOI of the web service version
|
||||
- type: doi
|
||||
value: 10.7554/eLife.07454
|
||||
- type: doi
|
||||
value: 10.1016/j.jmb.2014.04.017
|
||||
repository-code: 'https://github.com/haddocking/prodigy'
|
||||
url: 'https://wenmr.science.uu.nl/prodigy'
|
||||
abstract: >-
|
||||
A tool to predict binding affinity values for
|
||||
protein-protein complexes from atomic structures.
|
||||
keywords:
|
||||
- binding affinity
|
||||
- computational biology
|
||||
- protein-protein
|
||||
license: Apache-2.0
|
||||
132
CODE_OF_CONDUCT.md
Normal file
132
CODE_OF_CONDUCT.md
Normal file
@@ -0,0 +1,132 @@
|
||||
# Contributor Covenant Code of Conduct
|
||||
|
||||
## Our Pledge
|
||||
|
||||
We as members, contributors, and leaders pledge to make participation in our
|
||||
community a harassment-free experience for everyone, regardless of age, body
|
||||
size, visible or invisible disability, ethnicity, sex characteristics, gender
|
||||
identity and expression, level of experience, education, socio-economic status,
|
||||
nationality, personal appearance, race, caste, color, religion, or sexual
|
||||
identity and orientation.
|
||||
|
||||
We pledge to act and interact in ways that contribute to an open, welcoming,
|
||||
diverse, inclusive, and healthy community.
|
||||
|
||||
## Our Standards
|
||||
|
||||
Examples of behavior that contributes to a positive environment for our
|
||||
community include:
|
||||
|
||||
- Demonstrating empathy and kindness toward other people
|
||||
- Being respectful of differing opinions, viewpoints, and experiences
|
||||
- Giving and gracefully accepting constructive feedback
|
||||
- Accepting responsibility and apologizing to those affected by our mistakes,
|
||||
and learning from the experience
|
||||
- Focusing on what is best not just for us as individuals, but for the overall
|
||||
community
|
||||
|
||||
Examples of unacceptable behavior include:
|
||||
|
||||
- The use of sexualized language or imagery, and sexual attention or advances of
|
||||
any kind
|
||||
- Trolling, insulting or derogatory comments, and personal or political attacks
|
||||
- Public or private harassment
|
||||
- Publishing others' private information, such as a physical or email address,
|
||||
without their explicit permission
|
||||
- Other conduct which could reasonably be considered inappropriate in a
|
||||
professional setting
|
||||
|
||||
## Enforcement Responsibilities
|
||||
|
||||
Community leaders are responsible for clarifying and enforcing our standards of
|
||||
acceptable behavior and will take appropriate and fair corrective action in
|
||||
response to any behavior that they deem inappropriate, threatening, offensive,
|
||||
or harmful.
|
||||
|
||||
Community leaders have the right and responsibility to remove, edit, or reject
|
||||
comments, commits, code, wiki edits, issues, and other contributions that are
|
||||
not aligned to this Code of Conduct, and will communicate reasons for moderation
|
||||
decisions when appropriate.
|
||||
|
||||
## Scope
|
||||
|
||||
This Code of Conduct applies within all community spaces, and also applies when
|
||||
an individual is officially representing the community in public spaces.
|
||||
Examples of representing our community include using an official e-mail address,
|
||||
posting via an official social media account, or acting as an appointed
|
||||
representative at an online or offline event.
|
||||
|
||||
## Enforcement
|
||||
|
||||
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
||||
reported to the community leaders responsible for enforcement at
|
||||
`prodigy.bonvinlab@gmail.com`.
|
||||
All complaints will be reviewed and investigated promptly and fairly.
|
||||
|
||||
All community leaders are obligated to respect the privacy and security of the
|
||||
reporter of any incident.
|
||||
|
||||
## Enforcement Guidelines
|
||||
|
||||
Community leaders will follow these Community Impact Guidelines in determining
|
||||
the consequences for any action they deem in violation of this Code of Conduct:
|
||||
|
||||
### 1. Correction
|
||||
|
||||
**Community Impact**: Use of inappropriate language or other behavior deemed
|
||||
unprofessional or unwelcome in the community.
|
||||
|
||||
**Consequence**: A private, written warning from community leaders, providing
|
||||
clarity around the nature of the violation and an explanation of why the
|
||||
behavior was inappropriate. A public apology may be requested.
|
||||
|
||||
### 2. Warning
|
||||
|
||||
**Community Impact**: A violation through a single incident or series of
|
||||
actions.
|
||||
|
||||
**Consequence**: A warning with consequences for continued behavior. No
|
||||
interaction with the people involved, including unsolicited interaction with
|
||||
those enforcing the Code of Conduct, for a specified period of time. This
|
||||
includes avoiding interactions in community spaces as well as external channels
|
||||
like social media. Violating these terms may lead to a temporary or permanent
|
||||
ban.
|
||||
|
||||
### 3. Temporary Ban
|
||||
|
||||
**Community Impact**: A serious violation of community standards, including
|
||||
sustained inappropriate behavior.
|
||||
|
||||
**Consequence**: A temporary ban from any sort of interaction or public
|
||||
communication with the community for a specified period of time. No public or
|
||||
private interaction with the people involved, including unsolicited interaction
|
||||
with those enforcing the Code of Conduct, is allowed during this period.
|
||||
Violating these terms may lead to a permanent ban.
|
||||
|
||||
### 4. Permanent Ban
|
||||
|
||||
**Community Impact**: Demonstrating a pattern of violation of community
|
||||
standards, including sustained inappropriate behavior, harassment of an
|
||||
individual, or aggression toward or disparagement of classes of individuals.
|
||||
|
||||
**Consequence**: A permanent ban from any sort of public interaction within the
|
||||
community.
|
||||
|
||||
## Attribution
|
||||
|
||||
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
|
||||
version 2.1, available at
|
||||
[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
|
||||
|
||||
Community Impact Guidelines were inspired by
|
||||
[Mozilla's code of conduct enforcement ladder][Mozilla CoC].
|
||||
|
||||
For answers to common questions about this code of conduct, see the FAQ at
|
||||
[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
|
||||
[https://www.contributor-covenant.org/translations][translations].
|
||||
|
||||
[homepage]: https://www.contributor-covenant.org
|
||||
[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
|
||||
[Mozilla CoC]: https://github.com/mozilla/diversity
|
||||
[FAQ]: https://www.contributor-covenant.org/faq
|
||||
[translations]: https://www.contributor-covenant.org/translations
|
||||
17
CONTRIBUTING.md
Normal file
17
CONTRIBUTING.md
Normal file
@@ -0,0 +1,17 @@
|
||||
# Contributing with PRODIGY
|
||||
|
||||
## Reporting issues
|
||||
|
||||
If you find a bug or have a feature request, please report it in the [issue tracker](https://github.com/haddocking/prodigy/issues)
|
||||
|
||||
## Contributing code
|
||||
|
||||
We welcome contributions to PRODIGY. If you would like to contribute, please fork the repository and make a pull request.
|
||||
|
||||
## Development conventions
|
||||
|
||||
Please refer to the [development guidelines](DEVELOPMENT.md) for more details.
|
||||
|
||||
## Contact
|
||||
|
||||
If you have any questions, please contact us at [ask.bioexcel.eu](https://ask.bioexcel.eu)
|
||||
36
DEVELOPMENT.md
Normal file
36
DEVELOPMENT.md
Normal file
@@ -0,0 +1,36 @@
|
||||
# PRODIGY Development
|
||||
|
||||
## Installation
|
||||
|
||||
We use `poetry` to manage the dependencies and the virtual environment, so you need to install it first; check the [official documentation](https://python-poetry.org/docs/#installation) for more details.
|
||||
|
||||
Clone the repository and install the dependencies:
|
||||
|
||||
```text
|
||||
git clone https://github.com/haddocking/prodigy.git && cd prodigy
|
||||
poetry install
|
||||
```
|
||||
|
||||
## Testing
|
||||
|
||||
To run the tests, use the following command:
|
||||
|
||||
```text
|
||||
python -m unittest
|
||||
```
|
||||
|
||||
## Code style
|
||||
|
||||
We use `trunk` as the "all-purpose" linting tool, check its [documentation](https://docs.trunk.io/docs/install).
|
||||
|
||||
To check for code style issues, run:
|
||||
|
||||
```text
|
||||
trunk check
|
||||
```
|
||||
|
||||
To automatically fix the issues, run:
|
||||
|
||||
```text
|
||||
trunk fmt
|
||||
```
|
||||
40
Dockerfile
Normal file
40
Dockerfile
Normal file
@@ -0,0 +1,40 @@
|
||||
FROM python:3.12
|
||||
|
||||
LABEL maintainer="Omic"
|
||||
LABEL description="PRODIGY - PROtein binDIng enerGY prediction"
|
||||
LABEL version="2.4.0"
|
||||
|
||||
# Set environment variables
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ENV PYTHONUNBUFFERED=1
|
||||
|
||||
# Install system dependencies required for freesasa compilation
|
||||
RUN apt-get update -y && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
gcc \
|
||||
g++ \
|
||||
make \
|
||||
procps \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Upgrade pip
|
||||
RUN pip install --no-cache-dir --upgrade pip
|
||||
|
||||
# Install PRODIGY and its dependencies
|
||||
# Dependencies: biopython>=1.80, freesasa>=2.2.1, numpy>=2
|
||||
RUN pip install --no-cache-dir \
|
||||
"biopython>=1.80" \
|
||||
"freesasa>=2.2.1" \
|
||||
"numpy>=2"
|
||||
|
||||
# Install PRODIGY
|
||||
RUN pip install --no-cache-dir prodigy-prot==2.4.0
|
||||
|
||||
# Verify installation
|
||||
RUN prodigy --help
|
||||
|
||||
# Set working directory
|
||||
WORKDIR /data
|
||||
|
||||
CMD ["prodigy", "--help"]
|
||||
190
LICENSE
Normal file
190
LICENSE
Normal file
@@ -0,0 +1,190 @@
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
Copyright 2015 Anna Vangone, Panagiotis Kastritis, Alexandre Bonvin
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
3
MANIFEST.in
Normal file
3
MANIFEST.in
Normal file
@@ -0,0 +1,3 @@
|
||||
include README.md
|
||||
include src/prodigy_prot/data/naccess.config
|
||||
|
||||
341
README.md
Normal file
341
README.md
Normal file
@@ -0,0 +1,341 @@
|
||||
# PRODIGY Nextflow Pipeline
|
||||
|
||||
A Nextflow pipeline for predicting binding affinity of protein-protein complexes using PRODIGY (PROtein binDIng enerGY prediction).
|
||||
|
||||
## Overview
|
||||
|
||||
PRODIGY is a contact-based method for predicting the binding affinity of protein-protein complexes from their 3D structures. This pipeline containerizes PRODIGY using Docker and orchestrates execution through Nextflow, enabling reproducible, scalable analysis of protein-protein interactions.
|
||||
|
||||
### Key Features
|
||||
|
||||
- **Automated binding affinity prediction** from PDB/mmCIF structures
|
||||
- **Batch processing** of multiple protein complexes
|
||||
- **Docker containerization** for reproducibility
|
||||
- **Configurable parameters** for distance cutoffs, temperature, and chain selection
|
||||
- **Optional outputs** including contact lists and PyMOL visualization scripts
|
||||
|
||||
## Scientific Background
|
||||
|
||||
PRODIGY predicts binding affinity by analyzing intermolecular contacts (ICs) at protein-protein interfaces. The method:
|
||||
|
||||
1. Identifies residue-residue contacts within a distance threshold (default: 5.5 Å)
|
||||
2. Classifies contacts by residue type (charged, polar, apolar)
|
||||
3. Analyzes the non-interacting surface (NIS) composition
|
||||
4. Predicts binding free energy (ΔG) and dissociation constant (Kd)
|
||||
|
||||
The 5.5 Å distance cutoff was optimized to capture various non-bonded interactions including salt bridges, hydrogen bonds, and hydrophobic contacts.
|
||||
|
||||
## Requirements
|
||||
|
||||
### Software Dependencies
|
||||
|
||||
- [Nextflow](https://www.nextflow.io/) (≥21.04.0)
|
||||
- [Docker](https://www.docker.com/) (≥20.10) or [Singularity](https://sylabs.io/singularity/) (≥3.0)
|
||||
|
||||
### Hardware Requirements
|
||||
|
||||
- CPU: 1+ cores per process
|
||||
- Memory: 4 GB minimum recommended
|
||||
- Storage: ~2 GB for Docker image
|
||||
|
||||
## Installation
|
||||
|
||||
### 1. Clone or Download the Pipeline
|
||||
|
||||
```bash
|
||||
# Create pipeline directory
|
||||
mkdir -p /path/to/prodigy_pipeline
|
||||
cd /path/to/prodigy_pipeline
|
||||
|
||||
# Copy pipeline files (Dockerfile, main.nf, nextflow.config, params.json)
|
||||
```
|
||||
|
||||
### 2. Build the Docker Image
|
||||
|
||||
```bash
|
||||
docker build -t prodigy:latest .
|
||||
```
|
||||
|
||||
### 3. Verify Installation
|
||||
|
||||
```bash
|
||||
# Test Docker image
|
||||
docker run --rm prodigy:latest prodigy --help
|
||||
|
||||
# Test Nextflow
|
||||
nextflow run main.nf --help
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
### Basic Usage
|
||||
|
||||
```bash
|
||||
# Run on a single PDB file
|
||||
nextflow run main.nf --pdb /path/to/complex.pdb --outdir /path/to/output
|
||||
|
||||
# Run on multiple PDB files
|
||||
nextflow run main.nf --pdb '/path/to/structures/*.pdb' --outdir /path/to/output
|
||||
```
|
||||
|
||||
### With Custom Parameters
|
||||
|
||||
```bash
|
||||
nextflow run main.nf \
|
||||
--pdb '/path/to/structures/*.pdb' \
|
||||
--outdir /path/to/output \
|
||||
--distance_cutoff 5.5 \
|
||||
--acc_threshold 0.05 \
|
||||
--temperature 37.0 \
|
||||
--contact_list true \
|
||||
--pymol_selection true
|
||||
```
|
||||
|
||||
### Chain Selection for Complex Interfaces
|
||||
|
||||
For antibody-antigen complexes or multi-chain proteins:
|
||||
|
||||
```bash
|
||||
# Contacts between chains A and B only
|
||||
nextflow run main.nf --pdb complex.pdb --selection 'A B'
|
||||
|
||||
# Heavy (H) and Light (L) chains as one molecule vs Antigen (A)
|
||||
nextflow run main.nf --pdb antibody_antigen.pdb --selection 'H,L A'
|
||||
|
||||
# Three-way interface calculation
|
||||
nextflow run main.nf --pdb complex.pdb --selection 'A B C'
|
||||
```
|
||||
|
||||
### Using Singularity
|
||||
|
||||
```bash
|
||||
nextflow run main.nf -profile singularity --pdb /path/to/complex.pdb
|
||||
```
|
||||
|
||||
## Parameters
|
||||
|
||||
### Required Parameters
|
||||
|
||||
| Parameter | Description | Default |
|
||||
|-----------|-------------|---------|
|
||||
| `--pdb` | Path to input PDB/mmCIF file(s). Supports glob patterns. | `/mnt/OmicNAS/private/old/olamide/Prodigy/input/*.pdb` |
|
||||
| `--outdir` | Output directory for results | `/mnt/OmicNAS/private/old/olamide/Prodigy/output` |
|
||||
|
||||
### Analysis Parameters
|
||||
|
||||
| Parameter | Description | Default | Range |
|
||||
|-----------|-------------|---------|-------|
|
||||
| `--distance_cutoff` | Distance threshold (Å) for defining intermolecular contacts | `5.5` | 1.0 - 20.0 |
|
||||
| `--acc_threshold` | Relative accessibility threshold for surface residue identification | `0.05` | 0.0 - 1.0 |
|
||||
| `--temperature` | Temperature (°C) for Kd calculation | `25.0` | -273.15 - 100.0 |
|
||||
| `--selection` | Chain selection for interface calculation | `''` (all chains) | See examples |
|
||||
|
||||
### Output Control Parameters
|
||||
|
||||
| Parameter | Description | Default |
|
||||
|-----------|-------------|---------|
|
||||
| `--contact_list` | Generate detailed contact list file | `false` |
|
||||
| `--pymol_selection` | Generate PyMOL visualization script | `false` |
|
||||
| `--quiet` | Output only affinity values (minimal output) | `false` |
|
||||
|
||||
## Output Files
|
||||
|
||||
### Standard Output
|
||||
|
||||
For each input structure `<name>.pdb`, the pipeline generates:
|
||||
|
||||
| File | Description |
|
||||
|------|-------------|
|
||||
| `<name>_prodigy.txt` | Main results file with binding affinity prediction |
|
||||
|
||||
### Optional Output (when enabled)
|
||||
|
||||
| File | Description | Parameter |
|
||||
|------|-------------|-----------|
|
||||
| `<name>_contacts.txt` | List of all interface contacts | `--contact_list true` |
|
||||
| `<name>_interface.pml` | PyMOL script for interface visualization | `--pymol_selection true` |
|
||||
|
||||
### Example Output
|
||||
|
||||
```
|
||||
[!] Structure contains gaps:
|
||||
E ILE16 < Fragment 0 > E ALA183
|
||||
E TYR184 < Fragment 1 > E GLY187
|
||||
|
||||
[+] Executing 1 task(s) in total
|
||||
##########################################
|
||||
[+] Processing structure 1ppe_model0
|
||||
[+] No. of intermolecular contacts: 86
|
||||
[+] No. of charged-charged contacts: 5.0
|
||||
[+] No. of charged-polar contacts: 10.0
|
||||
[+] No. of charged-apolar contacts: 27.0
|
||||
[+] No. of polar-polar contacts: 0.0
|
||||
[+] No. of apolar-polar contacts: 20.0
|
||||
[+] No. of apolar-apolar contacts: 24.0
|
||||
[+] Percentage of apolar NIS residues: 34.10
|
||||
[+] Percentage of charged NIS residues: 18.50
|
||||
[++] Predicted binding affinity (kcal.mol-1): -14.7
|
||||
[++] Predicted dissociation constant (M) at 25.0˚C: 1.6e-11
|
||||
```
|
||||
|
||||
### Output Interpretation
|
||||
|
||||
| Metric | Description |
|
||||
|--------|-------------|
|
||||
| **Intermolecular contacts** | Total number of residue-residue contacts at interface |
|
||||
| **Contact types** | Breakdown by residue character (charged/polar/apolar) |
|
||||
| **NIS residues** | Composition of non-interacting surface |
|
||||
| **Binding affinity (ΔG)** | Predicted free energy of binding (kcal/mol). More negative = stronger binding |
|
||||
| **Dissociation constant (Kd)** | Predicted Kd at specified temperature. Lower = tighter binding |
|
||||
|
||||
### Binding Affinity Scale
|
||||
|
||||
| ΔG (kcal/mol) | Kd (M) | Binding Strength |
|
||||
|---------------|--------|------------------|
|
||||
| -6 to -8 | 10⁻⁵ to 10⁻⁶ | Moderate |
|
||||
| -8 to -10 | 10⁻⁶ to 10⁻⁷ | Strong |
|
||||
| -10 to -12 | 10⁻⁷ to 10⁻⁹ | Very Strong |
|
||||
| < -12 | < 10⁻⁹ | Extremely Strong |
|
||||
|
||||
## Test Data
|
||||
|
||||
Download example protein complexes from the RCSB PDB:
|
||||
|
||||
```bash
|
||||
# Create input directory
|
||||
mkdir -p /mnt/OmicNAS/private/old/olamide/Prodigy/input
|
||||
|
||||
# Download test structures
|
||||
wget -O /mnt/OmicNAS/private/old/olamide/Prodigy/input/3bzd.pdb https://files.rcsb.org/download/3BZD.pdb
|
||||
wget -O /mnt/OmicNAS/private/old/olamide/Prodigy/input/2oob.pdb https://files.rcsb.org/download/2OOB.pdb
|
||||
wget -O /mnt/OmicNAS/private/old/olamide/Prodigy/input/1ppe.pdb https://files.rcsb.org/download/1PPE.pdb
|
||||
```
|
||||
|
||||
### Expected Results
|
||||
|
||||
| Structure | Description | Expected ΔG (kcal/mol) |
|
||||
|-----------|-------------|------------------------|
|
||||
| 3BZD | Protein-protein complex | -9.4 |
|
||||
| 2OOB | Protein-protein complex | -6.2 |
|
||||
| 1PPE | Trypsin-inhibitor complex | -14.7 |
|
||||
|
||||
## Pipeline Structure
|
||||
|
||||
```
|
||||
prodigy_pipeline/
|
||||
├── Dockerfile # Docker image definition
|
||||
├── main.nf # Nextflow pipeline script
|
||||
├── nextflow.config # Pipeline configuration
|
||||
├── params.json # Parameter documentation
|
||||
└── README.md # This file
|
||||
```
|
||||
|
||||
## Docker Image Details
|
||||
|
||||
The Docker image is based on Python 3.12 and includes:
|
||||
|
||||
- **prodigy-prot** (v2.4.0) - Main PRODIGY package
|
||||
- **biopython** (≥1.80) - PDB structure parsing
|
||||
- **freesasa** (≥2.2.1) - Solvent accessible surface area calculation
|
||||
- **numpy** (≥2) - Numerical computations
|
||||
|
||||
### Building the Image
|
||||
|
||||
```bash
|
||||
docker build -t prodigy:latest .
|
||||
```
|
||||
|
||||
### Running Standalone
|
||||
|
||||
```bash
|
||||
# Run PRODIGY directly
|
||||
docker run --rm -v /path/to/data:/data prodigy:latest prodigy /data/complex.pdb
|
||||
|
||||
# Get help
|
||||
docker run --rm prodigy:latest prodigy --help
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Common Issues
|
||||
|
||||
**1. Docker Hub Rate Limit Error**
|
||||
```
|
||||
ERROR: toomanyrequests: You have reached your pull rate limit
|
||||
```
|
||||
Solution: Log in to Docker Hub with `docker login` or wait and retry.
|
||||
|
||||
**2. Structure Contains Gaps Warning**
|
||||
```
|
||||
[!] Structure contains gaps
|
||||
```
|
||||
This is informational, not an error. PRODIGY handles missing residues automatically.
|
||||
|
||||
**3. No Intermolecular Contacts Found**
|
||||
- Verify the structure contains multiple chains
|
||||
- Check chain selection parameters
|
||||
- Ensure chains are in contact (within distance cutoff)
|
||||
|
||||
**4. Permission Denied Errors**
|
||||
```bash
|
||||
# Run with user permissions
|
||||
docker run --rm -u $(id -u):$(id -g) -v /path/to/data:/data prodigy:latest prodigy /data/complex.pdb
|
||||
```
|
||||
|
||||
### Getting Help
|
||||
|
||||
```bash
|
||||
# PRODIGY help
|
||||
docker run --rm prodigy:latest prodigy --help
|
||||
|
||||
# Nextflow pipeline help
|
||||
nextflow run main.nf --help
|
||||
```
|
||||
|
||||
## Citation
|
||||
|
||||
If you use this pipeline, please cite the following publications:
|
||||
|
||||
### PRODIGY Method
|
||||
|
||||
1. **Xue LC, Rodrigues JP, Kastritis PL, Bonvin AM, Vangone A.** (2016)
|
||||
PRODIGY: a web server for predicting the binding affinity of protein-protein complexes.
|
||||
*Bioinformatics*, 32(23):3676-3678.
|
||||
[DOI: 10.1093/bioinformatics/btw514](https://doi.org/10.1093/bioinformatics/btw514)
|
||||
|
||||
2. **Vangone A, Bonvin AM.** (2015)
|
||||
Contacts-based prediction of binding affinity in protein-protein complexes.
|
||||
*eLife*, 4:e07454.
|
||||
[DOI: 10.7554/eLife.07454](https://doi.org/10.7554/eLife.07454)
|
||||
|
||||
3. **Kastritis PL, Rodrigues JP, Folkers GE, Boelens R, Bonvin AM.** (2014)
|
||||
Proteins feel more than they see: Fine-tuning of binding affinity by properties of the non-interacting surface.
|
||||
*Journal of Molecular Biology*, 426(14):2632-2652.
|
||||
[DOI: 10.1016/j.jmb.2014.04.017](https://doi.org/10.1016/j.jmb.2014.04.017)
|
||||
|
||||
### Software Dependencies
|
||||
|
||||
- **Nextflow**: Di Tommaso P, et al. (2017) Nextflow enables reproducible computational workflows. *Nature Biotechnology*, 35:316-319.
|
||||
- **Biopython**: Cock PJ, et al. (2009) Biopython: freely available Python tools for computational molecular biology and bioinformatics. *Bioinformatics*, 25(11):1422-1423.
|
||||
- **FreeSASA**: Mitternacht S. (2016) FreeSASA: An open source C library for solvent accessible surface area calculations. *F1000Research*, 5:189.
|
||||
|
||||
## License
|
||||
|
||||
This pipeline is distributed under the Apache License 2.0, consistent with the PRODIGY software license.
|
||||
|
||||
## Links
|
||||
|
||||
- **PRODIGY Web Server**: [https://wenmr.science.uu.nl/prodigy/](https://wenmr.science.uu.nl/prodigy/)
|
||||
- **PRODIGY GitHub**: [https://github.com/haddocking/prodigy](https://github.com/haddocking/prodigy)
|
||||
- **BonvinLab**: [https://www.bonvinlab.org/](https://www.bonvinlab.org/)
|
||||
- **Nextflow**: [https://www.nextflow.io/](https://www.nextflow.io/)
|
||||
|
||||
## Support
|
||||
|
||||
For questions about:
|
||||
- **PRODIGY method**: Contact the BonvinLab team at [ask.bioexcel.eu](https://ask.bioexcel.eu/)
|
||||
- **This pipeline**: Open an issue in the repository
|
||||
|
||||
---
|
||||
|
||||
*Pipeline version: 2.4.0 | Last updated: January 2026*
|
||||
13
examples/3BZD.ic_model
Normal file
13
examples/3BZD.ic_model
Normal file
@@ -0,0 +1,13 @@
|
||||
[+] Reading structure file: /Users/joao/software/binding_affinity/examples/3BZD.pdb
|
||||
[+] Parsed structure file 3BZD (2 chains, 343 residues)
|
||||
[+] No. of intermolecular contacts: 51
|
||||
[+] No. of charged-charged contacts: 4
|
||||
[+] No. of charged-polar contacts: 7
|
||||
[+] No. of charged-apolar contacts: 6
|
||||
[+] No. of polar-polar contacts: 7
|
||||
[+] No. of apolar-polar contacts: 15
|
||||
[+] No. of apolar-apolar contacts: 12
|
||||
[+] Percentage of apolar NIS residues: 29.48
|
||||
[+] Percentage of charged NIS residues: 29.48
|
||||
[++] Predicted binding affinity (kcal.mol-1): -9.373
|
||||
[++] Predicted dissociation constant (M): 1.333e-07
|
||||
2754
examples/3BZD.pdb
Normal file
2754
examples/3BZD.pdb
Normal file
File diff suppressed because it is too large
Load Diff
4727
examples/3bzd.cif
Normal file
4727
examples/3bzd.cif
Normal file
File diff suppressed because it is too large
Load Diff
74
main.nf
Normal file
74
main.nf
Normal file
@@ -0,0 +1,74 @@
|
||||
#!/usr/bin/env nextflow

nextflow.enable.dsl=2

// -----------------------------------------------------------------------------
// Default parameters (override on the command line with --<name>)
// -----------------------------------------------------------------------------
params.pdb             = 's3://omic/eureka/prodigy/input/*.pdb'  // input structure(s); glob allowed
params.outdir          = 's3://omic/eureka/prodigy/output'       // where results are published
params.distance_cutoff = 5.5    // Angstrom threshold for intermolecular contacts
params.acc_threshold   = 0.05   // relative accessibility threshold for NIS analysis
params.temperature     = 25.0   // Celsius, used for Kd prediction
params.selection       = ''     // chain selection, e.g. 'A B' or 'H,L A' (empty = all chains)
params.contact_list    = false  // also emit a contact list file
params.pymol_selection = false  // also emit a PyMOL visualisation script
params.quiet           = false  // print only affinity values
|
||||
|
||||
// =============================================================================
|
||||
// Process: PRODIGY
|
||||
// Predicts binding affinity using intermolecular contacts
|
||||
// =============================================================================
|
||||
|
||||
// =============================================================================
// Process: PRODIGY
// Predicts binding affinity of a protein-protein complex from intermolecular
// contacts. Publishes the main report plus optional contact list / PyMOL files.
// =============================================================================

process PRODIGY {
    container 'harbor.cluster.omic.ai/omic/prodigy:latest'
    publishDir params.outdir, mode: 'copy'
    stageInMode 'copy'

    input:
    path pdb

    output:
    path "${pdb.baseName}_prodigy.txt", emit: results
    path "${pdb.baseName}_contacts.txt", optional: true, emit: contacts
    path "${pdb.baseName}_interface.pml", optional: true, emit: pymol

    script:
    """
    prodigy \\
        ${pdb} \\
        --distance-cutoff ${params.distance_cutoff} \\
        --acc-threshold ${params.acc_threshold} \\
        --temperature ${params.temperature} \\
        ${params.selection ? '--selection ' + params.selection : ''} \\
        ${params.contact_list ? '--contact_list' : ''} \\
        ${params.pymol_selection ? '--pymol_selection' : ''} \\
        ${params.quiet ? '--quiet' : ''} \\
        2>&1 | tee ${pdb.baseName}_prodigy.txt

    # BUGFIX: the CLI writes the contact list to <stem>.ic (cli.py uses
    # with_suffix(".ic")), not <stem>.contacts — rename the actual file so the
    # declared optional output is produced.
    if [ -f "${pdb.baseName}.ic" ]; then
        mv ${pdb.baseName}.ic ${pdb.baseName}_contacts.txt
    fi

    # Rename the PyMOL script (<stem>.pml) if generated
    if [ -f "${pdb.baseName}.pml" ]; then
        mv ${pdb.baseName}.pml ${pdb.baseName}_interface.pml
    fi
    """
}
|
||||
|
||||
// =============================================================================
|
||||
// Workflow
|
||||
// =============================================================================
|
||||
|
||||
// =============================================================================
// Entry workflow: validate input, build one channel element per structure,
// and run PRODIGY on each.
// =============================================================================

workflow {
    // Fail fast when no input pattern was supplied.
    if (!params.pdb) {
        error "ERROR: Please provide input PDB file(s) using --pdb parameter"
    }

    // One channel element per matched structure file (errors if none match).
    Channel
        .fromPath(params.pdb, checkIfExists: true)
        .set { pdb_ch }

    PRODIGY(pdb_ch)
}
|
||||
71
nextflow.config
Normal file
71
nextflow.config
Normal file
@@ -0,0 +1,71 @@
|
||||
// =============================================================================
// PRODIGY Nextflow Pipeline Configuration
// Protein binding affinity prediction from structural data
// =============================================================================

// Pipeline metadata.
manifest {
    name        = 'PRODIGY-Nextflow'
    author      = 'Olamide'
    homePage    = 'https://trs-gitea.cluster.omic.ai/omic/prodigy'
    description = 'Nextflow pipeline for PRODIGY - Protein binding affinity prediction based on intermolecular contacts'
    mainScript  = 'main.nf'
    version     = '2.4.0'
}

// Global default parameters (same defaults as main.nf; CLI flags win).
params {
    pdb             = 's3://omic/eureka/prodigy/input/*.pdb'
    outdir          = 's3://omic/eureka/prodigy/output'
    distance_cutoff = 5.5
    acc_threshold   = 0.05
    temperature     = 25.0
    selection       = ''
    contact_list    = false
    pymol_selection = false
    quiet           = false
}

// Container engine: run tasks as the invoking user so published files are
// not root-owned.
docker {
    enabled    = true
    runOptions = '-u $(id -u):$(id -g)'
}

// Per-task resource defaults and image.
process {
    cpus      = 1
    memory    = '4 GB'
    container = 'harbor.cluster.omic.ai/omic/prodigy:latest'
}

// Local executor capacity.
executor {
    $local {
        cpus   = 4
        memory = '8 GB'
    }
}

// Execution-environment profiles.
profiles {
    standard {
        docker.enabled = true
    }

    k8s {
        docker.enabled    = true
        process.container = 'harbor.cluster.omic.ai/omic/prodigy:latest'
    }

    k8s_gpu {
        docker.enabled    = true
        process.container = 'harbor.cluster.omic.ai/omic/prodigy:latest'
    }

    singularity {
        singularity.enabled    = true
        singularity.autoMounts = true
        docker.enabled         = false
    }
}
|
||||
157
params.json
Normal file
157
params.json
Normal file
@@ -0,0 +1,157 @@
|
||||
{
|
||||
"params": {
|
||||
"pdb": {
|
||||
"type": "file",
|
||||
"description": "Path to input PDB or mmCIF structure file(s) for binding affinity prediction",
|
||||
"default": "s3://omic/eureka/prodigy/input/*.pdb",
|
||||
"required": true,
|
||||
"pipeline_io": "input",
|
||||
"var_name": "params.pdb",
|
||||
"examples": [
|
||||
"s3://omic/eureka/prodigy/input/3bzd.pdb",
|
||||
"s3://omic/eureka/prodigy/input/*.pdb"
|
||||
],
|
||||
"pattern": ".*\\.(pdb|cif)$",
|
||||
"enum": [],
|
||||
"validation": {},
|
||||
"notes": "Input protein-protein complex structure in PDB or mmCIF format. Can be a single file or glob pattern for batch processing."
|
||||
},
|
||||
"outdir": {
|
||||
"type": "folder",
|
||||
"description": "Directory for PRODIGY prediction results",
|
||||
"default": "s3://omic/eureka/prodigy/output",
|
||||
"required": true,
|
||||
"pipeline_io": "output",
|
||||
"var_name": "params.outdir",
|
||||
"examples": [
|
||||
"s3://omic/eureka/prodigy/output",
|
||||
"s3://omic/eureka/prodigy/custom_output"
|
||||
],
|
||||
"pattern": ".*",
|
||||
"enum": [],
|
||||
"validation": {},
|
||||
"notes": "Directory where prediction results will be stored. Created if it does not exist."
|
||||
},
|
||||
"distance_cutoff": {
|
||||
"type": "float",
|
||||
"description": "Distance cutoff (Angstrom) for calculating intermolecular contacts",
|
||||
"default": 5.5,
|
||||
"required": false,
|
||||
"pipeline_io": "parameter",
|
||||
"var_name": "params.distance_cutoff",
|
||||
"examples": [
|
||||
5.5,
|
||||
4.0,
|
||||
6.0
|
||||
],
|
||||
"pattern": null,
|
||||
"enum": [],
|
||||
"validation": {
|
||||
"min": 1.0,
|
||||
"max": 20.0
|
||||
},
|
||||
"notes": "Default value of 5.5 Angstrom was optimized in Vangone & Bonvin (2015) eLife. This threshold includes different non-bonded interactions including salt bridges."
|
||||
},
|
||||
"acc_threshold": {
|
||||
"type": "float",
|
||||
"description": "Accessibility threshold for buried surface area (BSA) analysis",
|
||||
"default": 0.05,
|
||||
"required": false,
|
||||
"pipeline_io": "parameter",
|
||||
"var_name": "params.acc_threshold",
|
||||
"examples": [
|
||||
0.05,
|
||||
0.1
|
||||
],
|
||||
"pattern": null,
|
||||
"enum": [],
|
||||
"validation": {
|
||||
"min": 0.0,
|
||||
"max": 1.0
|
||||
},
|
||||
"notes": "Relative accessibility threshold used to identify surface residues for non-interacting surface (NIS) calculations."
|
||||
},
|
||||
"temperature": {
|
||||
"type": "float",
|
||||
"description": "Temperature (Celsius) for dissociation constant (Kd) prediction",
|
||||
"default": 25.0,
|
||||
"required": false,
|
||||
"pipeline_io": "parameter",
|
||||
"var_name": "params.temperature",
|
||||
"examples": [
|
||||
25.0,
|
||||
37.0,
|
||||
4.0
|
||||
],
|
||||
"pattern": null,
|
||||
"enum": [],
|
||||
"validation": {
|
||||
"min": -273.15,
|
||||
"max": 100.0
|
||||
},
|
||||
"notes": "Temperature used to convert predicted binding free energy (deltaG) to dissociation constant (Kd)."
|
||||
},
|
||||
"selection": {
|
||||
"type": "string",
|
||||
"description": "Chain selection for interface calculation",
|
||||
"default": "",
|
||||
"required": false,
|
||||
"pipeline_io": "parameter",
|
||||
"var_name": "params.selection",
|
||||
"examples": [
|
||||
"A B",
|
||||
"A,B C",
|
||||
"H,L A"
|
||||
],
|
||||
"pattern": null,
|
||||
"enum": [],
|
||||
"validation": {},
|
||||
"notes": "Specify chains to consider for binding affinity calculation. Format: 'A B' calculates contacts between chains A and B. 'A,B C' treats chains A and B as one molecule interacting with chain C. Useful for antibody-antigen complexes where heavy and light chains should be grouped."
|
||||
},
|
||||
"contact_list": {
|
||||
"type": "boolean",
|
||||
"description": "Output list of intermolecular contacts",
|
||||
"default": false,
|
||||
"required": false,
|
||||
"pipeline_io": "parameter",
|
||||
"var_name": "params.contact_list",
|
||||
"examples": [
|
||||
true,
|
||||
false
|
||||
],
|
||||
"enum": [true, false],
|
||||
"validation": {},
|
||||
"notes": "When enabled, outputs a detailed list of all residue-residue contacts at the interface."
|
||||
},
|
||||
"pymol_selection": {
|
||||
"type": "boolean",
|
||||
"description": "Output PyMOL script to visualize interface",
|
||||
"default": false,
|
||||
"required": false,
|
||||
"pipeline_io": "parameter",
|
||||
"var_name": "params.pymol_selection",
|
||||
"examples": [
|
||||
true,
|
||||
false
|
||||
],
|
||||
"enum": [true, false],
|
||||
"validation": {},
|
||||
"notes": "When enabled, generates a PyMOL script (.pml) to highlight interface residues for visualization."
|
||||
},
|
||||
"quiet": {
|
||||
"type": "boolean",
|
||||
"description": "Output only predicted affinity values",
|
||||
"default": false,
|
||||
"required": false,
|
||||
"pipeline_io": "parameter",
|
||||
"var_name": "params.quiet",
|
||||
"examples": [
|
||||
true,
|
||||
false
|
||||
],
|
||||
"enum": [true, false],
|
||||
"validation": {},
|
||||
"notes": "When enabled, outputs only the predicted binding affinity value without detailed analysis. Useful for batch processing and downstream parsing."
|
||||
}
|
||||
}
|
||||
}
|
||||
45
pyproject.toml
Normal file
45
pyproject.toml
Normal file
@@ -0,0 +1,45 @@
|
||||
[project]
name = "prodigy-prot"
license = "Apache-2.0"
version = "2.4.0"
description = "PROtein binDIng enerGY prediction"
authors = [
    { name = "Anna Vangone" },
    { name = "Joao Rodrigues" },
    { name = "Joerg Schaarschmidt" },
]
maintainers = [{ name = "BonvinLab", email = "bonvinlab.support@uu.nl" }]
readme = "README.md"
classifiers = [
    "Development Status :: 5 - Production/Stable",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
    "Topic :: Scientific/Engineering :: Chemistry",
    "Topic :: Scientific/Engineering :: Bio-Informatics",
]

dependencies = ["biopython>=1.80", "freesasa>=2.2.1", "numpy>=2"]

[project.optional-dependencies]
dev = ["pytest", "coverage", "hypothesis", "pytest-cov", "mypy"]

[project.scripts]
prodigy = "prodigy_prot.cli:main"

# BUGFIX: the build backend below is hatchling, so a [tool.setuptools] table
# (previously here, with packages = ["src"]) is silently ignored. Point
# hatchling at the src-layout package instead; package data under
# src/prodigy_prot/data is included with the package directory.
[tool.hatch.build.targets.wheel]
packages = ["src/prodigy_prot"]

[tool.pytest.ini_options]
pythonpath = ["src"]
markers = ["integration: marks tests as integration tests"]

[tool.mypy]
disable_error_code = ["import-not-found"]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
|
||||
3
src/prodigy_prot/__init__.py
Normal file
3
src/prodigy_prot/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
from pathlib import Path

# Path to the NACCESS atom-type configuration bundled with the package
# (shipped under prodigy_prot/data/).
NACCESS_CONFIG = Path(__file__).parent / "data" / "naccess.config"
|
||||
199
src/prodigy_prot/cli.py
Normal file
199
src/prodigy_prot/cli.py
Normal file
@@ -0,0 +1,199 @@
|
||||
"""
|
||||
Binding affinity predictor based on Intermolecular Contacts (ICs).
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import sys
|
||||
from argparse import RawTextHelpFormatter
|
||||
from concurrent.futures import ProcessPoolExecutor, as_completed
|
||||
from io import StringIO
|
||||
from pathlib import Path
|
||||
|
||||
from Bio.PDB.Model import Model
|
||||
|
||||
from prodigy_prot.modules.parsers import parse_structure
|
||||
from prodigy_prot.modules.prodigy import Prodigy
|
||||
|
||||
# Logging goes to stdout with bare messages so status lines and prediction
# output share one stream.
logging.basicConfig(level=logging.INFO, stream=sys.stdout, format="%(message)s")
log = logging.getLogger("Prodigy")


# Command-line interface. RawTextHelpFormatter preserves the newlines embedded
# in the help strings below.
ap = argparse.ArgumentParser(description=__doc__, formatter_class=RawTextHelpFormatter)
ap.add_argument(
    "input_path",
    help="Path to either: \n- Structure in PDB or mmCIF format\n- Directory containing structure files",
)
ap.add_argument("--distance-cutoff", type=float, default=5.5,
                help="Distance cutoff to calculate ICs")
ap.add_argument("--acc-threshold", type=float, default=0.05,
                help="Accessibility threshold for BSA analysis")
ap.add_argument("--temperature", type=float, default=25.0,
                help="Temperature (C) for Kd prediction")
ap.add_argument("--contact_list", action="store_true",
                help="Output a list of contacts")
ap.add_argument("--pymol_selection", action="store_true",
                help="Output a script to highlight the interface (pymol)")
ap.add_argument("-q", "--quiet", action="store_true",
                help="Outputs only the predicted affinity value")
ap.add_argument("-s", "--showall", action="store_true",
                help="Outputs all original prodigy features but BSA (mutually exclusive with `-q`)")
ap.add_argument("-np", "--number-of-processors", type=int, default=1,
                help="Number of processors to use (default: 1)")

_sel_help = """
By default, all intermolecular contacts are taken into consideration,
a molecule being defined as an isolated group of amino acids sharing
a common chain identifier. In specific cases, for example
antibody-antigen complexes, some chains should be considered as a
single molecule.

Use the --selection option to provide collections of chains that should
be considered for the calculation. Separate by a space the chains that
are to be considered _different_ molecules. Use commas to include multiple
chains as part of a single group:

--selection A B => Contacts calculated (only) between chains A and B.
--selection A,B C => Contacts calculated (only) between \
chains A and C; and B and C.
--selection A B C => Contacts calculated (only) between \
chains A and B; B and C; and A and C.
"""
sel_opt = ap.add_argument_group("Selection Options", description=_sel_help)
sel_opt.add_argument("--selection", nargs="+", metavar=("A B", "A,B C"))
|
||||
|
||||
|
||||
def main():
    """CLI entry point.

    Parses arguments, collects structure files (a single file or every
    .pdb/.cif/.ent in a directory), runs one PRODIGY prediction per model in
    a process pool, and prints the captured outputs in a stable sorted order.
    Exits with status 1 on invalid arguments, missing input, or no structures.
    """
    args = ap.parse_args()

    # --quiet and --showall request contradictory verbosity levels.
    if args.quiet and args.showall:
        log.error("Error: --quiet (-q) and --showall (-s) are mutually exclusive arguments")
        sys.exit(1)
    # Quiet mode suppresses everything below ERROR so only values are printed.
    # (The original called setLevel twice with the same arguments; once is enough.)
    log.setLevel(logging.ERROR if args.quiet else logging.INFO)

    input_path = Path(args.input_path)

    # Resolve the input path to a list of structure files.
    input_list = []
    if input_path.is_file():
        input_list.append(input_path)
    elif input_path.is_dir():
        for input_f in input_path.glob("*"):
            if input_f.suffix in (".pdb", ".cif", ".ent"):
                input_list.append(input_f)
    elif not input_path.exists():
        log.error(f"File {input_path} does not exist")
        sys.exit(1)
    else:
        log.error(f"Input path {input_path} is neither a valid file nor a directory")
        sys.exit(1)

    # One task per model in each structure (multi-model files yield several).
    tasks = []
    for input_f in input_list:
        models, _, _ = parse_structure(str(input_f))
        struct_path = Path(input_f)
        for model in models:
            identifier = f"{struct_path.stem}_model{model.id}"
            tasks.append((model, identifier, args, struct_path))

    total_tasks = len(tasks)
    if total_tasks == 0:
        log.error("No valid structures found")
        sys.exit(1)

    # Never spawn more workers than there are tasks.
    max_workers = min(args.number_of_processors, total_tasks)
    log.info(f"[+] Executing {total_tasks} task(s) in total")
    if max_workers != args.number_of_processors:
        log.info("[+] Adjusting number of processors based on number of tasks")
        log.info(
            f"[+] Using {max_workers} processor(s) instead of {args.number_of_processors}"
        )

    # Run predictions in parallel; each worker returns (identifier, model_id,
    # captured stdout). Failures are logged, not fatal to the other tasks.
    results = []
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_model, *task) for task in tasks]
        for future in as_completed(futures):
            try:
                results.append(future.result())
            except Exception as e:
                log.error(f"Error processing model: {e}")

    # Deterministic output order regardless of completion order.
    results.sort(key=lambda x: (x[0], x[1]))
    for identifier, _, output in results:
        print(output, end="")
|
||||
|
||||
|
||||
def process_model(model: Model, identifier: str, args: argparse.Namespace, struct_path):
    """
    Run a PRODIGY prediction for a single model and capture its printed output.

    Returns a ``(identifier, model_id, captured_stdout)`` tuple so the parallel
    driver can sort results deterministically before printing them.
    """
    captured = StringIO()
    saved_stdout = sys.stdout
    # Redirect stdout so everything printed during prediction lands in the
    # buffer; safe because each task runs in its own worker process.
    sys.stdout = captured
    try:
        if not args.quiet:
            print("#" * 42)
            print(f"[+] Processing structure {identifier}")
        prodigy = Prodigy(
            model=model,
            name=identifier,
            selection=args.selection,
            temp=args.temperature,
        )
        prodigy.predict(
            distance_cutoff=args.distance_cutoff, acc_threshold=args.acc_threshold
        )
        prodigy.print_prediction(quiet=args.quiet, showall=args.showall)
    finally:
        # Always restore stdout, even if the prediction raised.
        sys.stdout = saved_stdout

    # NOTE(review): for multi-model inputs every model writes the same
    # <stem>.ic / <stem>.pml path, so later models overwrite earlier ones —
    # confirm whether per-model filenames are intended.
    if args.contact_list:
        prodigy.print_contacts(outfile=str(struct_path.with_suffix(".ic")))

    if args.pymol_selection:
        prodigy.print_pymol_script(outfile=str(struct_path.with_suffix(".pml")))

    return identifier, model.id, captured.getvalue()
|
||||
|
||||
|
||||
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||
256
src/prodigy_prot/data/naccess.config
Normal file
256
src/prodigy_prot/data/naccess.config
Normal file
@@ -0,0 +1,256 @@
|
||||
# Contributed by João Rodrigues
|
||||
|
||||
name: NACCESS
|
||||
|
||||
types:
|
||||
C_ALI 1.87 apolar
|
||||
C_CAR 1.76 apolar
|
||||
C_NUC 1.80 apolar
|
||||
N_AMN 1.50 polar
|
||||
N_AMD 1.65 polar
|
||||
N_NUC 1.60 polar
|
||||
O 1.40 polar
|
||||
S 1.85 apolar
|
||||
SE 1.80 apolar
|
||||
P 1.90 apolar
|
||||
|
||||
atoms:
|
||||
ANY C C_CAR
|
||||
ANY O O
|
||||
ANY CA C_ALI
|
||||
ANY N N_AMD
|
||||
ANY CB C_ALI
|
||||
ANY OXT O
|
||||
# nucleic acid
|
||||
ANY P P
|
||||
ANY OP1 O
|
||||
ANY OP2 O
|
||||
ANY OP3 O
|
||||
ANY O5' O
|
||||
ANY O4' O
|
||||
ANY O3' O
|
||||
ANY O2' O
|
||||
ANY C5' C_NUC
|
||||
ANY C4' C_NUC
|
||||
ANY C3' C_NUC
|
||||
ANY C2' C_NUC
|
||||
ANY C1' C_NUC
|
||||
|
||||
ALA CB C_ALI # included so that RSA values will be generated
|
||||
|
||||
ARG CG C_ALI
|
||||
ARG CD C_ALI
|
||||
ARG NE N_AMD
|
||||
ARG CZ C_CAR
|
||||
ARG NH1 N_AMD
|
||||
ARG NH2 N_AMD
|
||||
|
||||
ASN CG C_CAR
|
||||
ASN OD1 O
|
||||
ASN ND2 N_AMD
|
||||
|
||||
ASP CG C_CAR
|
||||
ASP OD1 O
|
||||
ASP OD2 O
|
||||
|
||||
CYS SG S
|
||||
|
||||
GLN CG C_ALI
|
||||
GLN CD C_CAR
|
||||
GLN OE1 O
|
||||
GLN NE2 N_AMD
|
||||
|
||||
GLU CG C_ALI
|
||||
GLU CD C_CAR
|
||||
GLU OE1 O
|
||||
GLU OE2 O
|
||||
|
||||
GLY CA C_ALI # included so that RSA values will be generated
|
||||
|
||||
HIS CG C_CAR
|
||||
HIS ND1 N_AMD
|
||||
HIS CD2 C_CAR
|
||||
HIS NE2 N_AMD
|
||||
HIS CE1 C_CAR
|
||||
|
||||
ILE CG1 C_ALI
|
||||
ILE CG2 C_ALI
|
||||
ILE CD1 C_ALI
|
||||
|
||||
LEU CG C_ALI
|
||||
LEU CD1 C_ALI
|
||||
LEU CD2 C_ALI
|
||||
|
||||
LYS CG C_ALI
|
||||
LYS CD C_ALI
|
||||
LYS CE C_ALI
|
||||
LYS NZ N_AMN
|
||||
|
||||
MET CG C_ALI
|
||||
MET SD S
|
||||
MET CE C_ALI
|
||||
|
||||
PHE CG C_CAR
|
||||
PHE CD1 C_CAR
|
||||
PHE CD2 C_CAR
|
||||
PHE CE1 C_CAR
|
||||
PHE CE2 C_CAR
|
||||
PHE CZ C_CAR
|
||||
|
||||
PRO CG C_ALI
|
||||
PRO CD C_ALI
|
||||
|
||||
SEC SE SE
|
||||
|
||||
SER OG O
|
||||
|
||||
THR OG1 O
|
||||
THR CG2 C_ALI
|
||||
|
||||
TRP CG C_CAR
|
||||
TRP CD1 C_CAR
|
||||
TRP CD2 C_CAR
|
||||
TRP NE1 N_AMD
|
||||
TRP CE2 C_CAR
|
||||
TRP CE3 C_CAR
|
||||
TRP CZ2 C_CAR
|
||||
TRP CZ3 C_CAR
|
||||
TRP CH2 C_CAR
|
||||
|
||||
TYR CG C_CAR
|
||||
TYR CD1 C_CAR
|
||||
TYR CD2 C_CAR
|
||||
TYR CE1 C_CAR
|
||||
TYR CE2 C_CAR
|
||||
TYR CZ C_CAR
|
||||
TYR OH O
|
||||
|
||||
VAL CG1 C_ALI
|
||||
VAL CG2 C_ALI
|
||||
|
||||
A N9 N_NUC
|
||||
A C8 C_NUC
|
||||
A N7 N_NUC
|
||||
A C5 C_NUC
|
||||
A C6 C_NUC
|
||||
A N6 N_NUC
|
||||
A N1 N_NUC
|
||||
A C2 C_NUC
|
||||
A N3 N_NUC
|
||||
A C4 C_NUC
|
||||
|
||||
C N1 N_NUC
|
||||
C C2 C_NUC
|
||||
C O2 O
|
||||
C N3 N_NUC
|
||||
C C4 C_NUC
|
||||
C N4 N_NUC
|
||||
C C5 C_NUC
|
||||
C C6 C_NUC
|
||||
|
||||
G N9 N_NUC
|
||||
G C8 C_NUC
|
||||
G N7 N_NUC
|
||||
G C5 C_NUC
|
||||
G C6 C_NUC
|
||||
G O6 O
|
||||
G N1 N_NUC
|
||||
G C2 C_NUC
|
||||
G N2 N_NUC
|
||||
G N3 N_NUC
|
||||
G C4 C_NUC
|
||||
|
||||
I N9 N_NUC
|
||||
I C8 C_NUC
|
||||
I N7 N_NUC
|
||||
I C5 C_NUC
|
||||
I C6 C_NUC
|
||||
I O6 O
|
||||
I N1 N_NUC
|
||||
I C2 C_NUC
|
||||
I N3 N_NUC
|
||||
I C4 C_NUC
|
||||
|
||||
T N1 N_NUC
|
||||
T C2 C_NUC
|
||||
T O2 O
|
||||
T N3 N_NUC
|
||||
T C4 C_NUC
|
||||
T O4 O
|
||||
T C5 C_NUC
|
||||
T C7 C_NUC
|
||||
T C6 C_NUC
|
||||
|
||||
U N1 N_NUC
|
||||
U C2 C_NUC
|
||||
U O2 O
|
||||
U N3 N_NUC
|
||||
U C4 C_NUC
|
||||
U O4 O
|
||||
U C5 C_NUC
|
||||
U C6 C_NUC
|
||||
|
||||
DA N9 N_NUC
|
||||
DA C8 C_NUC
|
||||
DA N7 N_NUC
|
||||
DA C5 C_NUC
|
||||
DA C6 C_NUC
|
||||
DA N6 N_NUC
|
||||
DA N1 N_NUC
|
||||
DA C2 C_NUC
|
||||
DA N3 N_NUC
|
||||
DA C4 C_NUC
|
||||
|
||||
DC N1 N_NUC
|
||||
DC C2 C_NUC
|
||||
DC O2 O
|
||||
DC N3 N_NUC
|
||||
DC C4 C_NUC
|
||||
DC N4 N_NUC
|
||||
DC C5 C_NUC
|
||||
DC C6 C_NUC
|
||||
|
||||
DG N9 N_NUC
|
||||
DG C8 C_NUC
|
||||
DG N7 N_NUC
|
||||
DG C5 C_NUC
|
||||
DG C6 C_NUC
|
||||
DG O6 O
|
||||
DG N1 N_NUC
|
||||
DG C2 C_NUC
|
||||
DG N2 N_NUC
|
||||
DG N3 N_NUC
|
||||
DG C4 C_NUC
|
||||
|
||||
DI N9 N_NUC
|
||||
DI C8 C_NUC
|
||||
DI N7 N_NUC
|
||||
DI C5 C_NUC
|
||||
DI C6 C_NUC
|
||||
DI O6 O
|
||||
DI N1 N_NUC
|
||||
DI C2 C_NUC
|
||||
DI N3 N_NUC
|
||||
DI C4 C_NUC
|
||||
|
||||
DT N1 N_NUC
|
||||
DT C2 C_NUC
|
||||
DT O2 O
|
||||
DT N3 N_NUC
|
||||
DT C4 C_NUC
|
||||
DT O4 O
|
||||
DT C5 C_NUC
|
||||
DT C7 C_NUC
|
||||
DT C6 C_NUC
|
||||
|
||||
DU N1 N_NUC
|
||||
DU C2 C_NUC
|
||||
DU O2 O
|
||||
DU N3 N_NUC
|
||||
DU C4 C_NUC
|
||||
DU O4 O
|
||||
DU C5 C_NUC
|
||||
DU C6 C_NUC
|
||||
|
||||
|
||||
|
||||
0
src/prodigy_prot/modules/__init__.py
Normal file
0
src/prodigy_prot/modules/__init__.py
Normal file
148
src/prodigy_prot/modules/aa_properties.py
Normal file
148
src/prodigy_prot/modules/aa_properties.py
Normal file
@@ -0,0 +1,148 @@
|
||||
"""
|
||||
Generic properties of amino acids required for the binding affinity
|
||||
prediction methods.
|
||||
"""
|
||||
|
||||
# Residue chemical character used when classifying intermolecular contacts:
# A = apolar, C = charged, P = polar (see analyse_contacts in modules/prodigy.py).
aa_character_ic: dict[str, str] = {
    "ALA": "A",
    "CYS": "A",  # NOTE(review): apolar here but polar in aa_character_protorp below — confirm intentional
    "GLU": "C",
    "ASP": "C",
    "GLY": "A",
    "PHE": "A",
    "ILE": "A",
    "HIS": "C",
    "LYS": "C",
    "MET": "A",
    "LEU": "A",
    "ASN": "P",
    "GLN": "P",
    "PRO": "A",
    "SER": "P",
    "ARG": "C",
    "THR": "P",
    "TRP": "A",
    "VAL": "A",
    "TYR": "A",
}
|
||||
|
||||
# Residue chemical character following the PROTORP convention
# (A = apolar, C = charged, P = polar); used by analyse_nis to classify
# non-interacting surface residues. Differs from aa_character_ic for
# CYS, HIS, TRP and TYR.
aa_character_protorp: dict[str, str] = {
    "ALA": "A",
    "CYS": "P",
    "GLU": "C",
    "ASP": "C",
    "GLY": "A",
    "PHE": "A",
    "ILE": "A",
    "HIS": "P",
    "LYS": "C",
    "MET": "A",
    "LEU": "A",
    "ASN": "P",
    "GLN": "P",
    "PRO": "A",
    "SER": "P",
    "ARG": "C",
    "THR": "P",
    "TRP": "P",
    "VAL": "A",
    "TYR": "P",
}
|
||||
|
||||
# Taken from pre-original prodigy code
# B for hydrophoBic
# L for hydrophiLic (earlier comment said "Y", but the values below use "L")
aa_character_hydro: dict[str, str] = {
    "ALA": "B",  #+
    "CYS": "B",  #+
    "GLU": "L",  #+
    "ASP": "L",  #+
    "GLY": "L",  # Glycine was B in my initial classification
    "PHE": "B",  #+
    "ILE": "B",  #+
    "HIS": "L",  #+
    "LYS": "L",  #+
    "MET": "B",  #+
    "LEU": "B",  #+
    "ASN": "L",  #+
    "GLN": "L",  #+
    "PRO": "L",  # Proline was B my initial classification
    "SER": "L",  #+
    "ARG": "L",  #+
    "THR": "L",  #+
    "TRP": "L",  #+
    "VAL": "B",  #+
    "TYR": "L",  #+
}
|
||||
|
||||
# Scaling factors for relative ASA
# Calculated using extended ALA-X-ALA peptides
# Taken from NACCESS
# Reference accessible surface areas (presumably in Å^2) per residue type;
# dividing an observed ASA by these yields a relative ASA (rASA).
# Keys: "total" = whole residue; "bb"/"sc" presumably backbone/side chain —
# TODO confirm; only "total" is used by modules/freesasa_tools.py.
rel_asa: dict[str, dict[str, float]] = {
    "total": {
        "ALA": 107.95,
        "CYS": 134.28,
        "ASP": 140.39,
        "GLU": 172.25,
        "PHE": 199.48,
        "GLY": 80.10,
        "HIS": 182.88,
        "ILE": 175.12,
        "LYS": 200.81,
        "LEU": 178.63,
        "MET": 194.15,
        "ASN": 143.94,
        "PRO": 136.13,
        "GLN": 178.50,
        "ARG": 238.76,
        "SER": 116.50,
        "THR": 139.27,
        "VAL": 151.44,
        "TRP": 249.36,
        "TYR": 212.76,
    },
    "bb": {
        "ALA": 38.54,
        "CYS": 37.53,
        "ASP": 37.70,
        "GLU": 37.51,
        "PHE": 35.37,
        "GLY": 47.77,
        "HIS": 35.80,
        "ILE": 37.16,
        "LYS": 37.51,
        "LEU": 37.51,
        "MET": 37.51,
        "ASN": 37.70,
        "PRO": 16.23,
        "GLN": 37.51,
        "ARG": 37.51,
        "SER": 38.40,
        "THR": 37.57,
        "VAL": 37.16,
        "TRP": 38.10,
        "TYR": 35.38,
    },
    "sc": {
        "ALA": 69.41,
        "CYS": 96.75,
        "ASP": 102.69,
        "GLU": 134.74,
        "PHE": 164.11,
        "GLY": 32.33,
        "HIS": 147.08,
        "ILE": 137.96,
        "LYS": 163.30,
        "LEU": 141.12,
        "MET": 156.64,
        "ASN": 106.24,
        "PRO": 119.90,
        "GLN": 140.99,
        "ARG": 201.25,
        "SER": 78.11,
        "THR": 101.70,
        "VAL": 114.28,
        "TRP": 211.26,
        "TYR": 177.38,
    },
}
|
||||
71
src/prodigy_prot/modules/freesasa_tools.py
Normal file
71
src/prodigy_prot/modules/freesasa_tools.py
Normal file
@@ -0,0 +1,71 @@
|
||||
"""
|
||||
Functions to execute freesasa and parse its output.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import freesasa
|
||||
from Bio.PDB.Model import Model
|
||||
from Bio.PDB.Structure import Structure
|
||||
from freesasa import Classifier, calc, structureFromBioPDB
|
||||
|
||||
from prodigy_prot import NACCESS_CONFIG
|
||||
from prodigy_prot.modules.aa_properties import rel_asa
|
||||
|
||||
freesasa.setVerbosity(freesasa.nowarnings)
|
||||
|
||||
|
||||
def execute_freesasa_api(model: Model) -> tuple[dict, dict]:
    """
    Calls freesasa using its Python API and returns
    per-residue accessibilities.

    Returns
    -------
    asa_data : dict mapping (chain, resname, resid, atom_name) -> absolute ASA.
    rsa_data : dict mapping (chain, resname, resid) -> relative ASA, i.e. the
        summed atomic ASA divided by the residue's reference area from
        ``rel_asa["total"]``.

    Raises
    ------
    Exception on freesasa classification failures (non-canonical atom names
    or non-default residues surface as AssertionError inside freesasa).
    """

    asa_data = {}
    # NOTE(review): annotation says the second key element is int, but the
    # keys built below are (chain, resname, resid) and freesasa's
    # residueNumber() presumably returns a string — confirm and fix.
    rsa_data: dict[tuple[str, int, str], float] = {}
    # Per-residue reference areas used to normalise absolute ASA.
    _rsa: dict = rel_asa["total"]

    # Classifier configured with NACCESS-style radii shipped with the package.
    classifier = Classifier(str(NACCESS_CONFIG))

    # NOTE: `structureFromBioPDB` requires a Structure object
    # so here build one from a model
    s = Structure(model.id)
    s.add(model)

    try:
        struct = structureFromBioPDB(
            s,
            classifier,
        )
        result = calc(struct)
    except AssertionError as e:
        # freesasa signals unknown atoms/residues via AssertionError;
        # re-raise with a user-facing explanation.
        error_message = "" + os.linesep
        error_message += "[!] Error when running freesasa:" + os.linesep
        error_message += f"[!] {e}" + os.linesep
        error_message += (
            "[!] Make sure the atom names in your PDB file match"
            " the canonical naming and belong "
            "to default residues" + os.linesep
        )
        print(error_message)
        raise Exception(error_message)

    # iterate over all atoms to get SASA and residue name
    for idx in range(struct.nAtoms()):
        atname = struct.atomName(idx)
        resname = struct.residueName(idx)
        resid = struct.residueNumber(idx)
        chain = struct.chainLabel(idx)
        at_uid = (chain, resname, resid, atname)
        res_uid = (chain, resname, resid)

        asa = result.atomArea(idx)
        asa_data[at_uid] = asa
        # add asa to residue (accumulate atomic areas per residue)
        rsa_data[res_uid] = rsa_data.get(res_uid, 0) + asa

    # convert total asa to relative asa
    # (dict.update accepts an iterable of (key, value) pairs; raises KeyError
    # if a residue name has no entry in rel_asa["total"])
    rsa_data.update(
        (res_uid, asa / _rsa[res_uid[1]]) for res_uid, asa in rsa_data.items()
    )
    return asa_data, rsa_data
|
||||
41
src/prodigy_prot/modules/models.py
Normal file
41
src/prodigy_prot/modules/models.py
Normal file
@@ -0,0 +1,41 @@
|
||||
"""
|
||||
Models to predict binding affinity based on molecular properties.
|
||||
"""
|
||||
|
||||
|
||||
def IC_NIS(
    ic_cc: float,
    ic_ca: float,
    ic_pp: float,
    ic_pa: float,
    p_nis_a: float,
    p_nis_c: float,
) -> float:
    """
    Calculates the predicted binding affinity value based on the IC-NIS model.

    A linear model over interface contact counts (charged-charged,
    charged-apolar, polar-polar, polar-apolar) and the percentages of
    apolar/charged non-interacting surface residues, plus an intercept.
    """
    # Accumulate term by term, left to right, so float rounding matches the
    # original single-expression formulation exactly.
    dg = -0.09459 * ic_cc
    dg += -0.10007 * ic_ca
    dg += 0.19577 * ic_pp
    dg += -0.22671 * ic_pa
    dg += 0.18681 * p_nis_a
    dg += 0.13810 * p_nis_c
    dg += -15.9433
    return dg
|
||||
|
||||
|
||||
def NIS(p_nis_c: float, p_nis_p: float, n_int_atoms: float) -> float:
    """
    Calculates the predicted binding affinity value based on the NIS model.

    A linear model over the polar and charged non-interacting surface
    percentages and the number of interface atoms, plus an intercept.
    """
    # Term-by-term accumulation in the original left-to-right order to keep
    # floating-point results bit-identical.
    dg = 0.0856851248873 * p_nis_p
    dg += -0.0685254498746 * p_nis_c
    dg += 0.0261591389985 * n_int_atoms
    dg += 3.0124939659498
    return dg
|
||||
187
src/prodigy_prot/modules/parsers.py
Normal file
187
src/prodigy_prot/modules/parsers.py
Normal file
@@ -0,0 +1,187 @@
|
||||
"""
|
||||
Functions to read PDB/mmCIF files
|
||||
"""
|
||||
|
||||
import logging
|
||||
import sys
|
||||
import typing
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
from typing import Optional, Union
|
||||
|
||||
from Bio.PDB.Atom import DisorderedAtom
|
||||
from Bio.PDB.Chain import Chain
|
||||
from Bio.PDB.MMCIFParser import MMCIFParser
|
||||
from Bio.PDB.Model import Model
|
||||
from Bio.PDB.PDBExceptions import PDBConstructionWarning
|
||||
from Bio.PDB.PDBParser import PDBParser
|
||||
from Bio.PDB.Polypeptide import PPBuilder, is_aa
|
||||
from Bio.PDB.Structure import Structure
|
||||
|
||||
warnings.filterwarnings("ignore", category=PDBConstructionWarning)
|
||||
log = logging.getLogger("Prodigy")
|
||||
|
||||
|
||||
def get_parser(input_f: Path) -> Union[PDBParser, MMCIFParser]:
    """Pick a Biopython parser from the file extension: mmCIF for '.cif', PDB otherwise."""
    if input_f.suffix == ".cif":
        return MMCIFParser()
    return PDBParser()
|
||||
|
||||
|
||||
def ignore(r):
    """True when the residue's hetfield starts with 'W' (water) or 'H' (HETATM)."""
    return r.id[0][0] in ("W", "H")
|
||||
|
||||
|
||||
def validate_structure(
    input_strcture_obj: Structure,  # (sic) misspelled, kept for interface compatibility
    selection: Optional[list[str]] = None,
    clean: bool = True,
) -> list[Model]:
    """
    Sanitise a parsed structure in place and return its models.

    For every model: keeps only selected chains (each selection entry is a
    comma-separated chain group), collapses double occupancies to the selected
    altloc, drops residues with insertion codes, optionally removes
    waters/HETATMs and hydrogens, and warns when chain breaks are detected.

    Raises ValueError for selected chains missing from the structure or for
    non-standard amino acids (when `clean` is True).
    """

    result: list[Model] = []
    for model in [m for m in input_strcture_obj.child_list]:

        # process selected chains
        chains: list[Chain] = list(model.get_chains())
        chain_ids = set([c.id for c in chains])

        if selection:
            sel_chains = []
            # Match selected chain with structure
            for sel in selection:
                for c_str in sel.split(","):
                    sel_chains.append(c_str)
                    if c_str not in chain_ids:
                        raise ValueError(
                            f"Selected chain not present in provided structure: {c_str}"
                        )

            # Remove unselected chains
            def _ignore_helper(x) -> bool:
                return x.id not in sel_chains

            for c in chains:
                if _ignore_helper(c):
                    if c.parent is not None:
                        c.parent.detach_child(c.id)

        # Double occupancy check: replace each disordered atom with its
        # selected altloc, normalised to a blank altloc id.
        for atom in list(model.get_atoms()):
            if atom.is_disordered():
                atom = typing.cast(DisorderedAtom, atom)
                residue = atom.parent
                assert residue is not None
                sel_at = atom.selected_child
                assert sel_at is not None
                sel_at.altloc = " "
                sel_at.disordered_flag = 0
                residue.detach_child(atom.id)
                residue.add(sel_at)

        # Insertion code check: drop residues carrying an insertion code.
        # NOTE(review): detaching while iterating get_residues() may skip the
        # residue following each removal — confirm against Biopython's
        # generator semantics.
        for c in chains:
            for residue in c.get_residues():
                if residue.get_id()[2] != " ":
                    c.detach_child(residue.id)

        if clean:
            # Remove HETATMs and solvent
            res_list = list(model.get_residues())

            for res in res_list:
                if ignore(res):
                    chain = res.parent
                    assert chain is not None
                    chain.detach_child(res.id)
                elif not is_aa(res, standard=True):
                    raise ValueError(
                        "Unsupported non-standard amino acid found: {0}".format(
                            res.resname
                        )
                    )

            # Remove Hydrogens
            atom_list = list(model.get_atoms())

            def _ignore(x):
                return x.element == "H"

            for atom in atom_list:
                if _ignore(atom):
                    residue = atom.parent
                    assert residue is not None
                    residue.detach_child(atom.name)

        # Detect gaps and compare with no. of chains
        # NOTE(review): chain_ids was computed before chain removal, so with a
        # chain selection this comparison can misfire — verify intended.
        pep_builder = PPBuilder()
        peptides = pep_builder.build_peptides(model)
        n_peptides = len(peptides)

        if n_peptides != len(chain_ids):
            message = "[!] Structure contains gaps:\n"
            for i_pp, pp in enumerate(peptides):
                message += (
                    "\t{1.parent.id} {1.resname}{1.id[1]} < Fragment {0} > "
                    "{2.parent.id} {2.resname}{2.id[1]}\n".format(i_pp, pp[0], pp[-1])
                )
            log.warning(message)

        result.append(model)

    return result
|
||||
|
||||
|
||||
def parse_structure(path: str) -> tuple[list[Model], int, int]:
    """
    Parse and validate a structure file.

    Returns a tuple of (validated models, number of chains, number of
    residues). All models are required to share the same chain ids and
    residue ids, otherwise ValueError is raised. Exits the process on an
    unsupported extension or a parser failure.
    """

    struct_path = Path(path)

    supported_extensions = [".pdb", ".cif", ".ent"]
    if struct_path.suffix not in supported_extensions:
        # Message now lists every supported extension (was missing '.ent').
        log.error(
            f"[!] Structure format '{struct_path.suffix}' is "
            "not supported. Use '.pdb', '.ent' or '.cif'."
        )
        sys.exit(1)

    parser = get_parser(struct_path)
    try:
        original_structure = parser.get_structure(struct_path.stem, struct_path)
    except Exception as e:
        log.exception(e)
        sys.exit(1)

    assert isinstance(original_structure, Structure)

    models: list[Model] = validate_structure(original_structure)
    # Guard against an empty structure; previously chain_sets[0] below would
    # raise an opaque IndexError.
    if not models:
        raise ValueError(f"No models found in structure: {path}")

    ## Make sure all models have the same chains
    chain_sets = [set(chain.id for chain in model.get_chains()) for model in models]
    if not all(chain_set == chain_sets[0] for chain_set in chain_sets):
        raise ValueError(
            "Not all models have the same chains. Found chain sets: "
            + ", ".join(str(s) for s in chain_sets)
        )

    ## ... and the same residues
    res_sets = [set(res.id for res in model.get_residues()) for model in models]
    if not all(res_set == res_sets[0] for res_set in res_sets):
        raise ValueError(
            "Not all models have the same residues. Found residue sets: "
            + ", ".join(str(s) for s in res_sets)
        )

    return (models, len(chain_sets[0]), len(res_sets[0]))
|
||||
301
src/prodigy_prot/modules/prodigy.py
Normal file
301
src/prodigy_prot/modules/prodigy.py
Normal file
@@ -0,0 +1,301 @@
|
||||
import sys
|
||||
from io import TextIOWrapper
|
||||
from typing import Optional, TextIO, Union
|
||||
|
||||
from Bio.PDB.Model import Model
|
||||
from Bio.PDB.NeighborSearch import NeighborSearch
|
||||
#from Bio.PDB.Structure import Structure
|
||||
|
||||
from prodigy_prot.modules import aa_properties
|
||||
from prodigy_prot.modules.freesasa_tools import execute_freesasa_api
|
||||
from prodigy_prot.modules.models import IC_NIS
|
||||
from prodigy_prot.modules.utils import dg_to_kd
|
||||
|
||||
|
||||
def calculate_ic(
    model: Model, d_cutoff: float = 5.5, selection: Optional[dict[str, int]] = None
) -> list:
    """
    Calculates intermolecular contacts in a parsed struct object.

    Returns the sorted list of residue pairs within `d_cutoff` Å that belong
    to different chains (or, when `selection` maps chain id -> group index,
    to different selection groups). Raises ValueError when none are found.
    """
    neighbor_search = NeighborSearch(list(model.get_atoms()))
    residue_pairs = neighbor_search.search_all(radius=d_cutoff, level="R")

    assert residue_pairs is not None

    if selection:
        groups = selection

        def _group(residue):
            return groups.get(residue.parent.id)

        # Keep pairs where both chains are selected and sit in distinct groups.
        ic_list = [
            pair
            for pair in residue_pairs
            if _group(pair[0]) is not None
            and _group(pair[1]) is not None
            and _group(pair[0]) != _group(pair[1])
        ]
    else:
        # No selection: any inter-chain pair counts.
        ic_list = [
            pair for pair in residue_pairs if pair[0].parent.id != pair[1].parent.id
        ]

    if not ic_list:
        raise ValueError("No contacts found for selection")

    ic_list.sort()
    return ic_list
|
||||
|
||||
|
||||
def analyse_contacts(contact_list: list) -> dict[str, float]:
    """
    Enumerates and classifies contacts based on the chemical characteristics
    of the participating amino acids.

    Each residue pair is classified twice: once with the A/C/P
    (apolar/charged/polar) alphabet and once with the B/L
    (hydrophoBic/hydrophiLic) alphabet. Pairs containing a residue missing
    from an alphabet are skipped for that alphabet.
    """

    bins = {
        "AA": 0.0,
        "PP": 0.0,
        "CC": 0.0,
        "AP": 0.0,
        "CP": 0.0,
        "AC": 0.0,
        "LL": 0.0,
        "BL": 0.0,
        "BB": 0.0
    }

    # Single pass per alphabet instead of two duplicated loops.
    for character_map in (
        aa_properties.aa_character_ic,
        aa_properties.aa_character_hydro,
    ):
        for res_i, res_j in contact_list:
            char_i = character_map.get(res_i.resname)
            char_j = character_map.get(res_j.resname)
            if char_i is not None and char_j is not None:
                # Sort so e.g. ("P", "A") and ("A", "P") both land in "AP".
                bins["".join(sorted((char_i, char_j)))] += 1

    return bins
|
||||
|
||||
|
||||
def analyse_nis(sasa_dict: dict, acc_threshold: float = 0.05) -> list[float]:
    """
    Returns the percentages of apolar, polar, and charged
    residues at the interface, according to an accessibility
    criterion.

    Parameters
    ----------
    sasa_dict : mapping of (chain, resname, resid) -> relative ASA.
    acc_threshold : minimum relative ASA for a residue to count as
        solvent accessible.

    Returns
    -------
    [apolar%, charged%, polar%] over all residues at or above the threshold.

    Raises
    ------
    ValueError if no residue reaches the threshold (previously this
    surfaced as an opaque ZeroDivisionError).
    KeyError if a residue name is missing from the PROTORP table.
    """

    _data = aa_properties.aa_character_protorp

    # Slot per chemical character: apolar, charged, polar.
    _index = {"A": 0, "C": 1, "P": 2}

    count = [0, 0, 0]
    for res_uid, rsa in sasa_dict.items():
        _, resname, _ = res_uid
        if rsa >= acc_threshold:
            count[_index[_data[resname]]] += 1

    # Hoisted out of the comprehension so the sum is computed once.
    total = sum(count)
    if total == 0:
        raise ValueError("No accessible residues above threshold")

    return [100.0 * x / total for x in count]
|
||||
|
||||
|
||||
class Prodigy:
    """
    Driver object for a PRODIGY binding-affinity prediction on one model.

    Holds the model, the chain-group selection and the temperature, and after
    `predict()` exposes the contact network, contact-class counts, NIS
    percentages, predicted ΔG (`ba_val`, kcal/mol) and Kd (`kd_val`, M).
    """

    # init parameters
    def __init__(
        self,
        model: Model,
        name: str = "",
        selection: Optional[list[str]] = None,
        temp: float = 25.0,  # temperature in °C, used for the Kd conversion
    ):
        self.temp = float(temp)
        # Default selection: every chain forms its own group.
        if selection is None:
            self.selection = [chain.id for chain in model.get_chains()]
        else:
            self.selection = selection
        self.model = model
        self.name = name
        # Residue-pair contact list, filled by predict().
        self.ic_network: list = []
        # Contact counts per chemical-character class (see analyse_contacts).
        self.bins: dict[str, float] = {
            "CC": 0.0,
            "CP": 0.0,
            "AC": 0.0,
            "PP": 0.0,
            "AP": 0.0,
            "AA": 0.0,
            "LL": 0.0,
            "BL": 0.0,
            "BB": 0.0
        }

        # NIS percentages (apolar/charged/polar) and prediction outputs.
        self.nis_a = 0.0
        self.nis_c = 0.0
        self.nis_p = 0.0
        self.ba_val = 0.0
        self.kd_val = 0.0

    def predict(
        self,
        temp: Optional[float] = None,
        distance_cutoff: float = 5.5,
        acc_threshold: float = 0.05,
    ):
        """
        Run the full pipeline: contacts -> contact classes -> SASA/NIS ->
        IC-NIS affinity and Kd. Results are stored on the instance.

        Raises ValueError when selection groups overlap or no contacts exist.
        """
        if temp is not None:
            self.temp = temp
        # Make selection dict from user option or PDB chains:
        # chain id -> group index; chains in the same group never count
        # as intermolecular partners.
        selection_dict: dict[str, int] = {}
        for igroup, group in enumerate(self.selection):
            chains = group.split(",")
            for chain in chains:
                if chain in selection_dict:
                    errmsg = "Selections must be disjoint sets: " f"{chain} is repeated"
                    raise ValueError(errmsg)
                selection_dict[chain] = igroup

        # Contacts
        self.ic_network = calculate_ic(
            self.model, d_cutoff=distance_cutoff, selection=selection_dict
        )

        self.bins = analyse_contacts(self.ic_network)
        # SASA
        _, cmplx_sasa = execute_freesasa_api(self.model)
        self.nis_a, self.nis_c, self.nis_p = analyse_nis(cmplx_sasa, acc_threshold=acc_threshold)

        # Affinity Calculation
        self.ba_val = IC_NIS(
            self.bins["CC"],
            self.bins["AC"],
            self.bins["PP"],
            self.bins["AP"],
            self.nis_a,
            self.nis_c,
        )
        self.kd_val = dg_to_kd(self.ba_val, self.temp)

    def as_dict(self) -> dict:
        """Return inputs and prediction results (including contact bins) as a flat dict."""
        return_dict = {
            "model": self.model.id,
            "selection": self.selection,
            "temp": self.temp,
            "ICs": len(self.ic_network),
            "nis_a": self.nis_a,
            "nis_c": self.nis_c,
            "nis_p": self.nis_p,
            "ba_val": self.ba_val,
            "kd_val": self.kd_val,
        }
        return_dict.update(self.bins)
        return return_dict

    def print_prediction(self, outfile: str = "", quiet: bool = False, showall: bool = False) -> None:
        """
        Write the prediction report to `outfile` (or stdout).

        `quiet` prints only "name<TAB>ΔG"; `showall` adds the hydrophobicity
        bins and the polar NIS percentage.
        """
        handle: Union[TextIOWrapper, TextIO]
        if outfile:
            handle = open(outfile, "w")
        else:
            handle = sys.stdout

        if quiet:
            handle.write("{0}\t{1:8.3f}\n".format(self.name, self.ba_val))
        else:
            # Collect output lines in order
            lines = []
            lines.append(f"[+] No. of intermolecular contacts: {len(self.ic_network)}\n")
            lines.append(f"[+] No. of Charged-Charged contacts: {self.bins['CC']}\n")
            lines.append(f"[+] No. of Charged-Polar contacts: {self.bins['CP']}\n")
            lines.append(f"[+] No. of Charged-Apolar contacts: {self.bins['AC']}\n")
            lines.append(f"[+] No. of Polar-Polar contacts: {self.bins['PP']}\n")
            lines.append(f"[+] No. of Apolar-Polar contacts: {self.bins['AP']}\n")
            lines.append(f"[+] No. of Apolar-Apolar contacts: {self.bins['AA']}\n")

            if showall:
                lines.append(f"[+] No. of hydrophiLic-hydrophiLic contacts: {self.bins['LL']}\n")
                lines.append(f"[+] No. of hydrophoBic-hydrophiLic contacts: {self.bins['BL']}\n")
                lines.append(f"[+] No. of hydrophoBic-hydrophoBic contacts: {self.bins['BB']}\n")
                lines.append(f"[+] Percentage of Polar NIS residues: {self.nis_p:3.2f}\n")

            lines.append(f"[+] Percentage of Apolar NIS residues: {self.nis_a:3.2f}\n")
            lines.append(f"[+] Percentage of Charged NIS residues: {self.nis_c:3.2f}\n")
            lines.append(f"[++] predicted binding affinity (kcal.mol-1): {self.ba_val:8.1f}\n")
            lines.append(f"[++] predicted dissociation constant (M) at {self.temp:.1f}˚C: {self.kd_val:8.1e}\n")

            handle.writelines(lines)

        if handle is not sys.stdout:
            handle.close()

    def print_contacts(self, outfile: str = "") -> None:
        """
        Write one line per contact pair, with the residue from the first
        selection group always in the left column.
        """
        handle: Union[TextIOWrapper, TextIO]
        if outfile:
            handle = open(outfile, "w")
        else:
            handle = sys.stdout

        for res1, res2 in self.ic_network:
            _fmt_str = (
                "{0.resname:>5s} {0.id[1]:5} {0.parent.id:>3s} {1.resname:>5s}"
                " {1.id[1]:5} {1.parent.id:>3s}\n"
            )
            # NOTE(review): `in` here is a substring test against the group
            # string (e.g. "A,B"); fine for 1-char chain ids, but multi-char
            # ids could match spuriously — confirm.
            if res1.parent.id not in self.selection[0]:
                res1, res2 = res2, res1
            handle.write(_fmt_str.format(res1, res2))

        if handle is not sys.stdout:
            handle.close()

    def print_pymol_script(self, outfile: str = "") -> None:
        """
        Write a PyMOL .pml script colouring the two partners and their
        interface residues.

        NOTE(review): indexes selection_strings[1] and iterates exactly two
        interfaces, so this assumes exactly two selection groups — an
        IndexError will occur otherwise.
        """
        # Writing output PYMOL: pml script
        # initialize array with chains and save chain selection string
        selection_strings = []
        chains: dict[str, set] = {}
        for s in self.selection:
            selection_strings.append(s.replace(",", "+"))
            for c in s.split(","):
                chains[c] = set()

        # loop over pairs and add interface residues to respective chains
        for pair in self.ic_network:
            for r in pair:
                chains[r.parent.id].add(str(r.id[1]))

        # set output stream
        handle = open(outfile, "w") if outfile else sys.stdout

        # write default setup strings
        handle.writelines(
            [
                "color silver\n",
                "as cartoon\n",
                "bg_color white\n",
                "center\n",
                "color lightblue, chain {}\n".format(selection_strings[0]),
                "color lightpink, chain {}\n".format(selection_strings[1]),
            ]
        )

        # loop over interfaces construct selection strings
        # and write interface related commands
        for color, iface in [("blue", 1), ("hotpink", 2)]:
            p_sel_string = " or ".join(
                [
                    "chain {} and resi {}".format(c, "+".join(chains[c]))
                    for c in selection_strings[iface - 1].split("+")
                ]
            )
            handle.write("select iface{}, {}\n".format(iface, p_sel_string))
            handle.write("color {}, iface{}\n".format(color, iface))
            handle.write("show sticks, iface{}\n".format(iface))

        # close file handle if applicable
        if handle is not sys.stdout:
            handle.close()
|
||||
25
src/prodigy_prot/modules/utils.py
Normal file
25
src/prodigy_prot/modules/utils.py
Normal file
@@ -0,0 +1,25 @@
|
||||
"""
|
||||
Assorted utility functions.
|
||||
"""
|
||||
|
||||
import math
|
||||
import os
|
||||
|
||||
|
||||
def check_path(path: str) -> str:
    """
    Checks if a file is readable.

    Returns the absolute path; raises IOError when the path is not an
    existing regular file.
    """
    resolved = os.path.abspath(path)
    if not os.path.isfile(resolved):
        raise IOError("Could not read file: {0}".format(path))
    return resolved
|
||||
|
||||
|
||||
def dg_to_kd(dg: float, temperature: float = 25.0) -> float:
    """Conversion of ΔG (kcal/mol) into the dissociation constant Kd, via Kd = exp(ΔG / RT)."""
    # R in kcal·mol⁻¹·K⁻¹ times the absolute temperature.
    kelvin = temperature + 273.15
    rt_product = 0.0019858775 * kelvin
    return math.exp(dg / rt_product)
|
||||
3
tests/__init__.py
Normal file
3
tests/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
from pathlib import Path

# Directory holding the fixture structures shared by the test suite.
TEST_DATA = Path(__file__).parent / "test_data"
|
||||
3020
tests/test_data/2oob.cif
Normal file
3020
tests/test_data/2oob.cif
Normal file
File diff suppressed because it is too large
Load Diff
1449
tests/test_data/2oob.pdb
Normal file
1449
tests/test_data/2oob.pdb
Normal file
File diff suppressed because it is too large
Load Diff
1460
tests/test_data/dataset.json
Normal file
1460
tests/test_data/dataset.json
Normal file
File diff suppressed because it is too large
Load Diff
78
tests/test_parsers.py
Normal file
78
tests/test_parsers.py
Normal file
@@ -0,0 +1,78 @@
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from Bio.PDB.MMCIFParser import MMCIFParser
|
||||
from Bio.PDB.PDBParser import PDBParser
|
||||
from Bio.PDB.Structure import Structure
|
||||
|
||||
from prodigy_prot.modules.parsers import get_parser, parse_structure, validate_structure
|
||||
|
||||
from . import TEST_DATA
|
||||
|
||||
|
||||
@pytest.fixture
def input_structure_cif() -> Path:
    """Path to the 2oob test structure in mmCIF format."""
    # `return` (not `yield`) for consistency with input_structure_pdb;
    # there is no teardown to run.
    return Path(TEST_DATA, "2oob.cif")
|
||||
|
||||
|
||||
@pytest.fixture
def input_structure_pdb() -> Path:
    """Path to the 2oob test structure in PDB format."""
    return Path(TEST_DATA, "2oob.pdb")
|
||||
|
||||
|
||||
def test_get_parser_pdb(input_structure_pdb):
    """A '.pdb' path must select the PDBParser."""
    assert isinstance(get_parser(input_structure_pdb), PDBParser)
|
||||
|
||||
|
||||
def test_get_parser_cif(input_structure_cif):
    """A '.cif' path must select the MMCIFParser."""
    assert isinstance(get_parser(input_structure_cif), MMCIFParser)
|
||||
|
||||
|
||||
def test_validate_structure_pdb(input_structure_pdb):
    """validate_structure returns the model list of a clean PDB structure."""
    parsed = PDBParser().get_structure("test_structure", input_structure_pdb)
    assert isinstance(parsed, Structure)

    assert validate_structure(parsed) == parsed.child_list
|
||||
|
||||
|
||||
def test_validate_structure_cif(input_structure_cif):
    """validate_structure returns the model list of a clean mmCIF structure."""
    parsed = MMCIFParser().get_structure("test_structure", input_structure_cif)
    assert isinstance(parsed, Structure)

    assert validate_structure(parsed) == parsed.child_list
|
||||
|
||||
|
||||
def test_parse_structure_pdb(input_structure_pdb):
|
||||
|
||||
parser = PDBParser()
|
||||
structure = parser.get_structure(input_structure_pdb.stem, input_structure_pdb)
|
||||
assert isinstance(structure, Structure)
|
||||
|
||||
result, num_chains, num_res = parse_structure(input_structure_pdb)
|
||||
|
||||
assert result == structure.child_list
|
||||
assert num_chains == 2
|
||||
assert num_res == 116
|
||||
|
||||
|
||||
def test_parse_structure_cif(input_structure_cif):
|
||||
|
||||
parser = MMCIFParser()
|
||||
structure = parser.get_structure(input_structure_cif.stem, input_structure_cif)
|
||||
assert isinstance(structure, Structure)
|
||||
|
||||
result, num_chains, num_res = parse_structure(input_structure_cif)
|
||||
|
||||
assert result == structure.child_list
|
||||
assert num_chains == 2
|
||||
assert num_res == 116
|
||||
239
tests/test_prodigy.py
Normal file
239
tests/test_prodigy.py
Normal file
@@ -0,0 +1,239 @@
|
||||
import json
|
||||
import tarfile
|
||||
import tempfile
|
||||
from io import BufferedReader, TextIOWrapper
|
||||
from os.path import basename, splitext
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from Bio.PDB.Model import Model
|
||||
from Bio.PDB.PDBParser import PDBParser
|
||||
from Bio.PDB.Residue import Residue
|
||||
from Bio.PDB.Structure import Structure
|
||||
|
||||
from prodigy_prot.modules.parsers import validate_structure
|
||||
from prodigy_prot.modules.prodigy import (
|
||||
Prodigy,
|
||||
analyse_contacts,
|
||||
analyse_nis,
|
||||
calculate_ic,
|
||||
)
|
||||
|
||||
from . import TEST_DATA
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def input_model():
|
||||
input_f = Path(TEST_DATA, "2oob.pdb")
|
||||
parser = PDBParser()
|
||||
structure = parser.get_structure(input_f.stem, input_f)
|
||||
assert isinstance(structure, Structure)
|
||||
return structure.child_list[0]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def compressed_dataset_f():
|
||||
return Path(TEST_DATA, "dataset.tgz")
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def expected_dataset_json():
|
||||
return Path(TEST_DATA, "dataset.json")
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def prodigy_class(input_model):
|
||||
yield Prodigy(input_model)
|
||||
|
||||
|
||||
def test_calculate_ic(input_model):
|
||||
|
||||
result = calculate_ic(model=input_model, d_cutoff=5.5)
|
||||
|
||||
assert len(result) == 78
|
||||
|
||||
first_hit: tuple[Residue, Residue] = result[0]
|
||||
|
||||
assert first_hit[0].get_resname() == "ASN"
|
||||
assert first_hit[1].get_resname() == "LYS"
|
||||
|
||||
|
||||
def test_calculate_ic_with_selection(input_model):
|
||||
|
||||
result = calculate_ic(model=input_model, d_cutoff=5.5, selection={"A": 0, "B": 1})
|
||||
|
||||
assert len(result) == 78
|
||||
|
||||
first_hit: tuple[Residue, Residue] = result[0]
|
||||
|
||||
assert first_hit[0].get_resname() == "ASN"
|
||||
assert first_hit[1].get_resname() == "LYS"
|
||||
|
||||
|
||||
def test_analyse_contacts(input_model):
|
||||
|
||||
res_a = input_model["A"][(" ", 931, " ")]
|
||||
res_b = input_model["B"][(" ", 6, " ")]
|
||||
contact = (res_a, res_b)
|
||||
|
||||
test_input = [contact]
|
||||
|
||||
result = analyse_contacts(test_input)
|
||||
|
||||
expected_output = {
|
||||
"AA": 0.0,
|
||||
"PP": 0.0,
|
||||
"CC": 0.0,
|
||||
"AP": 0.0,
|
||||
"CP": 1.0,
|
||||
"AC": 0.0,
|
||||
"LL": 1.0,
|
||||
"BL": 0.0,
|
||||
"BB": 0.0
|
||||
}
|
||||
assert result == expected_output
|
||||
|
||||
|
||||
def test_analyse_nis():
|
||||
|
||||
test_input = {("B", "ARG", "72"): 0.9}
|
||||
apolar, polar, charged = analyse_nis(test_input)
|
||||
|
||||
assert apolar == 0.0
|
||||
assert polar == 100.0
|
||||
assert charged == 0.0
|
||||
|
||||
|
||||
def test_prodigy_predict(prodigy_class):
|
||||
|
||||
prodigy_class.predict()
|
||||
|
||||
assert prodigy_class.nis_a == pytest.approx(35.5, abs=1.0)
|
||||
assert prodigy_class.nis_c == pytest.approx(38.0, abs=1.0)
|
||||
assert prodigy_class.ba_val == pytest.approx(-6.2, abs=1.0)
|
||||
|
||||
# This is the actual prediction
|
||||
assert prodigy_class.kd_val == pytest.approx(2.7e-5, abs=1e-6)
|
||||
|
||||
|
||||
def test_prodigy_as_dict(prodigy_class):
|
||||
|
||||
result = prodigy_class.as_dict()
|
||||
|
||||
assert isinstance(result, dict)
|
||||
# 14 'original' + 3 hydro + 1 %NIS
|
||||
assert len(result) == 18
|
||||
|
||||
|
||||
def test_prodigy_print_prediction(prodigy_class):
|
||||
|
||||
outfile = tempfile.NamedTemporaryFile(delete=False)
|
||||
assert Path(outfile.name).stat().st_size == 0
|
||||
|
||||
prodigy_class.print_prediction(outfile.name)
|
||||
assert Path(outfile.name).stat().st_size != 0
|
||||
|
||||
Path(outfile.name).unlink()
|
||||
|
||||
|
||||
def test_prodigy_print_prediction_quiet(prodigy_class):
|
||||
|
||||
outfile = tempfile.NamedTemporaryFile(delete=False)
|
||||
assert Path(outfile.name).stat().st_size == 0
|
||||
|
||||
prodigy_class.print_prediction(outfile.name, True)
|
||||
assert Path(outfile.name).stat().st_size != 0
|
||||
|
||||
Path(outfile.name).unlink()
|
||||
|
||||
|
||||
def test_prodigy_print_contacts(input_model, prodigy_class):
|
||||
|
||||
res_a = input_model["A"][(" ", 931, " ")]
|
||||
res_b = input_model["B"][(" ", 6, " ")]
|
||||
prodigy_class.ic_network = [(res_a, res_b)]
|
||||
|
||||
outfile = tempfile.NamedTemporaryFile(delete=False)
|
||||
assert Path(outfile.name).stat().st_size == 0
|
||||
|
||||
prodigy_class.print_contacts(outfile.name)
|
||||
assert Path(outfile.name).stat().st_size != 0
|
||||
|
||||
Path(outfile.name).unlink()
|
||||
|
||||
|
||||
def test_print_pymol_script(input_model, prodigy_class):
|
||||
res_a = input_model["A"][(" ", 931, " ")]
|
||||
res_b = input_model["B"][(" ", 6, " ")]
|
||||
prodigy_class.ic_network = [(res_a, res_b)]
|
||||
|
||||
outfile = tempfile.NamedTemporaryFile(delete=False)
|
||||
assert Path(outfile.name).stat().st_size == 0
|
||||
|
||||
prodigy_class.print_pymol_script(outfile.name)
|
||||
assert Path(outfile.name).stat().st_size != 0
|
||||
|
||||
Path(outfile.name).unlink()
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
def test_dataset_prediction(compressed_dataset_f, expected_dataset_json):
|
||||
"""
|
||||
Test method to compare prediction for 80 dataset cases with
|
||||
expected values.
|
||||
"""
|
||||
# load expected data from json
|
||||
with open(expected_dataset_json) as fh:
|
||||
expected_data = json.load(fh)
|
||||
|
||||
# load dataset PDBs
|
||||
dataset = tarfile.open(compressed_dataset_f)
|
||||
parser = PDBParser(QUIET=True)
|
||||
|
||||
keys_equal = ["AA", "PP", "CC", "AP", "CP", "AC"]
|
||||
diffs = {"ba_val": [], "nis_a": [], "nis_c": []}
|
||||
|
||||
# run prodigy for each dataset in the PDB
|
||||
for entry in dataset:
|
||||
s_name, s_ext = splitext(basename(entry.name))
|
||||
|
||||
# skip system files in archive
|
||||
if not s_name.isalnum() or s_ext != ".pdb":
|
||||
continue
|
||||
|
||||
handle = dataset.extractfile(entry)
|
||||
|
||||
# Wrap filehandle to ensure string file handle in Python 3
|
||||
handle = TextIOWrapper(BufferedReader(handle)) # type: ignore
|
||||
|
||||
parsed_structure = parser.get_structure(s_name, handle)
|
||||
assert isinstance(parsed_structure, Structure)
|
||||
|
||||
models = validate_structure(parsed_structure, selection=["A", "B"])
|
||||
|
||||
# Test for structure object
|
||||
# Check if it's a list and all elements are Model objects
|
||||
assert isinstance(models, list) and all(
|
||||
isinstance(item, Model) for item in models
|
||||
)
|
||||
# assert isinstance(s, list[Model])
|
||||
|
||||
# run prediction and retrieve result dict
|
||||
for m in models:
|
||||
prod = Prodigy(m, selection=["A", "B"])
|
||||
prod.predict()
|
||||
results = prod.as_dict()
|
||||
|
||||
# check for equality of prdicted interface residues
|
||||
for k in keys_equal:
|
||||
observed_value = results[k]
|
||||
expected_value = expected_data[s_name][k]
|
||||
assert observed_value == pytest.approx(expected_value)
|
||||
|
||||
# check that NIS and binding afinity values are within 2% of
|
||||
# expected values and add diffs for summary
|
||||
for k in diffs.keys():
|
||||
delta = abs(results[k] / expected_data[s_name][k] - 1)
|
||||
# assume a difference of less then 2%
|
||||
assert delta == pytest.approx(0, abs=0.02)
|
||||
diffs[k].append(delta)
|
||||
21
tests/test_utils.py
Normal file
21
tests/test_utils.py
Normal file
@@ -0,0 +1,21 @@
|
||||
import math
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
from prodigy_prot.modules.utils import check_path, dg_to_kd
|
||||
|
||||
|
||||
def test_check_path():
|
||||
|
||||
temp_f = tempfile.NamedTemporaryFile(delete=False)
|
||||
|
||||
result = check_path(temp_f.name)
|
||||
|
||||
assert result == temp_f.name
|
||||
|
||||
Path(temp_f.name).unlink()
|
||||
|
||||
|
||||
def test_dg_to_kd():
|
||||
|
||||
assert math.isclose(dg_to_kd(0.0), 1.0, rel_tol=1e-9)
|
||||
Reference in New Issue
Block a user