Configure PRODIGY pipeline for WES execution with S3 and Harbor
Some checks failed
ci / test (3.10) (push) Has been cancelled
ci / test (3.11) (push) Has been cancelled
ci / test (3.12) (push) Has been cancelled
ci / test (3.13) (push) Has been cancelled
ci / test (3.9) (push) Has been cancelled

This commit is contained in:
2026-03-17 16:38:16 +01:00
commit 19fd443501
38 changed files with 16328 additions and 0 deletions

49
.github/workflows/ci.yml vendored Normal file
View File

@@ -0,0 +1,49 @@
---
# CI: type-check with mypy, then run unit and integration test suites with
# coverage, and upload the combined report to Codacy.
name: ci

on: push  # yamllint disable-line rule:truthy

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
      # Let the remaining Python versions finish even if one fails.
      fail-fast: false
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
      - run: pip install '.[dev]'
      - name: check types
        run: mypy .
      - name: run unittests
        run: >-
          pytest
          -m "not integration"
          --cov
          --cov-report xml:coverage.xml
          --cov-append
          -vv
          --hypothesis-show-statistics
      - name: run integration tests
        run: >-
          pytest
          -m integration
          --cov
          --cov-report xml:coverage.xml
          --cov-append
          -vv
          --hypothesis-show-statistics
      - name: Run codacy-coverage-reporter
        uses: codacy/codacy-coverage-reporter-action@v1
        with:
          project-token: ${{ secrets.CODACY_PROJECT_TOKEN }}
          coverage-reports: coverage.xml

48
.github/workflows/docker-publish.yml vendored Normal file
View File

@@ -0,0 +1,48 @@
#
name: Create and publish a Docker image
on:
push:
# run only against tags
tags:
- "*"
env:
REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}
# There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu.
jobs:
build-and-push-image:
runs-on: ubuntu-latest
# Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job.
permissions:
contents: read
packages: write
steps:
- name: Checkout repository
uses: actions/checkout@v4
# Uses the `docker/login-action` action to log in to the Container registry registry using the account and password that will publish the packages. Once published, the packages are scoped to the account defined here.
- name: Log in to the Container registry
uses: docker/login-action@v3
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
# This step uses [docker/metadata-action](https://github.com/docker/metadata-action#about) to extract tags and labels that will be applied to the specified image. The `id` "meta" allows the output of this step to be referenced in a subsequent step. The `images` value provides the base name for the tags and labels.
- name: Extract metadata (tags, labels) for Docker
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
- name: Build and push
uses: docker/build-push-action@v5
with:
context: .
push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}

34
.github/workflows/publish.yml vendored Normal file
View File

@@ -0,0 +1,34 @@
---
# Build an sdist/wheel and publish to PyPI whenever a GitHub release is
# published. Uses PyPI trusted publishing (OIDC) instead of an API token.
name: publish to pypi

on:  # yamllint disable-line rule:truthy
  release:
    types: [published]

jobs:
  pypi_release:
    name: builds and publishes to pypi
    runs-on: ubuntu-latest
    environment:
      name: pypi
      url: https://pypi.org/p/prodigy-prot
    permissions:
      # Required for OIDC-based trusted publishing.
      id-token: write
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.13"
      - name: install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install --upgrade build
      - name: build
        run: |
          python -m build
      - name: Publish package distributions to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1

27
.github/workflows/stale.yml vendored Normal file
View File

@@ -0,0 +1,27 @@
---
# Nightly housekeeping: mark issues/PRs stale after 30 days of inactivity and
# close them 5 days later. Items labelled "bug" are exempt.
name: "Close stale issues and PRs"

on:  # yamllint disable-line rule:truthy
  schedule:
    - cron: "30 1 * * *"
  workflow_dispatch:

jobs:
  stale:
    runs-on: ubuntu-latest
    permissions:
      contents: write
      issues: write
      pull-requests: write
      actions: write
    steps:
      - uses: actions/stale@v10
        with:
          stale-pr-message: "This PR is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 5 days."
          stale-issue-message: "This issue is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 5 days."
          close-pr-message: "This PR was closed because it has been stalled for 5 days with no activity."
          close-issue-message: "This issue was closed because it has been stalled for 5 days with no activity."
          days-before-stale: 30
          days-before-close: 5
          exempt-issue-labels: "bug"
          exempt-pr-labels: "bug"
          remove-stale-when-updated: true
          operations-per-run: 100

13
.gitignore vendored Normal file
View File

@@ -0,0 +1,13 @@
work/
.nextflow/
.nextflow.log*
*.log.*
results/
__pycache__/
*.pyc
.docker/
.vscode/
.idea/
*.tmp
*.swp
tests/test_data/dataset.tgz

9
.howfairis.yml Normal file
View File

@@ -0,0 +1,9 @@
# Uncomment a line if you want to skip a given category of checks
#skip_repository_checks_reason: <reason for skipping goes here>
#skip_license_checks_reason: <reason for skipping goes here>
#skip_registry_checks_reason: <reason for skipping goes here>
#skip_citation_checks_reason: <reason for skipping goes here>
skip_checklist_checks_reason: "I'm using the Codacy dashboard to guide my development"
ignore_commented_badges: false

47
CITATION.cff Normal file
View File

@@ -0,0 +1,47 @@
# This CITATION.cff file was generated with cffinit.
# Visit https://bit.ly/cffinit to generate yours today!
cff-version: 1.2.0
title: Prodigy
message: >-
  If you use this software, please cite it using the
  metadata from this file.
type: software
authors:
  - given-names: Anna
    family-names: Vangone
    affiliation: Utrecht University
  - given-names: Alexandre
    name-particle: MJJ
    family-names: Bonvin
    affiliation: Utrecht University
  - given-names: Joerg
    family-names: Schaarschmidt
    affiliation: Utrecht University
  - given-names: Rodrigo
    family-names: Vargas Honorato
    affiliation: Utrecht University
  - given-names: Brian
    family-names: Jimenez
    affiliation: Utrecht University
  - given-names: Joao
    family-names: Rodrigues
    affiliation: Utrecht University
identifiers:
  - type: doi
    value: 10.1093/bioinformatics/btw514
    description: DOI of the web service version
  - type: doi
    value: 10.7554/eLife.07454
  - type: doi
    value: 10.1016/j.jmb.2014.04.017
repository-code: 'https://github.com/haddocking/prodigy'
url: 'https://wenmr.science.uu.nl/prodigy'
abstract: >-
  A tool to predict binding affinity values for
  protein-protein complexes from atomic structures.
keywords:
  - binding affinity
  - computational biology
  - protein-protein
license: Apache-2.0

132
CODE_OF_CONDUCT.md Normal file
View File

@@ -0,0 +1,132 @@
# Contributor Covenant Code of Conduct
## Our Pledge
We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
identity and expression, level of experience, education, socio-economic status,
nationality, personal appearance, race, caste, color, religion, or sexual
identity and orientation.
We pledge to act and interact in ways that contribute to an open, welcoming,
diverse, inclusive, and healthy community.
## Our Standards
Examples of behavior that contributes to a positive environment for our
community include:
- Demonstrating empathy and kindness toward other people
- Being respectful of differing opinions, viewpoints, and experiences
- Giving and gracefully accepting constructive feedback
- Accepting responsibility and apologizing to those affected by our mistakes,
and learning from the experience
- Focusing on what is best not just for us as individuals, but for the overall
community
Examples of unacceptable behavior include:
- The use of sexualized language or imagery, and sexual attention or advances of
any kind
- Trolling, insulting or derogatory comments, and personal or political attacks
- Public or private harassment
- Publishing others' private information, such as a physical or email address,
without their explicit permission
- Other conduct which could reasonably be considered inappropriate in a
professional setting
## Enforcement Responsibilities
Community leaders are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate, threatening, offensive,
or harmful.
Community leaders have the right and responsibility to remove, edit, or reject
comments, commits, code, wiki edits, issues, and other contributions that are
not aligned to this Code of Conduct, and will communicate reasons for moderation
decisions when appropriate.
## Scope
This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
Examples of representing our community include using an official e-mail address,
posting via an official social media account, or acting as an appointed
representative at an online or offline event.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement at
`prodigy.bonvinlab@gmail.com`.
All complaints will be reviewed and investigated promptly and fairly.
All community leaders are obligated to respect the privacy and security of the
reporter of any incident.
## Enforcement Guidelines
Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:
### 1. Correction
**Community Impact**: Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.
**Consequence**: A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the
behavior was inappropriate. A public apology may be requested.
### 2. Warning
**Community Impact**: A violation through a single incident or series of
actions.
**Consequence**: A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or permanent
ban.
### 3. Temporary Ban
**Community Impact**: A serious violation of community standards, including
sustained inappropriate behavior.
**Consequence**: A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.
### 4. Permanent Ban
**Community Impact**: Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior, harassment of an
individual, or aggression toward or disparagement of classes of individuals.
**Consequence**: A permanent ban from any sort of public interaction within the
community.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 2.1, available at
[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
Community Impact Guidelines were inspired by
[Mozilla's code of conduct enforcement ladder][Mozilla CoC].
For answers to common questions about this code of conduct, see the FAQ at
[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
[https://www.contributor-covenant.org/translations][translations].
[homepage]: https://www.contributor-covenant.org
[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
[Mozilla CoC]: https://github.com/mozilla/diversity
[FAQ]: https://www.contributor-covenant.org/faq
[translations]: https://www.contributor-covenant.org/translations

17
CONTRIBUTING.md Normal file
View File

@@ -0,0 +1,17 @@
# Contributing to PRODIGY
## Reporting issues
If you find a bug or have a feature request, please report it in the [issue tracker](https://github.com/haddocking/prodigy/issues)
## Contributing code
We welcome contributions to PRODIGY. If you would like to contribute, please fork the repository and make a pull request.
## Development conventions
Please refer to the [development guidelines](DEVELOPMENT.md) for more details.
## Contact
If you have any questions, please contact us at [ask.bioexcel.eu](https://ask.bioexcel.eu)

36
DEVELOPMENT.md Normal file
View File

@@ -0,0 +1,36 @@
# PRODIGY Development
## Installation
We use `poetry` to manage the dependencies and the virtual environment, so you need to install it first; check the [official documentation](https://python-poetry.org/docs/#installation) for more details.
Clone the repository and install the dependencies:
```text
git clone https://github.com/haddocking/prodigy.git && cd prodigy
poetry install
```
## Testing
To run the tests, use the following command:
```text
pytest -m "not integration"
```
## Code style
We use `trunk` as the "all-purpose" linting tool, check its [documentation](https://docs.trunk.io/docs/install).
To check for code style issues, run:
```text
trunk check
```
To automatically fix the issues, run:
```text
trunk fmt
```

40
Dockerfile Normal file
View File

@@ -0,0 +1,40 @@
# Container image for PRODIGY (PROtein binDIng enerGY prediction).
# Installs the published prodigy-prot package from PyPI on top of Python 3.12.
FROM python:3.12
LABEL maintainer="Omic"
LABEL description="PRODIGY - PROtein binDIng enerGY prediction"
LABEL version="2.4.0"
# Set environment variables
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
# Install system dependencies required for freesasa compilation
RUN apt-get update -y && \
apt-get install -y --no-install-recommends \
build-essential \
gcc \
g++ \
make \
procps \
&& rm -rf /var/lib/apt/lists/*
# Upgrade pip
RUN pip install --no-cache-dir --upgrade pip
# Install PRODIGY and its dependencies
# Dependencies: biopython>=1.80, freesasa>=2.2.1, numpy>=2
# Installed in a separate layer before prodigy-prot so dependency changes
# don't invalidate the cached layers unnecessarily.
RUN pip install --no-cache-dir \
"biopython>=1.80" \
"freesasa>=2.2.1" \
"numpy>=2"
# Install PRODIGY
RUN pip install --no-cache-dir prodigy-prot==2.4.0
# Verify installation
# Fails the build early if the prodigy entry point is broken.
RUN prodigy --help
# Set working directory
WORKDIR /data
CMD ["prodigy", "--help"]

190
LICENSE Normal file
View File

@@ -0,0 +1,190 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
Copyright 2015 Anna Vangone, Panagiotis Kastritis, Alexandre Bonvin
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

3
MANIFEST.in Normal file
View File

@@ -0,0 +1,3 @@
include README.md
include src/prodigy_prot/data/naccess.config

341
README.md Normal file
View File

@@ -0,0 +1,341 @@
# PRODIGY Nextflow Pipeline
A Nextflow pipeline for predicting binding affinity of protein-protein complexes using PRODIGY (PROtein binDIng enerGY prediction).
## Overview
PRODIGY is a contact-based method for predicting the binding affinity of protein-protein complexes from their 3D structures. This pipeline containerizes PRODIGY using Docker and orchestrates execution through Nextflow, enabling reproducible, scalable analysis of protein-protein interactions.
### Key Features
- **Automated binding affinity prediction** from PDB/mmCIF structures
- **Batch processing** of multiple protein complexes
- **Docker containerization** for reproducibility
- **Configurable parameters** for distance cutoffs, temperature, and chain selection
- **Optional outputs** including contact lists and PyMOL visualization scripts
## Scientific Background
PRODIGY predicts binding affinity by analyzing intermolecular contacts (ICs) at protein-protein interfaces. The method:
1. Identifies residue-residue contacts within a distance threshold (default: 5.5 Å)
2. Classifies contacts by residue type (charged, polar, apolar)
3. Analyzes the non-interacting surface (NIS) composition
4. Predicts binding free energy (ΔG) and dissociation constant (Kd)
The 5.5 Å distance cutoff was optimized to capture various non-bonded interactions including salt bridges, hydrogen bonds, and hydrophobic contacts.
## Requirements
### Software Dependencies
- [Nextflow](https://www.nextflow.io/) (≥21.04.0)
- [Docker](https://www.docker.com/) (≥20.10) or [Singularity](https://sylabs.io/singularity/) (≥3.0)
### Hardware Requirements
- CPU: 1+ cores per process
- Memory: 4 GB minimum recommended
- Storage: ~2 GB for Docker image
## Installation
### 1. Clone or Download the Pipeline
```bash
# Create pipeline directory
mkdir -p /path/to/prodigy_pipeline
cd /path/to/prodigy_pipeline
# Copy pipeline files (Dockerfile, main.nf, nextflow.config, params.json)
```
### 2. Build the Docker Image
```bash
docker build -t prodigy:latest .
```
### 3. Verify Installation
```bash
# Test Docker image
docker run --rm prodigy:latest prodigy --help
# Test Nextflow
nextflow run main.nf --help
```
## Usage
### Basic Usage
```bash
# Run on a single PDB file
nextflow run main.nf --pdb /path/to/complex.pdb --outdir /path/to/output
# Run on multiple PDB files
nextflow run main.nf --pdb '/path/to/structures/*.pdb' --outdir /path/to/output
```
### With Custom Parameters
```bash
nextflow run main.nf \
--pdb '/path/to/structures/*.pdb' \
--outdir /path/to/output \
--distance_cutoff 5.5 \
--acc_threshold 0.05 \
--temperature 37.0 \
--contact_list true \
--pymol_selection true
```
### Chain Selection for Complex Interfaces
For antibody-antigen complexes or multi-chain proteins:
```bash
# Contacts between chains A and B only
nextflow run main.nf --pdb complex.pdb --selection 'A B'
# Heavy (H) and Light (L) chains as one molecule vs Antigen (A)
nextflow run main.nf --pdb antibody_antigen.pdb --selection 'H,L A'
# Three-way interface calculation
nextflow run main.nf --pdb complex.pdb --selection 'A B C'
```
### Using Singularity
```bash
nextflow run main.nf -profile singularity --pdb /path/to/complex.pdb
```
## Parameters
### Required Parameters
| Parameter | Description | Default |
|-----------|-------------|---------|
| `--pdb` | Path to input PDB/mmCIF file(s). Supports glob patterns. | `s3://omic/eureka/prodigy/input/*.pdb` |
| `--outdir` | Output directory for results | `s3://omic/eureka/prodigy/output` |
### Analysis Parameters
| Parameter | Description | Default | Range |
|-----------|-------------|---------|-------|
| `--distance_cutoff` | Distance threshold (Å) for defining intermolecular contacts | `5.5` | 1.0 - 20.0 |
| `--acc_threshold` | Relative accessibility threshold for surface residue identification | `0.05` | 0.0 - 1.0 |
| `--temperature` | Temperature (°C) for Kd calculation | `25.0` | -273.15 - 100.0 |
| `--selection` | Chain selection for interface calculation | `''` (all chains) | See examples |
### Output Control Parameters
| Parameter | Description | Default |
|-----------|-------------|---------|
| `--contact_list` | Generate detailed contact list file | `false` |
| `--pymol_selection` | Generate PyMOL visualization script | `false` |
| `--quiet` | Output only affinity values (minimal output) | `false` |
## Output Files
### Standard Output
For each input structure `<name>.pdb`, the pipeline generates:
| File | Description |
|------|-------------|
| `<name>_prodigy.txt` | Main results file with binding affinity prediction |
### Optional Output (when enabled)
| File | Description | Parameter |
|------|-------------|-----------|
| `<name>_contacts.txt` | List of all interface contacts | `--contact_list true` |
| `<name>_interface.pml` | PyMOL script for interface visualization | `--pymol_selection true` |
### Example Output
```
[!] Structure contains gaps:
E ILE16 < Fragment 0 > E ALA183
E TYR184 < Fragment 1 > E GLY187
[+] Executing 1 task(s) in total
##########################################
[+] Processing structure 1ppe_model0
[+] No. of intermolecular contacts: 86
[+] No. of charged-charged contacts: 5.0
[+] No. of charged-polar contacts: 10.0
[+] No. of charged-apolar contacts: 27.0
[+] No. of polar-polar contacts: 0.0
[+] No. of apolar-polar contacts: 20.0
[+] No. of apolar-apolar contacts: 24.0
[+] Percentage of apolar NIS residues: 34.10
[+] Percentage of charged NIS residues: 18.50
[++] Predicted binding affinity (kcal.mol-1): -14.7
[++] Predicted dissociation constant (M) at 25.0˚C: 1.6e-11
```
### Output Interpretation
| Metric | Description |
|--------|-------------|
| **Intermolecular contacts** | Total number of residue-residue contacts at interface |
| **Contact types** | Breakdown by residue character (charged/polar/apolar) |
| **NIS residues** | Composition of non-interacting surface |
| **Binding affinity (ΔG)** | Predicted free energy of binding (kcal/mol). More negative = stronger binding |
| **Dissociation constant (Kd)** | Predicted Kd at specified temperature. Lower = tighter binding |
### Binding Affinity Scale
| ΔG (kcal/mol) | Kd (M) | Binding Strength |
|---------------|--------|------------------|
| -6 to -8 | 10⁻⁵ to 10⁻⁶ | Moderate |
| -8 to -10 | 10⁻⁶ to 10⁻⁷ | Strong |
| -10 to -12 | 10⁻⁷ to 10⁻⁹ | Very Strong |
| < -12 | < 10⁻⁹ | Extremely Strong |
## Test Data
Download example protein complexes from the RCSB PDB:
```bash
# Create input directory
mkdir -p /mnt/OmicNAS/private/old/olamide/Prodigy/input
# Download test structures
wget -O /mnt/OmicNAS/private/old/olamide/Prodigy/input/3bzd.pdb https://files.rcsb.org/download/3BZD.pdb
wget -O /mnt/OmicNAS/private/old/olamide/Prodigy/input/2oob.pdb https://files.rcsb.org/download/2OOB.pdb
wget -O /mnt/OmicNAS/private/old/olamide/Prodigy/input/1ppe.pdb https://files.rcsb.org/download/1PPE.pdb
```
### Expected Results
| Structure | Description | Expected ΔG (kcal/mol) |
|-----------|-------------|------------------------|
| 3BZD | Protein-protein complex | -9.4 |
| 2OOB | Protein-protein complex | -6.2 |
| 1PPE | Trypsin-inhibitor complex | -14.7 |
## Pipeline Structure
```
prodigy_pipeline/
├── Dockerfile # Docker image definition
├── main.nf # Nextflow pipeline script
├── nextflow.config # Pipeline configuration
├── params.json # Parameter documentation
└── README.md # This file
```
## Docker Image Details
The Docker image is based on Python 3.12 and includes:
- **prodigy-prot** (v2.4.0) - Main PRODIGY package
- **biopython** (≥1.80) - PDB structure parsing
- **freesasa** (≥2.2.1) - Solvent accessible surface area calculation
- **numpy** (≥2) - Numerical computations
### Building the Image
```bash
docker build -t prodigy:latest .
```
### Running Standalone
```bash
# Run PRODIGY directly
docker run --rm -v /path/to/data:/data prodigy:latest prodigy /data/complex.pdb
# Get help
docker run --rm prodigy:latest prodigy --help
```
## Troubleshooting
### Common Issues
**1. Docker Hub Rate Limit Error**
```
ERROR: toomanyrequests: You have reached your pull rate limit
```
Solution: Log in to Docker Hub with `docker login` or wait and retry.
**2. Structure Contains Gaps Warning**
```
[!] Structure contains gaps
```
This is informational, not an error. PRODIGY handles missing residues automatically.
**3. No Intermolecular Contacts Found**
- Verify the structure contains multiple chains
- Check chain selection parameters
- Ensure chains are in contact (within distance cutoff)
**4. Permission Denied Errors**
```bash
# Run with user permissions
docker run --rm -u $(id -u):$(id -g) -v /path/to/data:/data prodigy:latest prodigy /data/complex.pdb
```
### Getting Help
```bash
# PRODIGY help
docker run --rm prodigy:latest prodigy --help
# Nextflow pipeline help
nextflow run main.nf --help
```
## Citation
If you use this pipeline, please cite the following publications:
### PRODIGY Method
1. **Xue LC, Rodrigues JP, Kastritis PL, Bonvin AM, Vangone A.** (2016)
PRODIGY: a web server for predicting the binding affinity of protein-protein complexes.
*Bioinformatics*, 32(23):3676-3678.
[DOI: 10.1093/bioinformatics/btw514](https://doi.org/10.1093/bioinformatics/btw514)
2. **Vangone A, Bonvin AM.** (2015)
Contacts-based prediction of binding affinity in protein-protein complexes.
*eLife*, 4:e07454.
[DOI: 10.7554/eLife.07454](https://doi.org/10.7554/eLife.07454)
3. **Kastritis PL, Rodrigues JP, Folkers GE, Boelens R, Bonvin AM.** (2014)
Proteins feel more than they see: Fine-tuning of binding affinity by properties of the non-interacting surface.
*Journal of Molecular Biology*, 426(14):2632-2652.
[DOI: 10.1016/j.jmb.2014.04.017](https://doi.org/10.1016/j.jmb.2014.04.017)
### Software Dependencies
- **Nextflow**: Di Tommaso P, et al. (2017) Nextflow enables reproducible computational workflows. *Nature Biotechnology*, 35:316-319.
- **Biopython**: Cock PJ, et al. (2009) Biopython: freely available Python tools for computational molecular biology and bioinformatics. *Bioinformatics*, 25(11):1422-1423.
- **FreeSASA**: Mitternacht S. (2016) FreeSASA: An open source C library for solvent accessible surface area calculations. *F1000Research*, 5:189.
## License
This pipeline is distributed under the Apache License 2.0, consistent with the PRODIGY software license.
## Links
- **PRODIGY Web Server**: [https://wenmr.science.uu.nl/prodigy/](https://wenmr.science.uu.nl/prodigy/)
- **PRODIGY GitHub**: [https://github.com/haddocking/prodigy](https://github.com/haddocking/prodigy)
- **BonvinLab**: [https://www.bonvinlab.org/](https://www.bonvinlab.org/)
- **Nextflow**: [https://www.nextflow.io/](https://www.nextflow.io/)
## Support
For questions about:
- **PRODIGY method**: Contact the BonvinLab team at [ask.bioexcel.eu](https://ask.bioexcel.eu/)
- **This pipeline**: Open an issue in the repository
---
*Pipeline version: 2.4.0 | Last updated: January 2026*

13
examples/3BZD.ic_model Normal file
View File

@@ -0,0 +1,13 @@
[+] Reading structure file: /Users/joao/software/binding_affinity/examples/3BZD.pdb
[+] Parsed structure file 3BZD (2 chains, 343 residues)
[+] No. of intermolecular contacts: 51
[+] No. of charged-charged contacts: 4
[+] No. of charged-polar contacts: 7
[+] No. of charged-apolar contacts: 6
[+] No. of polar-polar contacts: 7
[+] No. of apolar-polar contacts: 15
[+] No. of apolar-apolar contacts: 12
[+] Percentage of apolar NIS residues: 29.48
[+] Percentage of charged NIS residues: 29.48
[++] Predicted binding affinity (kcal.mol-1): -9.373
[++] Predicted dissociation constant (M): 1.333e-07

2754
examples/3BZD.pdb Normal file

File diff suppressed because it is too large Load Diff

4727
examples/3bzd.cif Normal file

File diff suppressed because it is too large Load Diff

74
main.nf Normal file
View File

@@ -0,0 +1,74 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl=2
// Default parameters
params.pdb = 's3://omic/eureka/prodigy/input/*.pdb'
params.outdir = 's3://omic/eureka/prodigy/output'
params.distance_cutoff = 5.5
params.acc_threshold = 0.05
params.temperature = 25.0
params.selection = ''
params.contact_list = false
params.pymol_selection = false
params.quiet = false
// =============================================================================
// Process: PRODIGY
// Predicts binding affinity using intermolecular contacts
// =============================================================================
process PRODIGY {
    container 'harbor.cluster.omic.ai/omic/prodigy:latest'
    publishDir params.outdir, mode: 'copy'
    // Copy (not symlink) the input so prodigy's side-output files
    // (<stem>.ic / <stem>.pml) land inside the task work directory.
    stageInMode 'copy'

    input:
    path pdb

    output:
    path "${pdb.baseName}_prodigy.txt", emit: results
    path "${pdb.baseName}_contacts.txt", optional: true, emit: contacts
    path "${pdb.baseName}_interface.pml", optional: true, emit: pymol

    script:
    """
    # Without pipefail a prodigy failure would be masked by tee's exit status.
    set -o pipefail

    prodigy \\
        ${pdb} \\
        --distance-cutoff ${params.distance_cutoff} \\
        --acc-threshold ${params.acc_threshold} \\
        --temperature ${params.temperature} \\
        ${params.selection ? '--selection ' + params.selection : ''} \\
        ${params.contact_list ? '--contact_list' : ''} \\
        ${params.pymol_selection ? '--pymol_selection' : ''} \\
        ${params.quiet ? '--quiet' : ''} \\
        2>&1 | tee ${pdb.baseName}_prodigy.txt

    # The CLI writes the contact list to <stem>.ic (struct_path.with_suffix(".ic")),
    # not <stem>.contacts; rename it to match the declared output.
    if [ -f "${pdb.baseName}.ic" ]; then
        mv ${pdb.baseName}.ic ${pdb.baseName}_contacts.txt
    fi

    # Rename PyMOL script if generated
    if [ -f "${pdb.baseName}.pml" ]; then
        mv ${pdb.baseName}.pml ${pdb.baseName}_interface.pml
    fi
    """
}
// =============================================================================
// Workflow
// Entry point: fan each staged structure file out to one PRODIGY task.
// =============================================================================
workflow {
    // Validate input: params.pdb must be set (defaults to the S3 glob above).
    if (!params.pdb) {
        error "ERROR: Please provide input PDB file(s) using --pdb parameter"
    }

    // Create input channel; checkIfExists makes an empty glob fail fast.
    pdb_ch = Channel.fromPath(params.pdb, checkIfExists: true)

    // Run PRODIGY once per input structure
    PRODIGY(pdb_ch)
}

71
nextflow.config Normal file
View File

@@ -0,0 +1,71 @@
// =============================================================================
// PRODIGY Nextflow Pipeline Configuration
// Protein binding affinity prediction from structural data
// =============================================================================
// Manifest for Nextflow metadata
manifest {
name = 'PRODIGY-Nextflow'
author = 'Olamide'
homePage = 'https://trs-gitea.cluster.omic.ai/omic/prodigy'
description = 'Nextflow pipeline for PRODIGY - Protein binding affinity prediction based on intermolecular contacts'
mainScript = 'main.nf'
version = '2.4.0'
}
// Global default parameters
params {
pdb = 's3://omic/eureka/prodigy/input/*.pdb'
outdir = 's3://omic/eureka/prodigy/output'
distance_cutoff = 5.5
acc_threshold = 0.05
temperature = 25.0
selection = ''
contact_list = false
pymol_selection = false
quiet = false
}
// Container configurations
docker {
enabled = true
runOptions = '-u $(id -u):$(id -g)'
}
// Process configurations
process {
cpus = 1
memory = '4 GB'
container = 'harbor.cluster.omic.ai/omic/prodigy:latest'
}
// Execution configurations
executor {
$local {
cpus = 4
memory = '8 GB'
}
}
// Profiles for different execution environments
profiles {
standard {
docker.enabled = true
}
k8s {
docker.enabled = true
process.container = 'harbor.cluster.omic.ai/omic/prodigy:latest'
}
    k8s_gpu {
        // NOTE(review): currently identical to the `k8s` profile — no
        // GPU-specific settings (e.g. containerOptions/accelerator) are
        // applied here; confirm whether that is intended.
        docker.enabled = true
        process.container = 'harbor.cluster.omic.ai/omic/prodigy:latest'
    }
singularity {
singularity.enabled = true
singularity.autoMounts = true
docker.enabled = false
}
}

157
params.json Normal file
View File

@@ -0,0 +1,157 @@
{
"params": {
"pdb": {
"type": "file",
"description": "Path to input PDB or mmCIF structure file(s) for binding affinity prediction",
"default": "s3://omic/eureka/prodigy/input/*.pdb",
"required": true,
"pipeline_io": "input",
"var_name": "params.pdb",
"examples": [
"s3://omic/eureka/prodigy/input/3bzd.pdb",
"s3://omic/eureka/prodigy/input/*.pdb"
],
"pattern": ".*\\.(pdb|cif)$",
"enum": [],
"validation": {},
"notes": "Input protein-protein complex structure in PDB or mmCIF format. Can be a single file or glob pattern for batch processing."
},
"outdir": {
"type": "folder",
"description": "Directory for PRODIGY prediction results",
"default": "s3://omic/eureka/prodigy/output",
"required": true,
"pipeline_io": "output",
"var_name": "params.outdir",
"examples": [
"s3://omic/eureka/prodigy/output",
"s3://omic/eureka/prodigy/custom_output"
],
"pattern": ".*",
"enum": [],
"validation": {},
"notes": "Directory where prediction results will be stored. Created if it does not exist."
},
"distance_cutoff": {
"type": "float",
"description": "Distance cutoff (Angstrom) for calculating intermolecular contacts",
"default": 5.5,
"required": false,
"pipeline_io": "parameter",
"var_name": "params.distance_cutoff",
"examples": [
5.5,
4.0,
6.0
],
"pattern": null,
"enum": [],
"validation": {
"min": 1.0,
"max": 20.0
},
"notes": "Default value of 5.5 Angstrom was optimized in Vangone & Bonvin (2015) eLife. This threshold includes different non-bonded interactions including salt bridges."
},
"acc_threshold": {
"type": "float",
"description": "Accessibility threshold for buried surface area (BSA) analysis",
"default": 0.05,
"required": false,
"pipeline_io": "parameter",
"var_name": "params.acc_threshold",
"examples": [
0.05,
0.1
],
"pattern": null,
"enum": [],
"validation": {
"min": 0.0,
"max": 1.0
},
"notes": "Relative accessibility threshold used to identify surface residues for non-interacting surface (NIS) calculations."
},
"temperature": {
"type": "float",
"description": "Temperature (Celsius) for dissociation constant (Kd) prediction",
"default": 25.0,
"required": false,
"pipeline_io": "parameter",
"var_name": "params.temperature",
"examples": [
25.0,
37.0,
4.0
],
"pattern": null,
"enum": [],
"validation": {
"min": -273.15,
"max": 100.0
},
"notes": "Temperature used to convert predicted binding free energy (deltaG) to dissociation constant (Kd)."
},
"selection": {
"type": "string",
"description": "Chain selection for interface calculation",
"default": "",
"required": false,
"pipeline_io": "parameter",
"var_name": "params.selection",
"examples": [
"A B",
"A,B C",
"H,L A"
],
"pattern": null,
"enum": [],
"validation": {},
"notes": "Specify chains to consider for binding affinity calculation. Format: 'A B' calculates contacts between chains A and B. 'A,B C' treats chains A and B as one molecule interacting with chain C. Useful for antibody-antigen complexes where heavy and light chains should be grouped."
},
"contact_list": {
"type": "boolean",
"description": "Output list of intermolecular contacts",
"default": false,
"required": false,
"pipeline_io": "parameter",
"var_name": "params.contact_list",
"examples": [
true,
false
],
"enum": [true, false],
"validation": {},
"notes": "When enabled, outputs a detailed list of all residue-residue contacts at the interface."
},
"pymol_selection": {
"type": "boolean",
"description": "Output PyMOL script to visualize interface",
"default": false,
"required": false,
"pipeline_io": "parameter",
"var_name": "params.pymol_selection",
"examples": [
true,
false
],
"enum": [true, false],
"validation": {},
"notes": "When enabled, generates a PyMOL script (.pml) to highlight interface residues for visualization."
},
"quiet": {
"type": "boolean",
"description": "Output only predicted affinity values",
"default": false,
"required": false,
"pipeline_io": "parameter",
"var_name": "params.quiet",
"examples": [
true,
false
],
"enum": [true, false],
"validation": {},
"notes": "When enabled, outputs only the predicted binding affinity value without detailed analysis. Useful for batch processing and downstream parsing."
}
}
}

45
pyproject.toml Normal file
View File

@@ -0,0 +1,45 @@
[project]
name = "prodigy-prot"
license = "Apache-2.0"
version = "2.4.0"
description = "PROtein binDIng enerGY prediction"
authors = [
{ name = "Anna Vangone" },
{ name = "Joao Rodrigues" },
{ name = "Joerg Schaarschmidt" },
]
maintainers = [{ name = "BonvinLab", email = "bonvinlab.support@uu.nl" }]
readme = "README.md"
classifiers = [
"Development Status :: 5 - Production/Stable",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Topic :: Scientific/Engineering :: Chemistry",
"Topic :: Scientific/Engineering :: Bio-Informatics",
]
dependencies = ["biopython>=1.80", "freesasa>=2.2.1", "numpy>=2"]
[project.optional-dependencies]
dev = ["pytest", "coverage", "hypothesis", "pytest-cov", "mypy"]
[project.scripts]
prodigy = "prodigy_prot.cli:main"
# NOTE(review): the build backend is hatchling (see [build-system] below), so
# this [tool.setuptools] table is ignored at build time. Package selection
# should instead be configured via [tool.hatch.build.targets.wheel]
# (e.g. packages = ["src/prodigy_prot"]).
[tool.setuptools]
include-package-data = true
packages = ["src"]
[tool.pytest.ini_options]
pythonpath = ["src"]
markers = ["integration: marks tests as integration tests"]
[tool.mypy]
disable_error_code = ["import-not-found"]
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

View File

@@ -0,0 +1,3 @@
from pathlib import Path
NACCESS_CONFIG = Path(Path(__file__).parents[0], "data/naccess.config")

199
src/prodigy_prot/cli.py Normal file
View File

@@ -0,0 +1,199 @@
"""
Binding affinity predictor based on Intermolecular Contacts (ICs).
"""
import argparse
import logging
import sys
from argparse import RawTextHelpFormatter
from concurrent.futures import ProcessPoolExecutor, as_completed
from io import StringIO
from pathlib import Path
from Bio.PDB.Model import Model
from prodigy_prot.modules.parsers import parse_structure
from prodigy_prot.modules.prodigy import Prodigy
# setup logging
logging.basicConfig(level=logging.INFO, stream=sys.stdout, format="%(message)s")
log = logging.getLogger("Prodigy")
ap = argparse.ArgumentParser(description=__doc__, formatter_class=RawTextHelpFormatter)
ap.add_argument(
"input_path",
help="Path to either: \n- Structure in PDB or mmCIF format\n- Directory containing structure files",
)
ap.add_argument(
"--distance-cutoff",
type=float,
default=5.5,
help="Distance cutoff to calculate ICs",
)
ap.add_argument(
"--acc-threshold",
type=float,
default=0.05,
help="Accessibility threshold for BSA analysis",
)
ap.add_argument(
"--temperature",
type=float,
default=25.0,
help="Temperature (C) for Kd prediction",
)
ap.add_argument("--contact_list", action="store_true", help="Output a list of contacts")
ap.add_argument(
"--pymol_selection",
action="store_true",
help="Output a script to highlight the interface (pymol)",
)
ap.add_argument(
"-q",
"--quiet",
action="store_true",
help="Outputs only the predicted affinity value",
)
ap.add_argument(
"-s",
"--showall",
action="store_true",
help="Outputs all original prodigy features but BSA (mutually exclusive with `-q`)",
)
ap.add_argument(
"-np",
"--number-of-processors",
type=int,
action="store",
help="Number of processors to use (default: 1)",
default=1,
)
_co_help = """
By default, all intermolecular contacts are taken into consideration,
a molecule being defined as an isolated group of amino acids sharing
a common chain identifier. In specific cases, for example
antibody-antigen complexes, some chains should be considered as a
single molecule.
Use the --selection option to provide collections of chains that should
be considered for the calculation. Separate by a space the chains that
are to be considered _different_ molecules. Use commas to include multiple
chains as part of a single group:
--selection A B => Contacts calculated (only) between chains A and B.
--selection A,B C => Contacts calculated (only) between \
chains A and C; and B and C.
--selection A B C => Contacts calculated (only) between \
chains A and B; B and C; and A and C.
"""
sel_opt = ap.add_argument_group("Selection Options", description=_co_help)
sel_opt.add_argument("--selection", nargs="+", metavar=("A B", "A,B C"))
def main():
    """
    CLI entry point.

    Parses arguments, collects structure files (a single file or every
    .pdb/.cif/.ent in a directory), runs one PRODIGY prediction per model
    (optionally in parallel), and prints the captured outputs in a
    deterministic order. Exits with status 1 on invalid input.
    """
    args = ap.parse_args()
    # --quiet suppresses informational logging; errors are always shown.
    # (The original set this level twice; once is enough.)
    log.setLevel(logging.ERROR if args.quiet else logging.INFO)
    if args.quiet and args.showall:
        log.error("Error: --quiet (-q) and --showall (-s) are mutually exclusive arguments")
        sys.exit(1)
    struct_path = Path(args.input_path)
    input_list = []
    if struct_path.is_file():
        input_list.append(struct_path)
    elif struct_path.is_dir():
        # Only pick up supported structure extensions from the directory.
        for input_f in struct_path.glob("*"):
            if Path(input_f).suffix in [".pdb", ".cif", ".ent"]:
                input_list.append(input_f)
    elif not struct_path.exists():
        log.error(f"File {struct_path} does not exist")
        sys.exit(1)
    else:
        log.error(f"Input path {struct_path} is neither a valid file nor a directory")
        sys.exit(1)

    # Collect one task per (file, model) pair.
    tasks = []
    for input_f in input_list:
        models, _, _ = parse_structure(str(input_f))
        # Use a distinct name here; the original reused `struct_path`,
        # shadowing the top-level input path.
        file_path = Path(input_f)
        for model in models:
            identifier = f"{file_path.stem}_model{model.id}"
            tasks.append((model, identifier, args, file_path))

    total_tasks = len(tasks)
    if total_tasks == 0:
        log.error("No valid structures found")
        sys.exit(1)

    # Never spawn more workers than there are tasks.
    max_workers = min(args.number_of_processors, total_tasks)
    log.info(f"[+] Executing {total_tasks} task(s) in total")
    if max_workers != args.number_of_processors:
        log.info("[+] Adjusting number of processors based on number of tasks")
        log.info(
            f"[+] Using {max_workers} processor(s) instead of {args.number_of_processors}"
        )

    # Execute and collect (identifier, model_id, captured_output) triples.
    results = []
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_model, *task) for task in tasks]
        for future in as_completed(futures):
            try:
                results.append(future.result())
            except Exception as e:
                log.error(f"Error processing model: {e}")

    # Deterministic output order: by identifier, then model id.
    results.sort(key=lambda x: (x[0], x[1]))
    for identifier, _, output in results:
        print(output, end="")
def process_model(model: Model, identifier: str, args: argparse.Namespace, struct_path):
    """
    Run a PRODIGY prediction for a single model.

    Stdout produced during the prediction is captured (so parallel workers
    do not interleave their output) and returned to the parent process.

    :param model: Biopython model to analyse.
    :param identifier: label used in logging/output ("<stem>_model<N>").
    :param args: parsed CLI arguments.
    :param struct_path: path of the originating structure file; side-output
        files (.ic / .pml) are written next to it.
    :return: (identifier, model.id, captured stdout text).
    """
    from contextlib import redirect_stdout

    output_buffer = StringIO()
    # redirect_stdout guarantees sys.stdout is restored even on error,
    # replacing the manual swap/finally dance of the original.
    with redirect_stdout(output_buffer):
        if not args.quiet:
            print("#" * 42)
            print(f"[+] Processing structure {identifier}")
        prodigy = Prodigy(
            model=model,
            name=identifier,
            selection=args.selection,
            temp=args.temperature,
        )
        prodigy.predict(
            distance_cutoff=args.distance_cutoff, acc_threshold=args.acc_threshold
        )
        prodigy.print_prediction(quiet=args.quiet, showall=args.showall)

    # Side-output files are only written after a successful prediction;
    # previously a failure inside the try block could leave `prodigy` unbound.
    # NOTE(review): for multi-model files every model writes to the same
    # <stem>.ic / <stem>.pml path, so later models overwrite earlier ones —
    # confirm this is intended.
    if args.contact_list:
        contact_list_f = struct_path.with_suffix(".ic")
        prodigy.print_contacts(outfile=str(contact_list_f))
    if args.pymol_selection:
        pymol_script_f = struct_path.with_suffix(".pml")
        prodigy.print_pymol_script(outfile=str(pymol_script_f))
    return identifier, model.id, output_buffer.getvalue()
if __name__ == "__main__":
sys.exit(main())

View File

@@ -0,0 +1,256 @@
# Contributed by João Rodrigues
name: NACCESS
types:
C_ALI 1.87 apolar
C_CAR 1.76 apolar
C_NUC 1.80 apolar
N_AMN 1.50 polar
N_AMD 1.65 polar
N_NUC 1.60 polar
O 1.40 polar
S 1.85 apolar
SE 1.80 apolar
P 1.90 apolar
atoms:
ANY C C_CAR
ANY O O
ANY CA C_ALI
ANY N N_AMD
ANY CB C_ALI
ANY OXT O
# nucleic acid
ANY P P
ANY OP1 O
ANY OP2 O
ANY OP3 O
ANY O5' O
ANY O4' O
ANY O3' O
ANY O2' O
ANY C5' C_NUC
ANY C4' C_NUC
ANY C3' C_NUC
ANY C2' C_NUC
ANY C1' C_NUC
ALA CB C_ALI # included so that RSA values will be generated
ARG CG C_ALI
ARG CD C_ALI
ARG NE N_AMD
ARG CZ C_CAR
ARG NH1 N_AMD
ARG NH2 N_AMD
ASN CG C_CAR
ASN OD1 O
ASN ND2 N_AMD
ASP CG C_CAR
ASP OD1 O
ASP OD2 O
CYS SG S
GLN CG C_ALI
GLN CD C_CAR
GLN OE1 O
GLN NE2 N_AMD
GLU CG C_ALI
GLU CD C_CAR
GLU OE1 O
GLU OE2 O
GLY CA C_ALI # included so that RSA values will be generated
HIS CG C_CAR
HIS ND1 N_AMD
HIS CD2 C_CAR
HIS NE2 N_AMD
HIS CE1 C_CAR
ILE CG1 C_ALI
ILE CG2 C_ALI
ILE CD1 C_ALI
LEU CG C_ALI
LEU CD1 C_ALI
LEU CD2 C_ALI
LYS CG C_ALI
LYS CD C_ALI
LYS CE C_ALI
LYS NZ N_AMN
MET CG C_ALI
MET SD S
MET CE C_ALI
PHE CG C_CAR
PHE CD1 C_CAR
PHE CD2 C_CAR
PHE CE1 C_CAR
PHE CE2 C_CAR
PHE CZ C_CAR
PRO CG C_ALI
PRO CD C_ALI
SEC SE SE
SER OG O
THR OG1 O
THR CG2 C_ALI
TRP CG C_CAR
TRP CD1 C_CAR
TRP CD2 C_CAR
TRP NE1 N_AMD
TRP CE2 C_CAR
TRP CE3 C_CAR
TRP CZ2 C_CAR
TRP CZ3 C_CAR
TRP CH2 C_CAR
TYR CG C_CAR
TYR CD1 C_CAR
TYR CD2 C_CAR
TYR CE1 C_CAR
TYR CE2 C_CAR
TYR CZ C_CAR
TYR OH O
VAL CG1 C_ALI
VAL CG2 C_ALI
A N9 N_NUC
A C8 C_NUC
A N7 N_NUC
A C5 C_NUC
A C6 C_NUC
A N6 N_NUC
A N1 N_NUC
A C2 C_NUC
A N3 N_NUC
A C4 C_NUC
C N1 N_NUC
C C2 C_NUC
C O2 O
C N3 N_NUC
C C4 C_NUC
C N4 N_NUC
C C5 C_NUC
C C6 C_NUC
G N9 N_NUC
G C8 C_NUC
G N7 N_NUC
G C5 C_NUC
G C6 C_NUC
G O6 O
G N1 N_NUC
G C2 C_NUC
G N2 N_NUC
G N3 N_NUC
G C4 C_NUC
I N9 N_NUC
I C8 C_NUC
I N7 N_NUC
I C5 C_NUC
I C6 C_NUC
I O6 O
I N1 N_NUC
I C2 C_NUC
I N3 N_NUC
I C4 C_NUC
T N1 N_NUC
T C2 C_NUC
T O2 O
T N3 N_NUC
T C4 C_NUC
T O4 O
T C5 C_NUC
T C7 C_NUC
T C6 C_NUC
U N1 N_NUC
U C2 C_NUC
U O2 O
U N3 N_NUC
U C4 C_NUC
U O4 O
U C5 C_NUC
U C6 C_NUC
DA N9 N_NUC
DA C8 C_NUC
DA N7 N_NUC
DA C5 C_NUC
DA C6 C_NUC
DA N6 N_NUC
DA N1 N_NUC
DA C2 C_NUC
DA N3 N_NUC
DA C4 C_NUC
DC N1 N_NUC
DC C2 C_NUC
DC O2 O
DC N3 N_NUC
DC C4 C_NUC
DC N4 N_NUC
DC C5 C_NUC
DC C6 C_NUC
DG N9 N_NUC
DG C8 C_NUC
DG N7 N_NUC
DG C5 C_NUC
DG C6 C_NUC
DG O6 O
DG N1 N_NUC
DG C2 C_NUC
DG N2 N_NUC
DG N3 N_NUC
DG C4 C_NUC
DI N9 N_NUC
DI C8 C_NUC
DI N7 N_NUC
DI C5 C_NUC
DI C6 C_NUC
DI O6 O
DI N1 N_NUC
DI C2 C_NUC
DI N3 N_NUC
DI C4 C_NUC
DT N1 N_NUC
DT C2 C_NUC
DT O2 O
DT N3 N_NUC
DT C4 C_NUC
DT O4 O
DT C5 C_NUC
DT C7 C_NUC
DT C6 C_NUC
DU N1 N_NUC
DU C2 C_NUC
DU O2 O
DU N3 N_NUC
DU C4 C_NUC
DU O4 O
DU C5 C_NUC
DU C6 C_NUC

View File

View File

@@ -0,0 +1,148 @@
"""
Generic properties of amino acids required for the binding affinity
prediction methods.
"""
aa_character_ic: dict[str, str] = {
"ALA": "A",
"CYS": "A", # ?
"GLU": "C",
"ASP": "C",
"GLY": "A",
"PHE": "A",
"ILE": "A",
"HIS": "C",
"LYS": "C",
"MET": "A",
"LEU": "A",
"ASN": "P",
"GLN": "P",
"PRO": "A",
"SER": "P",
"ARG": "C",
"THR": "P",
"TRP": "A",
"VAL": "A",
"TYR": "A",
}
aa_character_protorp: dict[str, str] = {
"ALA": "A",
"CYS": "P",
"GLU": "C",
"ASP": "C",
"GLY": "A",
"PHE": "A",
"ILE": "A",
"HIS": "P",
"LYS": "C",
"MET": "A",
"LEU": "A",
"ASN": "P",
"GLN": "P",
"PRO": "A",
"SER": "P",
"ARG": "C",
"THR": "P",
"TRP": "P",
"VAL": "A",
"TYR": "P",
}
# Taken from pre-original prodigy code
# B for hydrophoBic
# Y for hydrophiLic
aa_character_hydro: dict[str, str] = {
"ALA": "B", #+
"CYS": "B", #+
"GLU": "L", #+
"ASP": "L", #+
"GLY": "L", # Glycine was B in my initial classification
"PHE": "B", #+
"ILE": "B", #+
"HIS": "L", #+
"LYS": "L", #+
"MET": "B", #+
"LEU": "B", #+
"ASN": "L", #+
"GLN": "L", #+
"PRO": "L", # Proline was B my initial classification
"SER": "L", #+
"ARG": "L", #+
"THR": "L", #+
"TRP": "L", #+
"VAL": "B", #+
"TYR": "L", #+
}
# Scaling factors for relative ASA
# Calculated using extended ALA-X-ALA peptides
# Taken from NACCESS
rel_asa: dict[str, dict[str, float]] = {
"total": {
"ALA": 107.95,
"CYS": 134.28,
"ASP": 140.39,
"GLU": 172.25,
"PHE": 199.48,
"GLY": 80.10,
"HIS": 182.88,
"ILE": 175.12,
"LYS": 200.81,
"LEU": 178.63,
"MET": 194.15,
"ASN": 143.94,
"PRO": 136.13,
"GLN": 178.50,
"ARG": 238.76,
"SER": 116.50,
"THR": 139.27,
"VAL": 151.44,
"TRP": 249.36,
"TYR": 212.76,
},
"bb": {
"ALA": 38.54,
"CYS": 37.53,
"ASP": 37.70,
"GLU": 37.51,
"PHE": 35.37,
"GLY": 47.77,
"HIS": 35.80,
"ILE": 37.16,
"LYS": 37.51,
"LEU": 37.51,
"MET": 37.51,
"ASN": 37.70,
"PRO": 16.23,
"GLN": 37.51,
"ARG": 37.51,
"SER": 38.40,
"THR": 37.57,
"VAL": 37.16,
"TRP": 38.10,
"TYR": 35.38,
},
"sc": {
"ALA": 69.41,
"CYS": 96.75,
"ASP": 102.69,
"GLU": 134.74,
"PHE": 164.11,
"GLY": 32.33,
"HIS": 147.08,
"ILE": 137.96,
"LYS": 163.30,
"LEU": 141.12,
"MET": 156.64,
"ASN": 106.24,
"PRO": 119.90,
"GLN": 140.99,
"ARG": 201.25,
"SER": 78.11,
"THR": 101.70,
"VAL": 114.28,
"TRP": 211.26,
"TYR": 177.38,
},
}

View File

@@ -0,0 +1,71 @@
"""
Functions to execute freesasa and parse its output.
"""
import os
import freesasa
from Bio.PDB.Model import Model
from Bio.PDB.Structure import Structure
from freesasa import Classifier, calc, structureFromBioPDB
from prodigy_prot import NACCESS_CONFIG
from prodigy_prot.modules.aa_properties import rel_asa
freesasa.setVerbosity(freesasa.nowarnings)
def execute_freesasa_api(model: Model) -> tuple[dict, dict]:
    """
    Calls freesasa using its Python API and returns
    per-residue accessibilities.

    :param model: Biopython model to analyse (wrapped in a Structure below).
    :return: (asa_data, rsa_data) where
        asa_data maps (chain, resname, resid, atname) -> absolute SASA, and
        rsa_data maps (chain, resname, resid) -> SASA relative to the
        NACCESS ALA-X-ALA reference value for that residue type.
    :raises Exception: when freesasa cannot classify the input atoms.
    """
    asa_data = {}
    # Keys are (chain label, residue name, residue number); freesasa's
    # residueNumber() returns a string, hence three str components.
    rsa_data: dict[tuple[str, str, str], float] = {}
    # Reference total-ASA per residue type (NACCESS scaling factors).
    _rsa: dict = rel_asa["total"]
    classifier = Classifier(str(NACCESS_CONFIG))
    # NOTE: `structureFromBioPDB` requires a Structure object
    # so here build one from a model
    s = Structure(model.id)
    s.add(model)
    try:
        struct = structureFromBioPDB(
            s,
            classifier,
        )
        result = calc(struct)
    except AssertionError as e:
        # freesasa signals unclassifiable atoms via AssertionError; re-raise
        # with a user-facing explanation.
        error_message = "" + os.linesep
        error_message += "[!] Error when running freesasa:" + os.linesep
        error_message += f"[!] {e}" + os.linesep
        error_message += (
            "[!] Make sure the atom names in your PDB file match"
            " the canonical naming and belong "
            "to default residues" + os.linesep
        )
        print(error_message)
        raise Exception(error_message)
    # iterate over all atoms to get SASA and residue name
    for idx in range(struct.nAtoms()):
        atname = struct.atomName(idx)
        resname = struct.residueName(idx)
        resid = struct.residueNumber(idx)
        chain = struct.chainLabel(idx)
        at_uid = (chain, resname, resid, atname)
        res_uid = (chain, resname, resid)
        asa = result.atomArea(idx)
        asa_data[at_uid] = asa
        # accumulate per-atom areas into the residue total
        rsa_data[res_uid] = rsa_data.get(res_uid, 0) + asa
    # convert total asa to relative asa (divide by the per-residue reference)
    rsa_data.update(
        (res_uid, asa / _rsa[res_uid[1]]) for res_uid, asa in rsa_data.items()
    )
    return asa_data, rsa_data

View File

@@ -0,0 +1,41 @@
"""
Models to predict binding affinity based on molecular properties.
"""
def IC_NIS(
    ic_cc: float,
    ic_ca: float,
    ic_pp: float,
    ic_pa: float,
    p_nis_a: float,
    p_nis_c: float,
) -> float:
    """
    Predicted binding affinity (kcal/mol) from the IC-NIS model: a linear
    combination of contact counts (charged-charged, charged-apolar,
    polar-polar, polar-apolar) and the apolar/charged NIS percentages,
    plus a constant offset.
    """
    # Terms are summed left-to-right, matching the original evaluation order.
    terms = (
        -0.09459 * ic_cc,
        -0.10007 * ic_ca,
        0.19577 * ic_pp,
        -0.22671 * ic_pa,
        0.18681 * p_nis_a,
        0.13810 * p_nis_c,
        -15.9433,
    )
    affinity = 0.0
    for term in terms:
        affinity += term
    return affinity
def NIS(p_nis_c: float, p_nis_p: float, n_int_atoms: float) -> float:
    """
    Predicted binding affinity from the NIS model: a linear combination of
    the polar and charged NIS percentages and the number of interface atoms,
    plus a constant offset.
    """
    # Terms are summed left-to-right, matching the original evaluation order.
    terms = (
        0.0856851248873 * p_nis_p,
        -0.0685254498746 * p_nis_c,
        0.0261591389985 * n_int_atoms,
        3.0124939659498,
    )
    affinity = 0.0
    for term in terms:
        affinity += term
    return affinity

View File

@@ -0,0 +1,187 @@
"""
Functions to read PDB/mmCIF files
"""
import logging
import sys
import typing
import warnings
from pathlib import Path
from typing import Optional, Union
from Bio.PDB.Atom import DisorderedAtom
from Bio.PDB.Chain import Chain
from Bio.PDB.MMCIFParser import MMCIFParser
from Bio.PDB.Model import Model
from Bio.PDB.PDBExceptions import PDBConstructionWarning
from Bio.PDB.PDBParser import PDBParser
from Bio.PDB.Polypeptide import PPBuilder, is_aa
from Bio.PDB.Structure import Structure
warnings.filterwarnings("ignore", category=PDBConstructionWarning)
log = logging.getLogger("Prodigy")
def get_parser(input_f: Path) -> Union[PDBParser, MMCIFParser]:
    """Pick the Biopython parser for a structure file: mmCIF for '.cif',
    the plain PDB parser for anything else."""
    is_mmcif = input_f.suffix == ".cif"
    return MMCIFParser() if is_mmcif else PDBParser()
def ignore(r):
    """True for residues whose hetfield starts with 'W' (water) or 'H'
    (HETATM, e.g. 'H_GLC') — these are stripped during structure cleaning."""
    hetfield_initial = r.id[0][0]
    return hetfield_initial in ("W", "H")
def validate_structure(
    input_strcture_obj: Structure,  # NOTE(review): parameter name has a typo ("strcture"); kept for keyword-arg compatibility
    selection: Optional[list[str]] = None,
    clean: bool = True,
) -> list[Model]:
    """
    Validate and clean every model of a parsed structure IN PLACE.

    Per model: keeps only selected chains (if a selection is given), collapses
    disordered atoms to their selected altloc, drops residues with insertion
    codes, optionally removes waters/HETATMs and hydrogens, and warns about
    chain gaps detected via polypeptide building.

    :param input_strcture_obj: parsed Biopython Structure (mutated in place).
    :param selection: chain groups, e.g. ["A,B", "C"]; every listed chain must
        exist in the structure.
    :param clean: when True, strip waters/HETATMs and hydrogens; raises on
        non-standard amino acids.
    :return: the list of (cleaned) models.
    :raises ValueError: on unknown selected chains or non-standard residues.
    """
    result: list[Model] = []
    for model in [m for m in input_strcture_obj.child_list]:
        # process selected chains
        chains: list[Chain] = list(model.get_chains())
        chain_ids = set([c.id for c in chains])
        if selection:
            sel_chains = []
            # Match selected chain with structure
            for sel in selection:
                for c_str in sel.split(","):
                    sel_chains.append(c_str)
                    if c_str not in chain_ids:
                        raise ValueError(
                            f"Selected chain not present in provided structure: {c_str}"
                        )

            # Remove unselected chains
            def _ignore_helper(x) -> bool:
                return x.id not in sel_chains

            for c in chains:
                if _ignore_helper(c):
                    if c.parent is not None:
                        c.parent.detach_child(c.id)
        # Double occupancy check: keep only the selected altloc of each
        # disordered atom and re-attach it as an ordinary atom.
        for atom in list(model.get_atoms()):
            if atom.is_disordered():
                atom = typing.cast(DisorderedAtom, atom)
                residue = atom.parent
                assert residue is not None
                sel_at = atom.selected_child
                assert sel_at is not None
                sel_at.altloc = " "
                sel_at.disordered_flag = 0
                residue.detach_child(atom.id)
                residue.add(sel_at)
        # Insertion code check: drop residues carrying an insertion code
        # (id[2] != " ") to keep numbering unambiguous.
        for c in chains:
            for residue in c.get_residues():
                if residue.get_id()[2] != " ":
                    c.detach_child(residue.id)
        if clean:
            # Remove HETATMs and solvent
            res_list = list(model.get_residues())
            for res in res_list:
                if ignore(res):
                    chain = res.parent
                    assert chain is not None
                    chain.detach_child(res.id)
                elif not is_aa(res, standard=True):
                    # Anything that is neither water/HETATM nor a standard
                    # amino acid is unsupported.
                    raise ValueError(
                        "Unsupported non-standard amino acid found: {0}".format(
                            res.resname
                        )
                    )
            # Remove Hydrogens
            atom_list = list(model.get_atoms())

            def _ignore(x):
                return x.element == "H"

            for atom in atom_list:
                if _ignore(atom):
                    residue = atom.parent
                    assert residue is not None
                    residue.detach_child(atom.name)
        # Detect gaps and compare with no. of chains: more peptide fragments
        # than chains implies at least one chain is broken.
        pep_builder = PPBuilder()
        peptides = pep_builder.build_peptides(model)
        n_peptides = len(peptides)
        if n_peptides != len(chain_ids):
            message = "[!] Structure contains gaps:\n"
            for i_pp, pp in enumerate(peptides):
                message += (
                    "\t{1.parent.id} {1.resname}{1.id[1]} < Fragment {0} > "
                    "{2.parent.id} {2.resname}{2.id[1]}\n".format(i_pp, pp[0], pp[-1])
                )
            log.warning(message)
        result.append(model)
    return result
def parse_structure(path: str) -> tuple[list[Model], int, int]:
    """
    Parse and validate a structure file.

    :param path: path to a .pdb/.cif/.ent file.
    :return: (validated models, number of chains, number of residues).
    :raises SystemExit: on unsupported extension, parser failure, or a
        structure containing no models.
    :raises ValueError: when the models disagree on chain or residue sets.
    """
    structure_path = Path(path)
    extension = structure_path.suffix
    supported_extensions = [".pdb", ".cif", ".ent"]
    if extension not in supported_extensions:
        log.error(
            f"[!] Structure format '{extension}' is "
            "not supported. Use '.pdb' or '.cif'."
        )
        sys.exit(1)
    parser = get_parser(structure_path)
    try:
        original_structure = parser.get_structure(structure_path.stem, structure_path)
    except Exception as e:
        log.exception(e)
        sys.exit(1)
    assert isinstance(original_structure, Structure)
    models: list[Model] = validate_structure(original_structure)
    # Guard: with no models, the chain/residue set checks below would
    # otherwise raise a bare IndexError.
    if not models:
        log.error("[!] No models found in structure")
        sys.exit(1)
    ## Make sure all models have the same chains
    # Get chain sets for all models
    chain_sets = [set(chain.id for chain in model.get_chains()) for model in models]
    # Check if all sets are identical
    if not all(chain_set == chain_sets[0] for chain_set in chain_sets):
        raise ValueError(
            "Not all models have the same chains. Found chain sets: "
            + ", ".join(str(s) for s in chain_sets)
        )
    # Same consistency requirement for residues.
    res_sets = [set(res.id for res in model.get_residues()) for model in models]
    if not all(res_set == res_sets[0] for res_set in res_sets):
        raise ValueError(
            "Not all models have the same residues. Found residue sets: "
            + ", ".join(str(s) for s in res_sets)
        )
    return (models, len(chain_sets[0]), len(res_sets[0]))

View File

@@ -0,0 +1,301 @@
import sys
from io import TextIOWrapper
from typing import Optional, TextIO, Union
from Bio.PDB.Model import Model
from Bio.PDB.NeighborSearch import NeighborSearch
#from Bio.PDB.Structure import Structure
from prodigy_prot.modules import aa_properties
from prodigy_prot.modules.freesasa_tools import execute_freesasa_api
from prodigy_prot.modules.models import IC_NIS
from prodigy_prot.modules.utils import dg_to_kd
def calculate_ic(
    model: Model, d_cutoff: float = 5.5, selection: Optional[dict[str, int]] = None
) -> list:
    """
    Calculates intermolecular contacts in a parsed struct object.

    :param model: Biopython model to search.
    :param d_cutoff: distance cutoff (presumably Angstrom, per the CLI help)
        within which two residues count as in contact.
    :param selection: optional chain-id -> group-index mapping; when given
        (and non-empty), only contacts between chains of *different* groups
        are kept.
    :return: sorted list of (residue, residue) contact pairs.
    :raises ValueError: when no contacts are found.
    """
    atom_list = list(model.get_atoms())
    # Residue-level ("R") all-vs-all neighbour search within the cutoff.
    ns = NeighborSearch(atom_list)
    all_list = ns.search_all(radius=d_cutoff, level="R")
    assert all_list is not None
    if selection:
        _sd = selection

        def _chain(x):
            # Chain id of a residue (its parent in the hierarchy).
            return x.parent.id

        # Keep pairs whose chains are both selected AND belong to
        # different selection groups (intra-group contacts are ignored).
        ic_list = [
            c
            for c in all_list
            if (_chain(c[0]) in _sd and _chain(c[1]) in _sd)
            and (_sd[_chain(c[0])] != _sd[_chain(c[1])])
        ]
    else:
        # No selection: any residue pair from two different chains counts.
        ic_list = [c for c in all_list if c[0].parent.id != c[1].parent.id]
    if not ic_list:
        raise ValueError("No contacts found for selection")
    ic_list.sort()
    return ic_list
def analyse_contacts(contact_list: list) -> dict[str, float]:
    """
    Tally contacts by the chemical character of the two residues.

    Each pair is classified twice: once with the charged/polar/apolar table
    (keys like "CC", "AP", ...) and once with the hydrophobicity table
    (keys "BB", "BL", "LL"). Pairs with an unknown residue name are skipped.

    :param contact_list: (residue, residue) pairs from `calculate_ic`.
    :return: dict of contact counts per two-letter pair type.
    """
    pair_types = ("AA", "PP", "CC", "AP", "CP", "AC", "LL", "BL", "BB")
    bins: dict[str, float] = dict.fromkeys(pair_types, 0.0)
    # Both classification passes share the same counting logic, so fold
    # over the two character tables instead of duplicating the loop.
    tables = (aa_properties.aa_character_ic, aa_properties.aa_character_hydro)
    for table in tables:
        for res_i, res_j in contact_list:
            char_i = table.get(res_i.resname)
            char_j = table.get(res_j.resname)
            if char_i is None or char_j is None:
                continue
            # Sort the two characters so e.g. "PA" and "AP" share one bin.
            bins["".join(sorted((char_i, char_j)))] += 1
    return bins
def analyse_nis(sasa_dict: dict, acc_threshold: float = 0.05) -> list[float]:
    """
    Returns the percentages of apolar, charged, and polar surface residues,
    according to an accessibility criterion.

    :param sasa_dict: (chain, resname, resid) -> relative accessibility, as
        produced by `execute_freesasa_api`.
    :param acc_threshold: minimum relative accessibility for a residue to
        count as surface.
    :return: [apolar%, charged%, polar%] — the order `predict` unpacks as
        (nis_a, nis_c, nis_p).
    :raises ValueError: when no residue passes the threshold (previously a
        bare ZeroDivisionError).
    """
    _data = aa_properties.aa_character_protorp
    _char_index = {"A": 0, "C": 1, "P": 2}
    count = [0, 0, 0]
    for res, rsa in sasa_dict.items():
        _, resn, _ = res
        if rsa >= acc_threshold:
            aa_index = _char_index.get(_data[resn])
            assert aa_index is not None
            count[aa_index] += 1
    # Hoisted: the original recomputed sum(count) inside the comprehension.
    total = sum(count)
    if total == 0:
        raise ValueError(
            "No surface residues found above accessibility threshold"
        )
    return [100.0 * x / total for x in count]
class Prodigy:
# init parameters
    def __init__(
        self,
        model: Model,
        name: str = "",
        selection: Optional[list[str]] = None,
        temp: float = 25.0,
    ):
        """
        Set up a binding-affinity prediction for one model.

        :param model: Biopython model holding the complex.
        :param name: identifier used when printing results.
        :param selection: chain groups (e.g. ["A,B", "C"]); defaults to one
            group per chain of the model.
        :param temp: temperature used later for the Kd conversion
            (Celsius, per the CLI help).
        """
        self.temp = float(temp)
        # Default selection: every chain in the model forms its own group.
        if selection is None:
            self.selection = [chain.id for chain in model.get_chains()]
        else:
            self.selection = selection
        self.model = model
        self.name = name
        # Result fields below are populated by predict().
        self.ic_network: list = []  # residue-residue contact pairs
        # Contact counts keyed by pair character:
        # A/C/P = apolar/charged/polar, B/L = hydrophoBic/hydrophiLic.
        self.bins: dict[str, float] = {
            "CC": 0.0,
            "CP": 0.0,
            "AC": 0.0,
            "PP": 0.0,
            "AP": 0.0,
            "AA": 0.0,
            "LL": 0.0,
            "BL": 0.0,
            "BB": 0.0
        }
        self.nis_a = 0.0  # % apolar NIS residues
        self.nis_c = 0.0  # % charged NIS residues
        self.nis_p = 0.0  # % polar NIS residues
        self.ba_val = 0.0  # predicted binding affinity (kcal.mol-1)
        self.kd_val = 0.0  # predicted dissociation constant (M)
    def predict(
        self,
        temp: Optional[float] = None,
        distance_cutoff: float = 5.5,
        acc_threshold: float = 0.05,
    ):
        """
        Run the full prediction pipeline: contacts -> contact classification
        -> SASA/NIS analysis -> IC-NIS affinity -> Kd conversion. Results are
        stored on the instance (ic_network, bins, nis_*, ba_val, kd_val).

        :param temp: optional override of the temperature set at construction.
        :param distance_cutoff: contact cutoff passed to `calculate_ic`.
        :param acc_threshold: surface-accessibility cutoff for NIS analysis.
        :raises ValueError: on overlapping selection groups or when no
            contacts are found.
        """
        if temp is not None:
            self.temp = temp
        # Make selection dict (chain id -> group index) from user option
        # or PDB chains; a chain may belong to only one group.
        selection_dict: dict[str, int] = {}
        for igroup, group in enumerate(self.selection):
            chains = group.split(",")
            for chain in chains:
                if chain in selection_dict:
                    errmsg = "Selections must be disjoint sets: " f"{chain} is repeated"
                    raise ValueError(errmsg)
                selection_dict[chain] = igroup
        # Contacts between different selection groups
        self.ic_network = calculate_ic(
            self.model, d_cutoff=distance_cutoff, selection=selection_dict
        )
        self.bins = analyse_contacts(self.ic_network)
        # SASA -> NIS percentages ([apolar, charged, polar])
        _, cmplx_sasa = execute_freesasa_api(self.model)
        self.nis_a, self.nis_c, self.nis_p = analyse_nis(cmplx_sasa, acc_threshold=acc_threshold)
        # Affinity Calculation (IC-NIS model), then convert deltaG to Kd
        # at the configured temperature.
        self.ba_val = IC_NIS(
            self.bins["CC"],
            self.bins["AC"],
            self.bins["PP"],
            self.bins["AP"],
            self.nis_a,
            self.nis_c,
        )
        self.kd_val = dg_to_kd(self.ba_val, self.temp)
def as_dict(self) -> dict:
return_dict = {
"model": self.model.id,
"selection": self.selection,
"temp": self.temp,
"ICs": len(self.ic_network),
"nis_a": self.nis_a,
"nis_c": self.nis_c,
"nis_p": self.nis_p,
"ba_val": self.ba_val,
"kd_val": self.kd_val,
}
return_dict.update(self.bins)
return return_dict
def print_prediction(self, outfile: str = "", quiet: bool = False, showall: bool = False) -> None:
handle: Union[TextIOWrapper, TextIO]
if outfile:
handle = open(outfile, "w")
else:
handle = sys.stdout
if quiet:
handle.write("{0}\t{1:8.3f}\n".format(self.name, self.ba_val))
else:
# Collect output lines in order
lines = []
lines.append(f"[+] No. of intermolecular contacts: {len(self.ic_network)}\n")
lines.append(f"[+] No. of Charged-Charged contacts: {self.bins['CC']}\n")
lines.append(f"[+] No. of Charged-Polar contacts: {self.bins['CP']}\n")
lines.append(f"[+] No. of Charged-Apolar contacts: {self.bins['AC']}\n")
lines.append(f"[+] No. of Polar-Polar contacts: {self.bins['PP']}\n")
lines.append(f"[+] No. of Apolar-Polar contacts: {self.bins['AP']}\n")
lines.append(f"[+] No. of Apolar-Apolar contacts: {self.bins['AA']}\n")
if showall:
lines.append(f"[+] No. of hydrophiLic-hydrophiLic contacts: {self.bins['LL']}\n")
lines.append(f"[+] No. of hydrophoBic-hydrophiLic contacts: {self.bins['BL']}\n")
lines.append(f"[+] No. of hydrophoBic-hydrophoBic contacts: {self.bins['BB']}\n")
lines.append(f"[+] Percentage of Polar NIS residues: {self.nis_p:3.2f}\n")
lines.append(f"[+] Percentage of Apolar NIS residues: {self.nis_a:3.2f}\n")
lines.append(f"[+] Percentage of Charged NIS residues: {self.nis_c:3.2f}\n")
lines.append(f"[++] predicted binding affinity (kcal.mol-1): {self.ba_val:8.1f}\n")
lines.append(f"[++] predicted dissociation constant (M) at {self.temp:.1f}˚C: {self.kd_val:8.1e}\n")
handle.writelines(lines)
if handle is not sys.stdout:
handle.close()
def print_contacts(self, outfile: str = "") -> None:
handle: Union[TextIOWrapper, TextIO]
if outfile:
handle = open(outfile, "w")
else:
handle = sys.stdout
for res1, res2 in self.ic_network:
_fmt_str = (
"{0.resname:>5s} {0.id[1]:5} {0.parent.id:>3s} {1.resname:>5s}"
" {1.id[1]:5} {1.parent.id:>3s}\n"
)
if res1.parent.id not in self.selection[0]:
res1, res2 = res2, res1
handle.write(_fmt_str.format(res1, res2))
if handle is not sys.stdout:
handle.close()
def print_pymol_script(self, outfile: str = "") -> None:
# Writing output PYMOL: pml script
# initialize array with chains and save chain selection string
selection_strings = []
chains: dict[str, set] = {}
for s in self.selection:
selection_strings.append(s.replace(",", "+"))
for c in s.split(","):
chains[c] = set()
# loop over pairs and add interface residues to respective chains
for pair in self.ic_network:
for r in pair:
chains[r.parent.id].add(str(r.id[1]))
# set output stream
handle = open(outfile, "w") if outfile else sys.stdout
# write default setup strings
handle.writelines(
[
"color silver\n",
"as cartoon\n",
"bg_color white\n",
"center\n",
"color lightblue, chain {}\n".format(selection_strings[0]),
"color lightpink, chain {}\n".format(selection_strings[1]),
]
)
# loop over interfaces construct selection strings
# and write interface related commands
for color, iface in [("blue", 1), ("hotpink", 2)]:
p_sel_string = " or ".join(
[
"chain {} and resi {}".format(c, "+".join(chains[c]))
for c in selection_strings[iface - 1].split("+")
]
)
handle.write("select iface{}, {}\n".format(iface, p_sel_string))
handle.write("color {}, iface{}\n".format(color, iface))
handle.write("show sticks, iface{}\n".format(iface))
# close file handle if applicable
if handle is not sys.stdout:
handle.close()

View File

@@ -0,0 +1,25 @@
"""
Assorted utility functions.
"""
import math
import os
def check_path(path: str) -> str:
    """
    Resolve ``path`` to an absolute path and verify it is a readable file.

    :param path: path to an existing, readable regular file.
    :return: the absolute path.
    :raises IOError: if the path does not exist, is not a regular file,
        or is not readable by the current user.
    """
    full_path = os.path.abspath(path)
    # isfile() rejects directories and missing paths; os.access() makes the
    # "readable" promise in the contract actually hold.
    if not os.path.isfile(full_path) or not os.access(full_path, os.R_OK):
        raise IOError("Could not read file: {0}".format(path))
    return full_path
def dg_to_kd(dg: float, temperature: float = 25.0) -> float:
    """
    Convert a binding free energy into a dissociation constant.

    :param dg: free energy of binding (kcal/mol).
    :param temperature: temperature in degrees Celsius.
    :return: the dissociation constant Kd, i.e. exp(dG / RT).
    """
    kelvin = temperature + 273.15
    # Gas constant R in kcal/(mol*K) multiplied by the absolute temperature.
    rt_kcal = 0.0019858775 * kelvin
    return math.exp(dg / rt_kcal)

3
tests/__init__.py Normal file
View File

@@ -0,0 +1,3 @@
from pathlib import Path

# Directory containing the PDB/mmCIF fixtures used across the test suite.
TEST_DATA = Path(__file__).parent / "test_data"

3020
tests/test_data/2oob.cif Normal file

File diff suppressed because it is too large Load Diff

1449
tests/test_data/2oob.pdb Normal file

File diff suppressed because it is too large Load Diff

1460
tests/test_data/dataset.json Normal file

File diff suppressed because it is too large Load Diff

78
tests/test_parsers.py Normal file
View File

@@ -0,0 +1,78 @@
from pathlib import Path
import pytest
from Bio.PDB.MMCIFParser import MMCIFParser
from Bio.PDB.PDBParser import PDBParser
from Bio.PDB.Structure import Structure
from prodigy_prot.modules.parsers import get_parser, parse_structure, validate_structure
from . import TEST_DATA
@pytest.fixture
def input_structure_cif():
    """Path to the 2oob mmCIF test fixture."""
    yield TEST_DATA / "2oob.cif"
@pytest.fixture
def input_structure_pdb() -> Path:
    """Path to the 2oob PDB test fixture."""
    return TEST_DATA / "2oob.pdb"
def test_get_parser_pdb(input_structure_pdb):
    """get_parser must hand back a PDBParser for a .pdb path."""
    result = get_parser(input_structure_pdb)
    assert isinstance(result, PDBParser)
def test_get_parser_cif(input_structure_cif):
    """get_parser must hand back an MMCIFParser for a .cif path."""
    result = get_parser(input_structure_cif)
    assert isinstance(result, MMCIFParser)
def test_validate_structure_pdb(input_structure_pdb):
    """validate_structure on a parsed PDB must return the structure's models."""
    structure = PDBParser().get_structure("test_structure", input_structure_pdb)
    assert isinstance(structure, Structure)
    assert validate_structure(structure) == structure.child_list
def test_validate_structure_cif(input_structure_cif):
    """validate_structure on a parsed mmCIF must return the structure's models."""
    structure = MMCIFParser().get_structure("test_structure", input_structure_cif)
    assert isinstance(structure, Structure)
    assert validate_structure(structure) == structure.child_list
def test_parse_structure_pdb(input_structure_pdb):
    """parse_structure(pdb) must return the models plus chain/residue counts."""
    reference = PDBParser().get_structure(input_structure_pdb.stem, input_structure_pdb)
    assert isinstance(reference, Structure)
    models, n_chains, n_res = parse_structure(input_structure_pdb)
    assert models == reference.child_list
    assert (n_chains, n_res) == (2, 116)
def test_parse_structure_cif(input_structure_cif):
    """parse_structure(cif) must return the models plus chain/residue counts."""
    reference = MMCIFParser().get_structure(input_structure_cif.stem, input_structure_cif)
    assert isinstance(reference, Structure)
    models, n_chains, n_res = parse_structure(input_structure_cif)
    assert models == reference.child_list
    assert (n_chains, n_res) == (2, 116)

239
tests/test_prodigy.py Normal file
View File

@@ -0,0 +1,239 @@
import json
import tarfile
import tempfile
from io import BufferedReader, TextIOWrapper
from os.path import basename, splitext
from pathlib import Path
import pytest
from Bio.PDB.Model import Model
from Bio.PDB.PDBParser import PDBParser
from Bio.PDB.Residue import Residue
from Bio.PDB.Structure import Structure
from prodigy_prot.modules.parsers import validate_structure
from prodigy_prot.modules.prodigy import (
Prodigy,
analyse_contacts,
analyse_nis,
calculate_ic,
)
from . import TEST_DATA
@pytest.fixture
def input_model():
    """First model of the parsed 2oob PDB structure."""
    pdb_path = Path(TEST_DATA, "2oob.pdb")
    structure = PDBParser().get_structure(pdb_path.stem, pdb_path)
    assert isinstance(structure, Structure)
    return structure.child_list[0]
@pytest.fixture
def compressed_dataset_f():
    """Tarball containing the benchmark dataset PDB files."""
    return TEST_DATA / "dataset.tgz"
@pytest.fixture
def expected_dataset_json():
    """JSON file with the expected per-case prediction values."""
    return TEST_DATA / "dataset.json"
@pytest.fixture
def prodigy_class(input_model):
    """Fresh Prodigy instance wrapping the 2oob model."""
    yield Prodigy(input_model)
def test_calculate_ic(input_model):
    """2oob at a 5.5 cutoff must yield 78 contacts, the first being ASN-LYS."""
    contacts = calculate_ic(model=input_model, d_cutoff=5.5)
    assert len(contacts) == 78
    first: tuple[Residue, Residue] = contacts[0]
    assert (first[0].get_resname(), first[1].get_resname()) == ("ASN", "LYS")
def test_calculate_ic_with_selection(input_model):
    """An explicit A/B selection must reproduce the default contact list."""
    contacts = calculate_ic(model=input_model, d_cutoff=5.5, selection={"A": 0, "B": 1})
    assert len(contacts) == 78
    first: tuple[Residue, Residue] = contacts[0]
    assert (first[0].get_resname(), first[1].get_resname()) == ("ASN", "LYS")
def test_analyse_contacts(input_model):
    """A single A931-B6 contact must land in the CP and LL bins only."""
    pair = (input_model["A"][(" ", 931, " ")], input_model["B"][(" ", 6, " ")])
    observed = analyse_contacts([pair])
    expected = {
        "CC": 0.0,
        "CP": 1.0,
        "AC": 0.0,
        "PP": 0.0,
        "AP": 0.0,
        "AA": 0.0,
        "LL": 1.0,
        "BL": 0.0,
        "BB": 0.0,
    }
    assert observed == expected
def test_analyse_nis():
    """A single accessible ARG residue must register as 100% in the middle bin."""
    rel_sasa = {("B", "ARG", "72"): 0.9}
    apolar, polar, charged = analyse_nis(rel_sasa)
    assert (apolar, polar, charged) == (0.0, 100.0, 0.0)
def test_prodigy_predict(prodigy_class):
    """End-to-end prediction on 2oob should land near the reference values."""
    prodigy_class.predict()
    # (attribute, expected value, absolute tolerance); kd_val is the
    # actual prediction of interest.
    for attr, expected, tol in [
        ("nis_a", 35.5, 1.0),
        ("nis_c", 38.0, 1.0),
        ("ba_val", -6.2, 1.0),
        ("kd_val", 2.7e-5, 1e-6),
    ]:
        assert getattr(prodigy_class, attr) == pytest.approx(expected, abs=tol)
def test_prodigy_as_dict(prodigy_class):
    """as_dict must expose the 9 scalar fields plus the 9 contact bins."""
    result = prodigy_class.as_dict()
    assert isinstance(result, dict)
    assert len(result) == 18
def test_prodigy_print_prediction(prodigy_class):
    """print_prediction must write a non-empty report to the target file."""
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        out_path = Path(tmp.name)
    assert out_path.stat().st_size == 0
    prodigy_class.print_prediction(str(out_path))
    assert out_path.stat().st_size != 0
    out_path.unlink()
def test_prodigy_print_prediction_quiet(prodigy_class):
    """Quiet mode must still produce output in the target file."""
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        out_path = Path(tmp.name)
    assert out_path.stat().st_size == 0
    prodigy_class.print_prediction(str(out_path), True)
    assert out_path.stat().st_size != 0
    out_path.unlink()
def test_prodigy_print_contacts(input_model, prodigy_class):
    """print_contacts must write the injected contact pair to the file."""
    pair = (input_model["A"][(" ", 931, " ")], input_model["B"][(" ", 6, " ")])
    prodigy_class.ic_network = [pair]
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        out_path = Path(tmp.name)
    assert out_path.stat().st_size == 0
    prodigy_class.print_contacts(str(out_path))
    assert out_path.stat().st_size != 0
    out_path.unlink()
def test_print_pymol_script(input_model, prodigy_class):
    """print_pymol_script must emit a non-empty .pml file for one contact."""
    pair = (input_model["A"][(" ", 931, " ")], input_model["B"][(" ", 6, " ")])
    prodigy_class.ic_network = [pair]
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        out_path = Path(tmp.name)
    assert out_path.stat().st_size == 0
    prodigy_class.print_pymol_script(str(out_path))
    assert out_path.stat().st_size != 0
    out_path.unlink()
@pytest.mark.integration
def test_dataset_prediction(compressed_dataset_f, expected_dataset_json):
    """
    Integration test: run the full Prodigy pipeline on every PDB in the
    benchmark tarball and compare against the reference values in
    dataset.json. Contact-bin counts must match exactly; nis_a, nis_c and
    ba_val must be within 2% of the expected values.
    """
    # load expected data from json
    with open(expected_dataset_json) as fh:
        expected_data = json.load(fh)
    # load dataset PDBs
    dataset = tarfile.open(compressed_dataset_f)
    parser = PDBParser(QUIET=True)
    # bins that must match the reference exactly
    keys_equal = ["AA", "PP", "CC", "AP", "CP", "AC"]
    # relative deviations collected per tolerance-checked quantity
    diffs = {"ba_val": [], "nis_a": [], "nis_c": []}
    # run prodigy for each dataset in the PDB
    for entry in dataset:
        s_name, s_ext = splitext(basename(entry.name))
        # skip system files in archive
        if not s_name.isalnum() or s_ext != ".pdb":
            continue
        handle = dataset.extractfile(entry)
        # Wrap filehandle to ensure string file handle in Python 3
        handle = TextIOWrapper(BufferedReader(handle))  # type: ignore
        parsed_structure = parser.get_structure(s_name, handle)
        assert isinstance(parsed_structure, Structure)
        models = validate_structure(parsed_structure, selection=["A", "B"])
        # Test for structure object
        # Check if it's a list and all elements are Model objects
        assert isinstance(models, list) and all(
            isinstance(item, Model) for item in models
        )
        # assert isinstance(s, list[Model])
        # run prediction and retrieve result dict
        for m in models:
            prod = Prodigy(m, selection=["A", "B"])
            prod.predict()
            results = prod.as_dict()
            # check for equality of predicted interface residues
            for k in keys_equal:
                observed_value = results[k]
                expected_value = expected_data[s_name][k]
                assert observed_value == pytest.approx(expected_value)
            # check that NIS and binding affinity values are within 2% of
            # expected values and add diffs for summary
            for k in diffs.keys():
                # relative deviation from the reference value
                delta = abs(results[k] / expected_data[s_name][k] - 1)
                # assume a difference of less than 2%
                assert delta == pytest.approx(0, abs=0.02)
                diffs[k].append(delta)

21
tests/test_utils.py Normal file
View File

@@ -0,0 +1,21 @@
import math
import tempfile
from pathlib import Path
from prodigy_prot.modules.utils import check_path, dg_to_kd
def test_check_path():
    """check_path should echo back the (already absolute) temp-file path."""
    tmp = tempfile.NamedTemporaryFile(delete=False)
    assert check_path(tmp.name) == tmp.name
    Path(tmp.name).unlink()
def test_dg_to_kd():
assert math.isclose(dg_to_kd(0.0), 1.0, rel_tol=1e-9)