Merge pull request #148 from speleo3/fix-get_pi

Refactor and fix MolecularContainer.get_pi()
2022-08-08 19:52:36 -07:00
parent ab80128c45 80c7bf07cd
commit 8acfb4fe1b
4 changed files with 126 additions and 107 deletions
--- a/propka/molecular_container.py
+++ b/propka/molecular_container.py
@@ -15,12 +15,6 @@ from propka.lib import make_grid
 _LOGGER = logging.getLogger(__name__)
 # TODO - these are constants whose origins are a little murky
 UNK_PI_CUTOFF = 0.01
 # Maximum number of iterations for finding PI
 MAX_ITERATION = 4
 class MolecularContainer:
    """Container for storing molecular contents of PDB files.
@@ -207,37 +201,38 @@ class MolecularContainer:
            charge_profile.append([ph, q_unfolded, q_folded])
        return charge_profile
-    def get_pi(self, conformation='AVR', grid=[0., 14., 1], iteration=0):
+    def get_pi(self, conformation='AVR', grid=[0., 14., 1], *,
            precision: float = 1e-4):
        """Get the isoelectric points for folded and unfolded states.
        Args:
            conformation:  conformation to test
            grid:  grid of pH values [min, max, step]
-            iteration:  iteration number of process
+            precision:  Compute pI up to this precision
        Returns:
            1. Folded state PI
            2. Unfolded state PI
        """
-        charge_profile = self.get_charge_profile(
+        conf = self.conformations[conformation]
-            conformation=conformation, grid=grid)
+
-        pi_folded = pi_unfolded = [None, 1e6, 1e6]
+        WHICH_UNFOLDED = 0
-        for point in charge_profile:
+        WHICH_FOLDED = 1
-            pi_folded = min(pi_folded, point, key=lambda v: abs(v[2]))
+
-            pi_unfolded = min(pi_unfolded, point, key=lambda v: abs(v[1]))
+        def pi(which, pH, min_, max_):
-        # If results are not good enough, do it again with a higher sampling
+            charge = conf.calculate_charge(
-        # resolution
+                self.version.parameters, ph=pH)[which]
-        pi_folded_value = pi_folded[0]
+            if max_ - min_ > precision:
-        pi_unfolded_value = pi_unfolded[0]
+                if charge > 0.0:
-        step = grid[2]
+                    min_ = pH
-        # TODO - need to warn if maximum number of iterations is exceeded
+                else:
-        if ((pi_folded[2] > UNK_PI_CUTOFF
+                    max_ = pH
-             or pi_unfolded[1] > UNK_PI_CUTOFF) and iteration < MAX_ITERATION):
+                next_pH = (min_ + max_) / 2
-            pi_folded_value, _ = self.get_pi(
+                return pi(which, next_pH, min_, max_)
-                conformation=conformation,
+            return pH
-                grid=[pi_folded[0]-step, pi_folded[0]+step, step/10.0],
+
-                iteration=iteration+1)
+        start = (grid[0] + grid[1]) / 2, grid[0], grid[1]
-            _, pi_unfolded_value = self.get_pi(
+
-                conformation=conformation,
+        return (
-                grid=[pi_unfolded[0]-step, pi_unfolded[0]+step, step/10.0],
+            pi(WHICH_FOLDED, *start),
-                iteration=iteration+1)
+            pi(WHICH_UNFOLDED, *start),
-        return pi_folded_value, pi_unfolded_value
+        )
--- a/tests/results/1HPX.dat
+++ b/tests/results/1HPX.dat
@@ -1,49 +0,0 @@
 5.07
 3.11
 4.62
 2.55
 9.28
 1.78
 4.91
 2.13
 4.78
 3.93
 3.65
 3.89
 4.73
 3.36
 4.07
 3.70
 2.08
 2.11
 6.98
 7.11
 9.41
 11.68
 9.82
 11.61
 9.67
 9.54
 10.43
 10.32
 11.41
 10.54
 10.42
 10.92
 10.55
 11.01
 11.43
 10.47
 10.41
 11.07
 13.96
 12.41
 14.39
 12.35
 12.76
 12.42
 13.73
 12.28
 8.96
 8.96
 4.60
--- a/tests/results/1HPX.json
+++ b/tests/results/1HPX.json
@@ -0,0 +1,55 @@
 {
  "pI_folded": 9.54,
  "pI_unfolded": 8.9,
  "pKa": [
    5.07,
    3.11,
    4.62,
    2.55,
    9.28,
    1.78,
    4.91,
    2.13,
    4.78,
    3.93,
    3.65,
    3.89,
    4.73,
    3.36,
    4.07,
    3.70,
    2.08,
    2.11,
    6.98,
    7.11,
    9.41,
    11.68,
    9.82,
    11.61,
    9.67,
    9.54,
    10.43,
    10.32,
    11.41,
    10.54,
    10.42,
    10.92,
    10.55,
    11.01,
    11.43,
    10.47,
    10.41,
    11.07,
    13.96,
    12.41,
    14.39,
    12.35,
    12.76,
    12.42,
    13.73,
    12.28,
    8.96,
    8.96,
    4.60
  ]
 }
--- a/tests/test_basic_regression.py
+++ b/tests/test_basic_regression.py
@@ -2,9 +2,10 @@
 import logging
 import os
 import re
 import json
 from pathlib import Path
 import pytest
-from numpy.testing import assert_almost_equal
+from pytest import approx
 from propka.parameters import Parameters
 from propka.molecular_container import MolecularContainer
 from propka.input import read_parameter_file, read_molecule_file
@@ -18,6 +19,7 @@ _LOGGER = logging.getLogger(__name__)
 # decimal places in pKa output as well as need to make unmodified code work
 # on WSL Ubuntu 18.04
 MAX_ERR_DECIMALS = 2
 MAX_ERR_ABS = 10**-MAX_ERR_DECIMALS
 # This directory
@@ -82,6 +84,34 @@ def run_propka(options, pdb_path, tmp_path):
        os.chdir(cwd)
 def parse_pka(pka_path: Path) -> dict:
    """Parse testable data from a .pka file into a dictionary.
    """
    pka_list = []
    data = {"pKa": pka_list}
    with open(pka_path, "rt") as pka_file:
        at_pka = False
        for line in pka_file:
            if at_pka:
                if line.startswith("---"):
                    at_pka = False
                else:
                    m = re.search(r'\d+\.\d+', line[13:])
                    pka_list.append(float(m.group()))
            elif "model-pKa" in line:
                at_pka = True
            else:
                m = re.match(
                    r"The pI is *(\d+\.\d+) .folded. and *(\d+\.\d+) .unfolded.",
                    line)
                if m is not None:
                    data["pI_folded"] = float(m.group(1))
                    data["pI_unfolded"] = float(m.group(2))
    return data
 def compare_output(pdb, tmp_path, ref_path):
    """Compare results of test with reference.
@@ -92,31 +122,16 @@ def compare_output(pdb, tmp_path, ref_path):
    Raises:
        ValueError if results disagree.
    """
    ref_data = []
    with open(ref_path, "rt") as ref_file:
-        for line in ref_file:
+        if ref_path.name.endswith(".json"):
-            ref_data.append(float(line))
+            ref_data = json.load(ref_file)
    test_data = []
    pka_path = Path(tmp_path) / ("{0:s}.pka".format(pdb))
    with open(pka_path, "rt") as pka_file:
        at_pka = False
        for line in pka_file:
            if not at_pka:
                if "model-pKa" in line:
                    at_pka = True
            elif line.startswith("---"):
                at_pka = False
        else:
-                match = re.search(r'([0-9]+\.[0-9]+)', line)
+            ref_data = {"pKa": [float(line) for line in ref_file]}
-                value = float(match.group(0))
+
-                test_data.append(value)
+    test_data = parse_pka(tmp_path / f"{pdb}.pka")
-    errstr = (
+
-        "Error exceeds maximum allowed value ({0:d} decimal places)".format(
+    for key in ref_data:
-            MAX_ERR_DECIMALS))
+        assert test_data[key] == approx(ref_data[key], abs=MAX_ERR_ABS), key
    assert_almost_equal(
        test_data, ref_data, decimal=MAX_ERR_DECIMALS, err_msg=errstr,
        verbose=True)
@pytest.mark.parametrize("pdb, options", [
@@ -132,9 +147,12 @@ def compare_output(pdb, tmp_path, ref_path):
 def test_regression(pdb, options, tmp_path):
    """Basic regression test of PROPKA functionality."""
    path_dict = get_test_dirs()
-    ref_path = path_dict["results"] / ("{0:s}.dat".format(pdb))
+
    for ext in ["json", "dat"]:
        ref_path = path_dict["results"] / f"{pdb}.{ext}"
        if ref_path.is_file():
            ref_path = ref_path.resolve()
            break
    else:
        _LOGGER.warning("Missing results file for comparison: {0:s}".format(
            str(ref_path)))
-.07
-.11
-.62
-.55
-.28
-.78
-.91
-.13
-.78
-.93
-.65
-.89
-.73
-.36
-.07
-.70
-.08
-.11
-.98
-.11
-.41
-.68
-.82
-.61
-.67
-.54
-.43
-.32
-.41
-.54
-.42
-.92
-.55
-.01
-.43
-.47
-.41
-.07
-.96
-.41
-.39
-.35
-.76
-.42
-.73
-.28
-.96
-.96
-.60