From 6388e0c4eeec716192f318639d7658280ecf29f6 Mon Sep 17 00:00:00 2001 From: Thomas Holder Date: Mon, 8 Aug 2022 21:18:30 +0200 Subject: [PATCH 1/2] Test pI (isoelectric point) result --- tests/results/1HPX.dat | 49 ----------------------- tests/results/1HPX.json | 55 ++++++++++++++++++++++++++ tests/test_basic_regression.py | 72 +++++++++++++++++++++------------- 3 files changed, 100 insertions(+), 76 deletions(-) delete mode 100644 tests/results/1HPX.dat create mode 100644 tests/results/1HPX.json diff --git a/tests/results/1HPX.dat b/tests/results/1HPX.dat deleted file mode 100644 index 9bf1d8d..0000000 --- a/tests/results/1HPX.dat +++ /dev/null @@ -1,49 +0,0 @@ - 5.07 - 3.11 - 4.62 - 2.55 - 9.28 - 1.78 - 4.91 - 2.13 - 4.78 - 3.93 - 3.65 - 3.89 - 4.73 - 3.36 - 4.07 - 3.70 - 2.08 - 2.11 - 6.98 - 7.11 - 9.41 -11.68 - 9.82 -11.61 - 9.67 - 9.54 -10.43 -10.32 -11.41 -10.54 -10.42 -10.92 -10.55 -11.01 -11.43 -10.47 -10.41 -11.07 -13.96 -12.41 -14.39 -12.35 -12.76 -12.42 -13.73 -12.28 - 8.96 - 8.96 - 4.60 diff --git a/tests/results/1HPX.json b/tests/results/1HPX.json new file mode 100644 index 0000000..90d3992 --- /dev/null +++ b/tests/results/1HPX.json @@ -0,0 +1,55 @@ +{ + "pI_folded": 9.54, + "pI_unfolded": 8.9, + "pKa": [ + 5.07, + 3.11, + 4.62, + 2.55, + 9.28, + 1.78, + 4.91, + 2.13, + 4.78, + 3.93, + 3.65, + 3.89, + 4.73, + 3.36, + 4.07, + 3.70, + 2.08, + 2.11, + 6.98, + 7.11, + 9.41, + 11.68, + 9.82, + 11.61, + 9.67, + 9.54, + 10.43, + 10.32, + 11.41, + 10.54, + 10.42, + 10.92, + 10.55, + 11.01, + 11.43, + 10.47, + 10.41, + 11.07, + 13.96, + 12.41, + 14.39, + 12.35, + 12.76, + 12.42, + 13.73, + 12.28, + 8.96, + 8.96, + 4.60 + ] +} diff --git a/tests/test_basic_regression.py b/tests/test_basic_regression.py index 6cf188a..d604368 100644 --- a/tests/test_basic_regression.py +++ b/tests/test_basic_regression.py @@ -2,9 +2,10 @@ import logging import os import re +import json from pathlib import Path import pytest -from numpy.testing import assert_almost_equal +from pytest import approx from propka.parameters import Parameters from propka.molecular_container import MolecularContainer from propka.input import read_parameter_file, read_molecule_file @@ -18,6 +19,7 @@ _LOGGER = logging.getLogger(__name__) # decimal places in pKa output as well as need to make unmodified code work # on WSL Ubuntu 18.04 MAX_ERR_DECIMALS = 2 +MAX_ERR_ABS = 10**-MAX_ERR_DECIMALS # This directory @@ -82,6 +84,34 @@ def run_propka(options, pdb_path, tmp_path): os.chdir(cwd) +def parse_pka(pka_path: Path) -> dict: + """Parse testable data from a .pka file into a dictionary. + """ + pka_list = [] + data = {"pKa": pka_list} + + with open(pka_path, "rt") as pka_file: + at_pka = False + for line in pka_file: + if at_pka: + if line.startswith("---"): + at_pka = False + else: + m = re.search(r'\d+\.\d+', line[13:]) + pka_list.append(float(m.group())) + elif "model-pKa" in line: + at_pka = True + else: + m = re.match( + r"The pI is *(\d+\.\d+) .folded. and *(\d+\.\d+) .unfolded.", + line) + if m is not None: + data["pI_folded"] = float(m.group(1)) + data["pI_unfolded"] = float(m.group(2)) + + return data + + def compare_output(pdb, tmp_path, ref_path): """Compare results of test with reference. @@ -92,31 +122,16 @@ def compare_output(pdb, tmp_path, ref_path): Raises: ValueError if results disagree. """ - ref_data = [] with open(ref_path, "rt") as ref_file: - for line in ref_file: - ref_data.append(float(line)) + if ref_path.name.endswith(".json"): + ref_data = json.load(ref_file) + else: + ref_data = {"pKa": [float(line) for line in ref_file]} - test_data = [] - pka_path = Path(tmp_path) / ("{0:s}.pka".format(pdb)) - with open(pka_path, "rt") as pka_file: - at_pka = False - for line in pka_file: - if not at_pka: - if "model-pKa" in line: - at_pka = True - elif line.startswith("---"): - at_pka = False - else: - match = re.search(r'([0-9]+\.[0-9]+)', line) - value = float(match.group(0)) - test_data.append(value) - errstr = ( - "Error exceeds maximum allowed value ({0:d} decimal places)".format( - MAX_ERR_DECIMALS)) - assert_almost_equal( - test_data, ref_data, decimal=MAX_ERR_DECIMALS, err_msg=errstr, - verbose=True) + test_data = parse_pka(tmp_path / f"{pdb}.pka") + + for key in ref_data: + assert test_data[key] == approx(ref_data[key], abs=MAX_ERR_ABS), key @pytest.mark.parametrize("pdb, options", [ @@ -132,9 +147,12 @@ def compare_output(pdb, tmp_path, ref_path): def test_regression(pdb, options, tmp_path): """Basic regression test of PROPKA functionality.""" path_dict = get_test_dirs() - ref_path = path_dict["results"] / ("{0:s}.dat".format(pdb)) - if ref_path.is_file(): - ref_path = ref_path.resolve() + + for ext in ["json", "dat"]: + ref_path = path_dict["results"] / f"{pdb}.{ext}" + if ref_path.is_file(): + ref_path = ref_path.resolve() + break else: _LOGGER.warning("Missing results file for comparison: {0:s}".format( str(ref_path))) From 80c7bf07cd06ad4dfe43d566116703e26cfee0d1 Mon Sep 17 00:00:00 2001 From: Thomas Holder Date: Mon, 8 Aug 2022 21:21:45 +0200 Subject: [PATCH 2/2] Refactor MolecularContainer.get_pi Use bisection instead of grid search. This is ~10x faster and also fixes pI computation for one of my files which only got single digit precision with the old code. --- propka/molecular_container.py | 57 ++++++++++++++++------------------- 1 file changed, 26 insertions(+), 31 deletions(-) diff --git a/propka/molecular_container.py b/propka/molecular_container.py index 5222d31..f1b2b36 100644 --- a/propka/molecular_container.py +++ b/propka/molecular_container.py @@ -15,12 +15,6 @@ from propka.lib import make_grid _LOGGER = logging.getLogger(__name__) -# TODO - these are constants whose origins are a little murky -UNK_PI_CUTOFF = 0.01 -# Maximum number of iterations for finding PI -MAX_ITERATION = 4 - - class MolecularContainer: """Container for storing molecular contents of PDB files. @@ -207,37 +201,38 @@ class MolecularContainer: charge_profile.append([ph, q_unfolded, q_folded]) return charge_profile - def get_pi(self, conformation='AVR', grid=[0., 14., 1], iteration=0): + def get_pi(self, conformation='AVR', grid=[0., 14., 1], *, + precision: float = 1e-4): """Get the isoelectric points for folded and unfolded states. Args: conformation: conformation to test grid: grid of pH values [min, max, step] - iteration: iteration number of process + precision: Compute pI up to this precision Returns: 1. Folded state PI 2. Unfolded state PI """ - charge_profile = self.get_charge_profile( - conformation=conformation, grid=grid) - pi_folded = pi_unfolded = [None, 1e6, 1e6] - for point in charge_profile: - pi_folded = min(pi_folded, point, key=lambda v: abs(v[2])) - pi_unfolded = min(pi_unfolded, point, key=lambda v: abs(v[1])) - # If results are not good enough, do it again with a higher sampling - # resolution - pi_folded_value = pi_folded[0] - pi_unfolded_value = pi_unfolded[0] - step = grid[2] - # TODO - need to warn if maximum number of iterations is exceeded - if ((pi_folded[2] > UNK_PI_CUTOFF - or pi_unfolded[1] > UNK_PI_CUTOFF) and iteration < MAX_ITERATION): - pi_folded_value, _ = self.get_pi( - conformation=conformation, - grid=[pi_folded[0]-step, pi_folded[0]+step, step/10.0], - iteration=iteration+1) - _, pi_unfolded_value = self.get_pi( - conformation=conformation, - grid=[pi_unfolded[0]-step, pi_unfolded[0]+step, step/10.0], - iteration=iteration+1) - return pi_folded_value, pi_unfolded_value + conf = self.conformations[conformation] + + WHICH_UNFOLDED = 0 + WHICH_FOLDED = 1 + + def pi(which, pH, min_, max_): + charge = conf.calculate_charge( + self.version.parameters, ph=pH)[which] + if max_ - min_ > precision: + if charge > 0.0: + min_ = pH + else: + max_ = pH + next_pH = (min_ + max_) / 2 + return pi(which, next_pH, min_, max_) + return pH + + start = (grid[0] + grid[1]) / 2, grid[0], grid[1] + + return ( + pi(WHICH_FOLDED, *start), + pi(WHICH_UNFOLDED, *start), + )