""" Functions to read PDB/mmCIF files """ import logging import sys import typing import warnings from pathlib import Path from typing import Optional, Union from Bio.PDB.Atom import DisorderedAtom from Bio.PDB.Chain import Chain from Bio.PDB.MMCIFParser import MMCIFParser from Bio.PDB.Model import Model from Bio.PDB.PDBExceptions import PDBConstructionWarning from Bio.PDB.PDBParser import PDBParser from Bio.PDB.Polypeptide import PPBuilder, is_aa from Bio.PDB.Structure import Structure warnings.filterwarnings("ignore", category=PDBConstructionWarning) log = logging.getLogger("Prodigy") def get_parser(input_f: Path) -> Union[PDBParser, MMCIFParser]: if input_f.suffix == ".cif": return MMCIFParser() else: return PDBParser() def ignore(r): return r.id[0][0] == "W" or r.id[0][0] == "H" def validate_structure( input_strcture_obj: Structure, selection: Optional[list[str]] = None, clean: bool = True, ) -> list[Model]: result: list[Model] = [] for model in [m for m in input_strcture_obj.child_list]: # process selected chains chains: list[Chain] = list(model.get_chains()) chain_ids = set([c.id for c in chains]) if selection: sel_chains = [] # Match selected chain with structure for sel in selection: for c_str in sel.split(","): sel_chains.append(c_str) if c_str not in chain_ids: raise ValueError( f"Selected chain not present in provided structure: {c_str}" ) # Remove unselected chains def _ignore_helper(x) -> bool: return x.id not in sel_chains for c in chains: if _ignore_helper(c): if c.parent is not None: c.parent.detach_child(c.id) # Double occupancy check for atom in list(model.get_atoms()): if atom.is_disordered(): atom = typing.cast(DisorderedAtom, atom) residue = atom.parent assert residue is not None sel_at = atom.selected_child assert sel_at is not None sel_at.altloc = " " sel_at.disordered_flag = 0 residue.detach_child(atom.id) residue.add(sel_at) # Insertion code check for c in chains: for residue in c.get_residues(): if residue.get_id()[2] != " ": c.detach_child(residue.id) if clean: # Remove HETATMs and solvent res_list = list(model.get_residues()) for res in res_list: if ignore(res): chain = res.parent assert chain is not None chain.detach_child(res.id) elif not is_aa(res, standard=True): raise ValueError( "Unsupported non-standard amino acid found: {0}".format( res.resname ) ) # Remove Hydrogens atom_list = list(model.get_atoms()) def _ignore(x): return x.element == "H" for atom in atom_list: if _ignore(atom): residue = atom.parent assert residue is not None residue.detach_child(atom.name) # Detect gaps and compare with no. of chains pep_builder = PPBuilder() peptides = pep_builder.build_peptides(model) n_peptides = len(peptides) if n_peptides != len(chain_ids): message = "[!] Structure contains gaps:\n" for i_pp, pp in enumerate(peptides): message += ( "\t{1.parent.id} {1.resname}{1.id[1]} < Fragment {0} > " "{2.parent.id} {2.resname}{2.id[1]}\n".format(i_pp, pp[0], pp[-1]) ) log.warning(message) result.append(model) return result def parse_structure(path: str) -> tuple[list[Model], int, int]: """Return a validated `Structure`, number of chains and number of residues""" extension = Path(path).suffix supported_extensions = [".pdb", ".cif", ".ent"] if extension not in supported_extensions: log.error( f"[!] Structure format '{extension}' is " "not supported. Use '.pdb' or '.cif'." ) sys.exit(1) parser = get_parser(Path(path)) structure_name = Path(path).stem structure_path = Path(path) try: original_structure = parser.get_structure(structure_name, structure_path) except Exception as e: log.exception(e) sys.exit(1) assert isinstance(original_structure, Structure) models: list[Model] = validate_structure(original_structure) # Get number of chains chain_dict = {} res_dict = {} for model in models: chain_dict.update({c.id: c for c in model.get_chains()}) res_dict.update({r.id: r for r in model.get_residues()}) ## Make sure all models have the same chains # Get chain sets for all models chain_sets = [set(chain.id for chain in model.get_chains()) for model in models] # Check if all sets are identical if not all(chain_set == chain_sets[0] for chain_set in chain_sets): raise ValueError( "Not all models have the same chains. Found chain sets: " + ", ".join(str(s) for s in chain_sets) ) res_sets = [set(res.id for res in model.get_residues()) for model in models] if not all(res_set == res_sets[0] for res_set in res_sets): raise ValueError( "Not all models have the same residues. Found residue sets: " + ", ".join(str(s) for s in res_sets) ) # structure, n_chains, n_res = parse_structure(path=str(struct_path)) return (models, len(chain_sets[0]), len(res_sets[0]))