De-lint propka.py.

This commit is contained in:
Nathan Baker
2020-05-25 11:54:41 -07:00
parent 47226bb4b0
commit b3e2685e94

View File

@@ -1,132 +1,136 @@
"""PDB parsing functionality."""
from __future__ import division
from __future__ import print_function
import string, sys, copy
import propka.lib import propka.lib
from propka.lib import info, warning from propka.lib import warning
from propka.atom import Atom from propka.atom import Atom
from propka.conformation_container import ConformationContainer from propka.conformation_container import ConformationContainer
expected_atom_numbers = {'ALA':5,
'ARG':11, EXPECTED_ATOM_NUMBERS = {'ALA': 5, 'ARG': 11, 'ASN': 8, 'ASP': 8, 'CYS': 6,
'ASN':8, 'GLY': 4, 'GLN': 9, 'GLU': 9, 'HIS': 10, 'ILE': 8,
'ASP':8, 'LEU': 8, 'LYS': 9, 'MET': 8, 'PHE': 11, 'PRO': 7,
'CYS':6, 'SER': 6, 'THR': 7, 'TRP': 14, 'TYR': 12, 'VAL': 7}
'GLY':4,
'GLN':9,
'GLU':9,
'HIS':10,
'ILE':8,
'LEU':8,
'LYS':9,
'MET':8,
'PHE':11,
'PRO':7,
'SER':6,
'THR':7,
'TRP':14,
'TYR':12,
'VAL':7}
def read_pdb(pdb_file, parameters, molecule): def read_pdb(pdb_file, parameters, molecule):
conformations = {} """Parse a PDB file.
Args:
pdb_file: file to read
parameters: parameters to guide parsing
molecule: molecular container
Returns:
list with elements:
1. list of conformations
2. list of names
"""
conformations = {}
# read in all atoms in the file # read in all atoms in the file
lines = get_atom_lines_from_pdb(pdb_file, ignore_residues = parameters.ignore_residues, keep_protons = molecule.options.keep_protons, chains=molecule.options.chains) lines = get_atom_lines_from_pdb(pdb_file,
ignore_residues=parameters.ignore_residues,
keep_protons=molecule.options.keep_protons,
chains=molecule.options.chains)
for (name, atom) in lines: for (name, atom) in lines:
if not name in conformations.keys(): if not name in conformations.keys():
conformations[name] = ConformationContainer(name=name, parameters=parameters, molecular_container=molecule) conformations[name] = ConformationContainer(name=name,
parameters=parameters,
molecular_container=molecule)
conformations[name].add_atom(atom) conformations[name].add_atom(atom)
# make a sorted list of conformation names # make a sorted list of conformation names
names = sorted(conformations.keys(), key=propka.lib.conformation_sorter) names = sorted(conformations.keys(), key=propka.lib.conformation_sorter)
return [conformations, names] return [conformations, names]
def protein_precheck(conformations, names):
def protein_precheck(conformations, names):
"""Check protein for correct number of atoms, etc.
Args:
names: conformation names to check
"""
for name in names: for name in names:
atoms = conformations[name].atoms atoms = conformations[name].atoms
# Group the atoms by their residue: # Group the atoms by their residue:
atoms_by_residue = {} atoms_by_residue = {}
for a in atoms: for atom in atoms:
if a.element != 'H': if atom.element != 'H':
res_id = resid_from_atom(a) res_id = resid_from_atom(atom)
try: try:
atoms_by_residue[res_id].append(a) atoms_by_residue[res_id].append(atom)
except KeyError: except KeyError:
atoms_by_residue[res_id] = [a] atoms_by_residue[res_id] = [atom]
for res_id, res_atoms in atoms_by_residue.items(): for res_id, res_atoms in atoms_by_residue.items():
res_name = res_atoms[0].res_name res_name = res_atoms[0].res_name
residue_label = '%3s%5s'%(res_name, res_id) residue_label = '%3s%5s'%(res_name, res_id)
# ignore ligand residues # ignore ligand residues
if res_name not in expected_atom_numbers: if res_name not in EXPECTED_ATOM_NUMBERS:
continue continue
# check for c-terminal # check for c-terminal
if 'C-' in [a.terminal for a in res_atoms]: if 'C-' in [a.terminal for a in res_atoms]:
if len(res_atoms) != expected_atom_numbers[res_name]+1: if len(res_atoms) != EXPECTED_ATOM_NUMBERS[res_name]+1:
warning('Unexpected number (%d) of atoms in residue %s in conformation %s' % (len(res_atoms), residue_label, name)) str_ = ("Unexpected number (%d) of atoms in residue %s "
"in conformation %s" % (len(res_atoms),
residue_label, name))
warning(str_)
continue continue
# check number of atoms in residue # check number of atoms in residue
if len(res_atoms) != expected_atom_numbers[res_name]: if len(res_atoms) != EXPECTED_ATOM_NUMBERS[res_name]:
warning('Unexpected number (%d) of atoms in residue %s in conformation %s' % (len(res_atoms), residue_label, name)) str_ = ('Unexpected number (%d) of atoms in residue %s '
'in conformation %s' % (len(res_atoms),
return residue_label, name))
warning(str_)
def resid_from_atom(a):
return '%4d %s %s'%(a.res_num,a.chain_id,a.icode)
def get_atom_lines_from_pdb(pdb_file, ignore_residues = [], keep_protons=False, tags = ['ATOM ', 'HETATM'], chains=None): def resid_from_atom(atom):
"""Return string with atom residue information.
Args:
atom: atom to generate string for
Returns
string
"""
return '%4d %s %s' % (atom.res_num, atom.chain_id, atom.icode)
def get_atom_lines_from_pdb(pdb_file, ignore_residues=[], keep_protons=False,
tags=['ATOM ', 'HETATM'], chains=None):
"""Get atom lines from PDB file.
Args:
pdb_file: PDB file to parse
ignore_residues: list of residues to ignore
keep_protons: bool to keep/ignore protons
tags: tags of lines that include atoms
chains: list of chains
"""
lines = propka.lib.open_file_for_reading(pdb_file).readlines() lines = propka.lib.open_file_for_reading(pdb_file).readlines()
nterm_residue = 'next_residue' nterm_residue = 'next_residue'
old_residue = None old_residue = None
terminal = None terminal = None
model = 1 model = 1
for line in lines: for line in lines:
tag = line[0:6] tag = line[0:6]
# set the model number # set the model number
if tag == 'MODEL ': if tag == 'MODEL ':
model = int(line[6:]) model = int(line[6:])
nterm_residue = 'next_residue' nterm_residue = 'next_residue'
if tag == 'TER ': if tag == 'TER ':
nterm_residue = 'next_residue' nterm_residue = 'next_residue'
if tag in tags: if tag in tags:
alt_conf_tag = line[16] alt_conf_tag = line[16]
residue_name = line[12: 16] residue_name = line[12: 16]
residue_number = line[22: 26] residue_number = line[22: 26]
# check if we want this residue # check if we want this residue
if line[17: 20] in ignore_residues: if line[17: 20] in ignore_residues:
continue continue
if chains and line[21] not in chains: if chains and line[21] not in chains:
continue continue
# set the Nterm residue number - nessecary because we may need to # set the Nterm residue number - nessecary because we may need to
# identify more than one N+ group for structures with alt_conf tags # identify more than one N+ group for structures with alt_conf tags
if nterm_residue == 'next_residue' and tag == 'ATOM ': if nterm_residue == 'next_residue' and tag == 'ATOM ':
# make sure that we reached a new residue - nessecary if OXT is not the last atom in # make sure that we reached a new residue - nessecary if OXT is
# the previous residue # not the last atom inthe previous residue
if old_residue != residue_number: if old_residue != residue_number:
nterm_residue = residue_number nterm_residue = residue_number
old_residue = None old_residue = None
# Identify the configuration # Identify the configuration
# convert digits to letters # convert digits to letters
if alt_conf_tag in '123456789': if alt_conf_tag in '123456789':
@@ -134,10 +138,10 @@ def get_atom_lines_from_pdb(pdb_file, ignore_residues = [], keep_protons=False,
if alt_conf_tag == ' ': if alt_conf_tag == ' ':
alt_conf_tag = 'A' alt_conf_tag = 'A'
conformation = '%d%s'%(model, alt_conf_tag) conformation = '%d%s'%(model, alt_conf_tag)
# set the terminal # set the terminal
if tag == 'ATOM ': if tag == 'ATOM ':
if residue_name.strip() == 'N' and nterm_residue == residue_number: if (residue_name.strip() == 'N'
and nterm_residue == residue_number):
terminal = 'N+' terminal = 'N+'
if residue_name.strip() in ['OXT', 'O\'\'']: if residue_name.strip() in ['OXT', 'O\'\'']:
terminal = 'C-' terminal = 'C-'
@@ -146,92 +150,102 @@ def get_atom_lines_from_pdb(pdb_file, ignore_residues = [], keep_protons=False,
# and yield the atom # and yield the atom
atom = Atom(line=line) atom = Atom(line=line)
atom.terminal = terminal atom.terminal = terminal
#if keep_protons:
# atom.is_protonated = True
if not (atom.element == 'H' and not keep_protons): #ignore hydrogen if not (atom.element == 'H' and not keep_protons): #ignore hydrogen
yield (conformation, atom) yield (conformation, atom)
terminal = None terminal = None
return
def write_pdb(conformation, filename): def write_pdb(conformation, filename):
"""Write PDB conformation to a file.
Args:
conformation: conformation container
filename: filename for output
"""
write_pdb_for_atoms(conformation.atoms, filename) write_pdb_for_atoms(conformation.atoms, filename)
return
def write_pdb_for_atoms(atoms, filename, make_conect_section=False): def write_pdb_for_atoms(atoms, filename, make_conect_section=False):
out = propka.lib.open_file_for_writing(filename) """Write out PDB file for atoms.
Args:
atoms: list of atoms
filename: name of file
make_conect_section: generate a CONECT PDB section
"""
out = propka.lib.open_file_for_writing(filename)
for atom in atoms: for atom in atoms:
out.write(atom.make_pdb_line()) out.write(atom.make_pdb_line())
if make_conect_section: if make_conect_section:
for atom in atoms: for atom in atoms:
out.write(atom.make_conect_line()) out.write(atom.make_conect_line())
out.close() out.close()
return
def write_mol2_for_atoms(atoms, filename): def write_mol2_for_atoms(atoms, filename):
"""Write out MOL2 file for atoms.
Args:
atoms: list of atoms
filename: name of file
"""
# TODO - header needs to be converted to format string
header = '@<TRIPOS>MOLECULE\n\n%d %d\nSMALL\nUSER_CHARGES\n' header = '@<TRIPOS>MOLECULE\n\n%d %d\nSMALL\nUSER_CHARGES\n'
atoms_section = '@<TRIPOS>ATOM\n' atoms_section = '@<TRIPOS>ATOM\n'
for i in range(len(atoms)): for i, atom in enumerate(atoms):
atoms_section += atoms[i].make_mol2_line(i+1) atoms_section += atom.make_mol2_line(i+1)
bonds_section = '@<TRIPOS>BOND\n' bonds_section = '@<TRIPOS>BOND\n'
id = 1 id_ = 1
for i in range(len(atoms)): for i, atom1 in enumerate(atoms):
for j in range(i+1,len(atoms)): for j, atom2 in enumerate(atoms, i+1):
if atoms[i] in atoms[j].bonded_atoms: if atom1 in atom2.bonded_atoms:
type = get_bond_order(atoms[i],atoms[j]) type_ = get_bond_order(atom1, atom2)
bonds_section += '%7d %7d %7d %7s\n'%(id, i+1, j+1, type) bonds_section += '%7d %7d %7d %7s\n' % (id_, i+1, j+1, type_)
id+=1 id_ += 1
substructure_section = '@<TRIPOS>SUBSTRUCTURE\n\n' substructure_section = '@<TRIPOS>SUBSTRUCTURE\n\n'
if len(atoms) > 0: if len(atoms) > 0:
substructure_section = '@<TRIPOS>SUBSTRUCTURE\n%-7d %10s %7d\n'%(atoms[0].res_num,atoms[0].res_name,atoms[0].numb) substructure_section = ('@<TRIPOS>SUBSTRUCTURE\n%-7d %10s %7d\n'
% (atoms[0].res_num, atoms[0].res_name,
atoms[0].numb))
out = propka.lib.open_file_for_writing(filename) out = propka.lib.open_file_for_writing(filename)
out.write(header%(len(atoms),id-1)) out.write(header % (len(atoms), id_-1))
out.write(atoms_section) out.write(atoms_section)
out.write(bonds_section) out.write(bonds_section)
out.write(substructure_section) out.write(substructure_section)
out.close() out.close()
return
def get_bond_order(atom1, atom2): def get_bond_order(atom1, atom2):
type = '1' """Get the order of a bond between two atoms.
Args:
atom1: first atom in bond
atom2: second atom in bond
Returns:
string with bond type
"""
type_ = '1'
pi_electrons1 = atom1.num_pi_elec_2_3_bonds pi_electrons1 = atom1.num_pi_elec_2_3_bonds
pi_electrons2 = atom2.num_pi_elec_2_3_bonds pi_electrons2 = atom2.num_pi_elec_2_3_bonds
if '.ar' in atom1.sybyl_type: if '.ar' in atom1.sybyl_type:
pi_electrons1 -= 1 pi_electrons1 -= 1
if '.ar' in atom2.sybyl_type: if '.ar' in atom2.sybyl_type:
pi_electrons2 -= 1 pi_electrons2 -= 1
if pi_electrons1 > 0 and pi_electrons2 > 0: if pi_electrons1 > 0 and pi_electrons2 > 0:
type = '%d'%(min(pi_electrons1, pi_electrons2)+1) type_ = '%d' % (min(pi_electrons1, pi_electrons2)+1)
if '.ar' in atom1.sybyl_type and '.ar' in atom2.sybyl_type: if '.ar' in atom1.sybyl_type and '.ar' in atom2.sybyl_type:
type = 'ar' type_ = 'ar'
return type_
return type
def write_input(molecular_container, filename): def write_input(molecular_container, filename):
out = propka.lib.open_file_for_writing(filename) """Write PROPKA input file for molecular container.
Args:
molecular_container: molecular container
filename: output file name
"""
out = propka.lib.open_file_for_writing(filename)
for conformation_name in molecular_container.conformation_names: for conformation_name in molecular_container.conformation_names:
out.write('MODEL %s\n' % conformation_name) out.write('MODEL %s\n' % conformation_name)
# write atoms # write atoms
@@ -247,43 +261,51 @@ def write_input(molecular_container, filename):
for group in molecular_container.conformations[conformation_name].groups: for group in molecular_container.conformations[conformation_name].groups:
out.write(group.make_non_covalently_coupled_line()) out.write(group.make_non_covalently_coupled_line())
out.write('ENDMDL\n') out.write('ENDMDL\n')
out.close() out.close()
return
def read_input(input_file, parameters, molecule): def read_input(input_file, parameters, molecule):
conformations = {} """Read PROPKA input file for molecular container.
Args:
input_file: input file
parameters: parameters for parsing/setup
molecule: molecular container
Returns:
list with [conformations, names of conformations]
"""
conformations = {}
# read in all atoms in the input file # read in all atoms in the input file
lines = get_atom_lines_from_input(input_file) lines = get_atom_lines_from_input(input_file)
for (name, atom) in lines: for (name, atom) in lines:
if not name in conformations.keys(): if not name in conformations.keys():
conformations[name] = ConformationContainer(name=name, parameters=parameters, molecular_container=molecule) conformations[name] = ConformationContainer(
name=name, parameters=parameters,
molecular_container=molecule)
conformations[name].add_atom(atom) conformations[name].add_atom(atom)
# make a sorted list of conformation names # make a sorted list of conformation names
names = sorted(conformations.keys(), key=propka.lib.conformation_sorter) names = sorted(conformations.keys(), key=propka.lib.conformation_sorter)
return [conformations, names] return [conformations, names]
def get_atom_lines_from_input(input_file, tags=['ATOM ', 'HETATM']): def get_atom_lines_from_input(input_file, tags=['ATOM ', 'HETATM']):
"""Get atom lines from a PROPKA input file.
Args:
input_file: input file
tags: tags defining atom lines
Yields:
conformation container, list of atoms
"""
lines = propka.lib.open_file_for_reading(input_file).readlines() lines = propka.lib.open_file_for_reading(input_file).readlines()
conformation = '' conformation = ''
atoms = {} atoms = {}
numbers = [] numbers = []
for line in lines: for line in lines:
tag = line[0:6] tag = line[0:6]
# set the conformation # set the conformation
if tag == 'MODEL ': if tag == 'MODEL ':
conformation = line[6:].strip() conformation = line[6:].strip()
# found an atom - save it # found an atom - save it
if tag in tags: if tag in tags:
atom = Atom(line=line) atom = Atom(line=line)
@@ -292,46 +314,39 @@ def get_atom_lines_from_input(input_file, tags = ['ATOM ','HETATM']):
atom.is_protonated = True atom.is_protonated = True
atoms[atom.numb] = atom atoms[atom.numb] = atom
numbers.append(atom.numb) numbers.append(atom.numb)
# found bonding information - apply it # found bonding information - apply it
if tag == 'CONECT' and len(line) > 14: if tag == 'CONECT' and len(line) > 14:
conect_numbers = [line[i:i+5] for i in range(6, len(line)-1, 5)] conect_numbers = [line[i:i+5] for i in range(6, len(line)-1, 5)]
center_atom = atoms[int(conect_numbers[0])] center_atom = atoms[int(conect_numbers[0])]
for n in conect_numbers[1:]: for num in conect_numbers[1:]:
b = atoms[int(n)] bond_atom = atoms[int(num)]
# remember to check for cysteine bridges # remember to check for cysteine bridges
if center_atom.element == 'S' and b.element == 'S': if center_atom.element == 'S' and bond_atom.element == 'S':
center_atom.cysteine_bridge = True center_atom.cysteine_bridge = True
b.cysteine_bridge = True bond_atom.cysteine_bridge = True
# set up bonding # set up bonding
if not b in center_atom.bonded_atoms: if not bond_atom in center_atom.bonded_atoms:
center_atom.bonded_atoms.append(b) center_atom.bonded_atoms.append(bond_atom)
if not center_atom in b.bonded_atoms: if not center_atom in bond_atom.bonded_atoms:
b.bonded_atoms.append(center_atom) bond_atom.bonded_atoms.append(center_atom)
# found info on covalent coupling # found info on covalent coupling
if tag == 'CCOUPL' and len(line) > 14: if tag == 'CCOUPL' and len(line) > 14:
conect_numbers = [line[i:i+5] for i in range(6, len(line)-1, 5)] conect_numbers = [line[i:i+5] for i in range(6, len(line)-1, 5)]
center_atom = atoms[int(conect_numbers[0])] center_atom = atoms[int(conect_numbers[0])]
for n in conect_numbers[1:]: for num in conect_numbers[1:]:
cg = atoms[int(n)] cov_atom = atoms[int(num)]
center_atom.group.couple_covalently(cg.group) center_atom.group.couple_covalently(cov_atom.group)
# found info on non-covalent coupling # found info on non-covalent coupling
if tag == 'NCOUPL' and len(line) > 14: if tag == 'NCOUPL' and len(line) > 14:
conect_numbers = [line[i:i+5] for i in range(6, len(line)-1, 5)] conect_numbers = [line[i:i+5] for i in range(6, len(line)-1, 5)]
center_atom = atoms[int(conect_numbers[0])] center_atom = atoms[int(conect_numbers[0])]
for n in conect_numbers[1:]: for num in conect_numbers[1:]:
cg = atoms[int(n)] cov_atom = atoms[int(num)]
center_atom.group.couple_non_covalently(cg.group) center_atom.group.couple_non_covalently(cov_atom.group)
# this conformation is done - yield the atoms # this conformation is done - yield the atoms
if tag == 'ENDMDL': if tag == 'ENDMDL':
for n in numbers: for num in numbers:
yield (conformation, atoms[n]) yield (conformation, atoms[num])
# prepare for next conformation # prepare for next conformation
atoms = {} atoms = {}
numbers = [] numbers = []
return