Files
immunebuilder/ImmuneBuilder/sequence_checks.py
Olamide Isreal 8887cbe592
Some checks failed
CodeQL / Analyze (python) (push) Has been cancelled
Configure ImmuneBuilder pipeline for WES execution
- Update container image to harbor.cluster.omic.ai/omic/immunebuilder:latest
- Update input/output paths to S3 (s3://omic/eureka/immunebuilder/)
- Remove local mount containerOptions (not needed in k8s)
- Update homepage to Gitea repo URL
- Clean history to remove large model weight blobs
2026-03-16 15:31:53 +01:00

41 lines
1.7 KiB
Python

from anarci import validate_sequence, anarci, scheme_short_to_long
def number_single_sequence(sequence, chain, scheme="imgt", allowed_species=['human','mouse']):
    """Sanity-check one chain sequence with ANARCI and return its numbering.

    The sequence is always numbered with the IMGT scheme first so that the
    completeness checks below can rely on IMGT positions; only afterwards is
    it renumbered in the requested scheme.

    Args:
        sequence: Amino-acid sequence of a single chain.
        chain: Chain type to accept. When it is ``"L"``, ``"K"`` is accepted
            as well.
        scheme: ``"raw"`` (case-insensitive) for plain 1..N numbering, or any
            key of ANARCI's ``scheme_short_to_long`` table (case-insensitive).
        allowed_species: Forwarded unchanged to ``anarci``. The default list
            is never mutated here.

    Returns:
        A list of ``(position, residue)`` pairs with gap entries (residue
        ``"-"``) removed. For ``"raw"`` each position is ``(index, " ")`` with
        a 1-based index; otherwise positions are whatever ``anarci`` returned.

    Raises:
        NotImplementedError: If ``scheme`` is neither ``"raw"`` nor a known
            scheme name.
        AssertionError: If the sequence is 70 residues or shorter, is not
            recognised as the requested chain type, or is missing too many
            residues at either end.
    """
    validate_sequence(sequence)

    # "raw" is handled locally; every other name is resolved through ANARCI's
    # short-name table. Lower-casing first keeps "RAW" consistent with the
    # case-insensitive handling of the other scheme names.
    scheme_key = scheme.lower()
    if scheme_key == "raw":
        scheme = "raw"
    else:
        try:
            scheme = scheme_short_to_long[scheme_key]
        except KeyError as err:
            raise NotImplementedError(f"Unimplemented numbering scheme: {scheme}") from err

    # The checks below raise AssertionError explicitly instead of using the
    # `assert` statement: asserts are removed under `python -O`, which would
    # let invalid input through. The exception type is kept for callers.
    if not len(sequence) > 70:
        raise AssertionError(
            f"Sequence too short to be an Ig domain. Please give whole sequence:\n{sequence}"
        )

    allow = [chain]
    if chain == "L":
        allow.append("K")
    allowed_chains = set(allow)

    def _number_with(numbering_scheme):
        # Run ANARCI on this sequence and return the raw `numbered` result.
        numbered, _, _ = anarci(
            [("sequence", sequence)],
            scheme=numbering_scheme,
            output=False,
            allow=allowed_chains,
            allowed_species=allowed_species,
        )
        return numbered

    def _without_gaps(numbered):
        # First domain of the first (only) sequence, with gap entries dropped.
        return [x for x in numbered[0][0][0] if x[1] != "-"]

    # Use imgt scheme for numbering sanity checks
    numbered = _number_with('imgt')
    if not numbered[0]:
        raise AssertionError(
            f"Sequence provided as an {chain} chain is not recognised as an {chain} chain."
        )

    output = _without_gaps(numbered)
    numbers = [x[0][0] for x in output]

    # Check for missing residues assuming imgt numbering. An empty residue
    # list is reported with the same message rather than a ValueError from max().
    if not numbers or not ((max(numbers) > 120) and (min(numbers) < 8)):
        raise AssertionError(
            f"Sequence missing too many residues to model correctly. Please give whole sequence:\n{sequence}"
        )

    # Renumber once sanity checks done
    if scheme == "raw":
        output = [((i + 1, " "), x[1]) for i, x in enumerate(output)]
    elif scheme != 'imgt':
        output = _without_gaps(_number_with(scheme))

    return output
def number_sequences(seqs, scheme="imgt", allowed_species=['human','mouse']):
    """Number every chain in ``seqs`` with ``number_single_sequence``.

    Args:
        seqs: Mapping from chain identifier to that chain's sequence. Each
            key is also passed on as the ``chain`` argument.
        scheme: Numbering scheme name, forwarded unchanged.
        allowed_species: Species filter, forwarded unchanged.

    Returns:
        A dict with the same keys as ``seqs`` whose values are the results of
        ``number_single_sequence`` for the corresponding chain.
    """
    numbered_by_chain = {}
    for chain_id in seqs:
        chain_sequence = seqs[chain_id]
        numbered_by_chain[chain_id] = number_single_sequence(
            chain_sequence,
            chain_id,
            scheme=scheme,
            allowed_species=allowed_species,
        )
    return numbered_by_chain