Initial commit: digital-patients pipeline (clean, no large files)

Large reference/model files excluded from repo - to be staged to S3 or baked into Docker images.
This commit is contained in:
2026-03-26 15:15:23 +01:00
commit 9e6a16c19b
45 changed files with 7207 additions and 0 deletions

5
app_filter/README.md Normal file
View File

@@ -0,0 +1,5 @@
Run command
```
vep --config vep.ini -i ${input_vcf} -o ${input_vcf.simpleName}_cosmic.vcf
```

View File

@@ -0,0 +1,56 @@
from argparse import ArgumentParser
import pandas as pd
import numpy as np
from pathlib import Path
def make_argparser():
    """Build the command-line parser for the transcriptome filter.

    The parser exposes three mandatory options (``--mutated``,
    ``--non-mutated`` and ``--mane-data``); each value is converted to a
    ``pathlib.Path``.

    Returns:
        The configured ``ArgumentParser``.
    """
    # (flag, help text) pairs, registered in exactly this order.
    option_specs = (
        ("--mutated", "The .csv file with mutated TPM."),
        ("--non-mutated", "The .csv file with base (non-mutated) TPM"),
        ("--mane-data", "The .tsv file with MANE data"),
    )

    cli = ArgumentParser(
        description="Filter Transcriptome using mutated and non-mutated TPM values, and MANE dataset"
    )
    for flag, help_text in option_specs:
        cli.add_argument(flag, type=Path, required=True, help=help_text)
    return cli
def main(mutated: Path, non_mutated: Path, mane_data: Path) -> None:
    """Write per-gene log2 fold changes (mutated / non-mutated) to a CSV file.

    The non-mutated table is keyed by gene symbol in its unnamed first
    column; symbols are translated to unversioned Ensembl gene IDs through
    the MANE table and then joined with the mutated table on ``ENSG``.
    For every non-``ENSG`` column of the mutated table the value
    ``log2(mutated / non_mutated)`` is computed.

    The result is written to ``<stem of mutated>_log2fc.csv`` in the current
    working directory, without an index column.

    Args:
        mutated: CSV file with an ``ENSG`` column plus one column per sample.
        non_mutated: CSV file whose unnamed first column (read by pandas as
            ``Unnamed: 0``) is matched against the MANE ``symbol`` column.
        mane_data: Tab-separated file with ``symbol`` and ``Ensembl_Gene``
            columns.

    Raises:
        KeyError: If a sample column of the mutated table has no counterpart
            in the non-mutated table.

    Note:
        No pseudocount is applied, so zero values produce ``inf``, ``-inf``
        or ``NaN`` entries in the output.
    """
    name = mutated.stem
    df_mut = pd.read_csv(mutated)
    df_nomut = pd.read_csv(non_mutated)
    df_mane = pd.read_csv(mane_data, sep="\t")

    # Strip the version suffix ("ENSG00000000001.5" -> "ENSG00000000001").
    # The vectorised accessor propagates missing values instead of raising
    # on them; such rows are removed by the dropna() below.
    df_mane["ENSG"] = df_mane["Ensembl_Gene"].str.split(".").str[0]
    df_mane_map = df_mane[["symbol", "ENSG"]]

    # Attach an ENSG to every non-mutated row, keep one row per symbol, then
    # discard the two symbol columns that were only needed for the join.
    df_nomut_mapped = (
        df_mane_map.merge(df_nomut, left_on="symbol", right_on="Unnamed: 0")
        .dropna()
        .drop_duplicates(subset=["symbol"])
        .drop(["symbol", "Unnamed: 0"], axis=1)
    )
    df_merge = df_nomut_mapped.merge(df_mut, on="ENSG", suffixes=("_nomut", "_mut"))

    # Select sample columns by name rather than by position, since the join
    # key is identified by name as well.
    cols = [col for col in df_mut.columns if col != "ENSG"]

    # Merge suffixes are only added to columns present on both sides, so a
    # sample missing from the non-mutated table would otherwise surface as an
    # obscure lookup failure on "<col>_mut".
    missing = [col for col in cols if f"{col}_nomut" not in df_merge.columns]
    if missing:
        raise KeyError(
            f"columns of {mutated} without a counterpart in {non_mutated}: {missing}"
        )

    log2fc = {
        col: np.log2(df_merge[f"{col}_mut"] / df_merge[f"{col}_nomut"])
        for col in cols
    }
    df_res = pd.DataFrame({"ENSG": df_merge["ENSG"], **log2fc})
    df_res.to_csv(f"{name}_log2fc.csv", index=False)
if __name__ == "__main__":
    # Script entry point: parse the three required paths and run the filter.
    cli_args = make_argparser().parse_args()
    main(
        mutated=cli_args.mutated,
        non_mutated=cli_args.non_mutated,
        mane_data=cli_args.mane_data,
    )

32
app_filter/vep.ini Normal file
View File

@@ -0,0 +1,32 @@
cache 1
dir_cache /data
assembly 'GRCh38'
offline 1
force_overwrite 1
custom file=/opt/vep/.vep/Cosmic_GenomeScreensMutant_v103_GRCh38.vcf.gz,short_name=COSMIC,format=vcf,type=exact,fields=LEGACY_ID%CNT
# Functional impact predictions
sift b # CALCULATE 1 - score
polyphen b # THESE TWO CAN BE AVERAGED
plugin REVEL,/opt/vep/.vep/REVEL/new_tabbed_revel_grch38.tsv.gz
# TODO: CALCULATE THEIR L2 NORM
# plugin CADD,/opt/vep/.vep/CADD/GRCh38/whole_genome_SNVs.tsv.gz,/opt/vep/.vep/CADD/GRCh38/InDels.tsv.gz
# Clinical significance
check_existing
clinvar
# Allele frequencies
af
af_1kg
af_gnomade
af_gnomadg
max_af
# Additional useful flags
numbers
hgvs
symbol
canonical