Initial commit: digital-patients pipeline (clean, no large files)
Large reference/model files excluded from repo - to be staged to S3 or baked into Docker images.
This commit is contained in:
5
app_filter/README.md
Normal file
5
app_filter/README.md
Normal file
@@ -0,0 +1,5 @@
|
||||
Run command
|
||||
|
||||
```
|
||||
vep --config vep.ini -i ${input_vcf} -o ${input_vcf.simpleName}_cosmic.vcf
|
||||
```
|
||||
56
app_filter/filter_transcriptome.py
Normal file
56
app_filter/filter_transcriptome.py
Normal file
@@ -0,0 +1,56 @@
|
||||
from argparse import ArgumentParser
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
|
||||
def make_argparser():
    """Build the command-line parser for the transcriptome filter.

    The parser exposes three mandatory options, each parsed into a
    ``pathlib.Path``: ``--mutated``, ``--non-mutated`` and ``--mane-data``.
    """
    # (flag, help text) pairs; every option shares the same type/required setup.
    option_specs = (
        ("--mutated", "The .csv file with mutated TPM."),
        ("--non-mutated", "The .csv file with base (non-mutated) TPM"),
        ("--mane-data", "The .tsv file with MANE data"),
    )

    cli = ArgumentParser(
        description="Filter Transcriptome using mutated and non-mutated TPM values, and MANE dataset"
    )
    for flag, help_text in option_specs:
        cli.add_argument(flag, type=Path, required=True, help=help_text)
    return cli
|
||||
|
||||
def main(mutated: Path, non_mutated: Path, mane_data: Path) -> None:
    """Write per-gene log2 fold changes of mutated vs. non-mutated TPM values.

    Args:
        mutated: CSV file with an ``ENSG`` column plus one column per sample
            holding the mutated TPM values.
        non_mutated: CSV file whose first, unnamed column (read by pandas as
            ``"Unnamed: 0"``) holds gene symbols, followed by the same sample
            columns with the baseline TPM values.
        mane_data: Tab-separated file with at least the columns ``symbol`` and
            ``Ensembl_Gene`` (versioned gene IDs such as ``ENSG... .N``).

    Output:
        ``<mutated stem>_log2fc.csv`` is written to the current working
        directory with an ``ENSG`` column followed by one
        ``log2(mutated / non_mutated)`` column per sample.

    Raises:
        KeyError: If an expected column is missing, or a sample column of the
            mutated table has no counterpart in the non-mutated table.
    """
    name = mutated.stem

    df_mut = pd.read_csv(mutated)
    df_nomut = pd.read_csv(non_mutated)
    df_mane = pd.read_csv(mane_data, sep="\t")

    # Strip the version suffix ("ENSG000001.5" -> "ENSG000001"). The .str
    # accessor propagates missing IDs as NaN instead of raising; those rows
    # are removed by the dropna() below.
    df_mane["ENSG"] = df_mane["Ensembl_Gene"].str.split(".").str[0]
    df_mane_map = df_mane[["symbol", "ENSG"]]

    # Translate the baseline table from gene symbols to ENSG identifiers,
    # keeping the first row per symbol and discarding incomplete rows.
    df_nomut_mapped = (
        df_mane_map.merge(df_nomut, left_on="symbol", right_on="Unnamed: 0")
        .dropna()
        .drop_duplicates(subset=["symbol"])
        .drop(["symbol", "Unnamed: 0"], axis=1)
    )

    df_merge = df_nomut_mapped.merge(df_mut, on="ENSG", suffixes=("_nomut", "_mut"))

    # Select the sample columns by name rather than by position so the ENSG
    # key may sit anywhere in the mutated table.
    cols = [col for col in df_mut.columns if col != "ENSG"]

    # NOTE: a TPM of zero in either table yields +/-inf or NaN here; no
    # pseudocount is applied.
    df_res = pd.DataFrame(
        {"ENSG": df_merge["ENSG"]}
        | {col: np.log2(df_merge[col + "_mut"] / df_merge[col + "_nomut"]) for col in cols}
    )

    df_res.to_csv(f"{name}_log2fc.csv", index=False)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: parse the CLI options and hand them to main().
    cli_args = make_argparser().parse_args()
    main(
        mutated=cli_args.mutated,
        non_mutated=cli_args.non_mutated,
        mane_data=cli_args.mane_data,
    )
|
||||
32
app_filter/vep.ini
Normal file
32
app_filter/vep.ini
Normal file
@@ -0,0 +1,32 @@
|
||||
cache 1
|
||||
dir_cache /data
|
||||
assembly 'GRCh38'
|
||||
offline 1
|
||||
force_overwrite 1
|
||||
custom file=/opt/vep/.vep/Cosmic_GenomeScreensMutant_v103_GRCh38.vcf.gz,short_name=COSMIC,format=vcf,type=exact,fields=LEGACY_ID%CNT
|
||||
|
||||
# Functional impact predictions
|
||||
sift b # CALCULATE 1 - score
|
||||
polyphen b # THESE TWO CAN BE AVERAGED
|
||||
|
||||
plugin REVEL,/opt/vep/.vep/REVEL/new_tabbed_revel_grch38.tsv.gz
|
||||
# TODO: CALCULATE THEIR L2 NORM
|
||||
# plugin CADD,/opt/vep/.vep/CADD/GRCh38/whole_genome_SNVs.tsv.gz,/opt/vep/.vep/CADD/GRCh38/InDels.tsv.gz
|
||||
|
||||
|
||||
# Clinical significance
|
||||
check_existing
|
||||
clinvar
|
||||
|
||||
# Allele frequencies
|
||||
af
|
||||
af_1kg
|
||||
af_gnomade
|
||||
af_gnomadg
|
||||
max_af
|
||||
|
||||
# Additional useful flags
|
||||
numbers
|
||||
hgvs
|
||||
symbol
|
||||
canonical
|
||||
Reference in New Issue
Block a user