From 9e6a16c19b7fde9cfbfbc53638c90073d65ad434 Mon Sep 17 00:00:00 2001 From: Olamide Isreal Date: Thu, 26 Mar 2026 15:15:23 +0100 Subject: [PATCH] Initial commit: digital-patients pipeline (clean, no large files) Large reference/model files excluded from repo - to be staged to S3 or baked into Docker images. --- .gitattributes | 4 + .gitignore | 24 + .gitmodules | 3 + Dockerfile_borzoi | 92 ++ Dockerfile_cibersortx | 126 +++ Dockerfile_corto | 49 + Dockerfile_rna2proteinexpression | 79 ++ Dockerfile_synthea | 64 ++ Dockerfile_vcf2prot | 96 ++ Dockerfile_vep | 2 + Download_write_healthy_m_f_txt_file.ipynb | 302 +++++ Notes.txt | 7 + README.md | 210 ++++ app_filter/README.md | 5 + app_filter/filter_transcriptome.py | 56 + app_filter/vep.ini | 32 + create_LM22_sourceGEP_ref_file.py | 11 + docker-compose.yml | 48 + documentaion/README.md | 1241 +++++++++++++++++++++ ensg2number.joblib | Bin 0 -> 124367 bytes k8s/README.md | 15 + k8s/deployment-worksplace.yaml | 72 ++ k8s/gpu-test.yaml | 18 + k8s/job-copy-node-to-pvc.yaml | 54 + k8s/job-nextflow-digital-patient.yaml | 52 + main_borzoi.nf | 448 ++++++++ main_cibersortx.nf | 260 +++++ main_corto.nf | 46 + main_filter_outputs.nf | 426 +++++++ main_no_mutations.nf | 208 ++++ main_rna2proteinexpression.nf | 24 + main_synthea.nf | 540 +++++++++ main_synthea.nf.bk | 481 ++++++++ main_synthea.nf.bk.2 | 515 +++++++++ main_vcf2prot.nf | 65 ++ ncbiRefSeq_bigger.csv | 289 +++++ nextflow.config | 98 ++ params.json | 325 ++++++ prot_bigger.csv | 229 ++++ rna2protexpression.py | 133 +++ test.nf | 217 ++++ test.nf.bk | 102 ++ test_gen_patient.nf | 125 +++ test_no_mutations.nf | 14 + tissue2number.joblib | Bin 0 -> 952 bytes 45 files changed, 7207 insertions(+) create mode 100644 .gitattributes create mode 100644 .gitignore create mode 100644 .gitmodules create mode 100644 Dockerfile_borzoi create mode 100644 Dockerfile_cibersortx create mode 100644 Dockerfile_corto create mode 100644 Dockerfile_rna2proteinexpression create mode 100644 
Dockerfile_synthea create mode 100644 Dockerfile_vcf2prot create mode 100644 Dockerfile_vep create mode 100644 Download_write_healthy_m_f_txt_file.ipynb create mode 100644 Notes.txt create mode 100644 README.md create mode 100644 app_filter/README.md create mode 100644 app_filter/filter_transcriptome.py create mode 100644 app_filter/vep.ini create mode 100644 create_LM22_sourceGEP_ref_file.py create mode 100644 docker-compose.yml create mode 100644 documentaion/README.md create mode 100644 ensg2number.joblib create mode 100644 k8s/README.md create mode 100644 k8s/deployment-worksplace.yaml create mode 100644 k8s/gpu-test.yaml create mode 100644 k8s/job-copy-node-to-pvc.yaml create mode 100644 k8s/job-nextflow-digital-patient.yaml create mode 100644 main_borzoi.nf create mode 100644 main_cibersortx.nf create mode 100644 main_corto.nf create mode 100644 main_filter_outputs.nf create mode 100644 main_no_mutations.nf create mode 100644 main_rna2proteinexpression.nf create mode 100644 main_synthea.nf create mode 100644 main_synthea.nf.bk create mode 100644 main_synthea.nf.bk.2 create mode 100644 main_vcf2prot.nf create mode 100644 ncbiRefSeq_bigger.csv create mode 100644 nextflow.config create mode 100644 params.json create mode 100644 prot_bigger.csv create mode 100644 rna2protexpression.py create mode 100644 test.nf create mode 100644 test.nf.bk create mode 100644 test_gen_patient.nf create mode 100644 test_no_mutations.nf create mode 100644 tissue2number.joblib diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..87bb4b3 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,4 @@ +gnomad.genomes.v4.1.sites.female.txt filter=lfs diff=lfs merge=lfs -text +gnomad.genomes.v4.1.sites.male.txt filter=lfs diff=lfs merge=lfs -text +gnomad.genomes.v4.1.sites.* filter=lfs diff=lfs merge=lfs -text +app_filter/Cosmic_GenomeScreensMutant_v103_GRCh38.vcf.gz filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 
0000000..38515a0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,24 @@ +/work +.nextflow +.nextflow.* +sample/ +*.swp +*.pid +*.log + +# Large files (baked into Docker images or staged to S3) +ensemble_reference.fasta +go_df_work.csv +MANE_transcipts_reference.fasta +reference.fasta +MANE.GRCh38.v1.3.update.tsv +go_term_protein_expression_model.pth +CIBERSORTx-fractionshires/ +TPM_NO_MUTATIONS.csv +name_transcipt_map.csv +LM22_sourceGEP_ensg.txt +regulon.rda +impute-me_wget_ukbb_variants.sh +ncbiRefSeq_subset.pickle +prot_subset.pickle +app_filter/Cosmic_GenomeScreensMutant_v103_GRCh38.vcf.gz \ No newline at end of file diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..ba95e96 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "ecotyper"] + path = ecotyper + url = https://gitlab.com/omic/next/registry/tools/ecotyper diff --git a/Dockerfile_borzoi b/Dockerfile_borzoi new file mode 100644 index 0000000..04cc8dc --- /dev/null +++ b/Dockerfile_borzoi @@ -0,0 +1,92 @@ +FROM nvidia/cuda:12.2.2-cudnn8-runtime-ubuntu22.04 + +USER root + +SHELL ["/bin/bash", "-c"] +WORKDIR /home +RUN mkdir -p /home/omic +WORKDIR /home/omic +ARG DEBIAN_FRONTEND=noninteractive + +RUN apt-get update -y && apt-get install -y --no-install-recommends \ + build-essential \ + cmake \ + curl \ + git \ + wget \ + cuda-toolkit-12-2 \ + ca-certificates \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh \ + && bash miniconda.sh -b -p /opt/conda \ + && rm miniconda.sh \ + && ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh \ + && echo ". 
/opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc \ + && echo "conda activate base" >> ~/.bashrc \ + && find /opt/conda/ -follow -type f -name '*.a' -delete \ + && find /opt/conda/ -follow -type f -name '*.js.map' -delete \ + && /opt/conda/bin/conda clean -afy + +ENV PATH /opt/conda/bin:$PATH + +# main conda env (borzoi) +RUN conda create -n borzoi python=3.9 +ENV PATH "$PATH:/opt/conda/envs/borzoi/bin" +RUN echo "source activate borzoi" >> ~/.bashrc +RUN conda clean --all -f -y + +# Install borzoi +RUN git clone https://github.com/calico/baskerville.git +WORKDIR /home/omic/baskerville +RUN /opt/conda/envs/borzoi/bin/python3 -m pip install -e . +WORKDIR /home/omic +RUN git clone https://github.com/calico/borzoi.git +WORKDIR /home/omic/borzoi +RUN /opt/conda/envs/borzoi/bin/python3 -m pip install -e . + +# Install pretrained models +RUN mkdir -p /home/omic/borzoi/saved_models +WORKDIR /home/omic/borzoi/saved_models +RUN mkdir -p f0 && mkdir -p f1 && mkdir -p f2 && mkdir -p f3 +RUN wget --no-check-certificate -O /home/omic/borzoi/saved_models/f0/model0_best.h5 https://storage.googleapis.com/seqnn-share/borzoi/f0/model0_best.h5 +RUN wget --no-check-certificate -O /home/omic/borzoi/saved_models/f1/model0_best.h5 https://storage.googleapis.com/seqnn-share/borzoi/f1/model0_best.h5 +RUN wget --no-check-certificate -O /home/omic/borzoi/saved_models/f2/model0_best.h5 https://storage.googleapis.com/seqnn-share/borzoi/f2/model0_best.h5 +RUN wget --no-check-certificate -O /home/omic/borzoi/saved_models/f3/model0_best.h5 https://storage.googleapis.com/seqnn-share/borzoi/f3/model0_best.h5 + +WORKDIR /home/omic/borzoi + +# Get hg38 +RUN wget -O - https://hgdownload2.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz | gunzip -c > hg38.fa + +# Get other supporting data +RUN wget -O - https://storage.googleapis.com/seqnn-share/helper/gencode41_basic_nort.gtf.gz | gunzip -c > gencode41_basic_nort.gtf +RUN wget 
https://storage.googleapis.com/seqnn-share/helper/gencode41_basic_protein_splice.csv.gz +RUN wget https://storage.googleapis.com/seqnn-share/helper/polyadb_human_v3.csv.gz + +# Index the reference +RUN /opt/conda/envs/borzoi/bin/python3 -m pip install pyfaidx +RUN /opt/conda/envs/borzoi/bin/python3 -c "import pyfaidx; pyfaidx.Faidx('hg38.fa')" + +#Install samtools +RUN apt-get update -y && apt-get install -y libz-dev liblzma-dev libbz2-dev libcurl4-gnutls-dev +WORKDIR /home/omic +RUN git clone --recurse-submodules https://github.com/samtools/htslib.git +RUN git clone https://github.com/samtools/bcftools.git +WORKDIR /home/omic/bcftools +# The following is optional: +#RUN autoheader && autoconf && ./configure --enable-libgsl --enable-perl-filters +RUN make +RUN export BCFTOOLS_PLUGINS=/home/omic/bcftools + +WORKDIR /home/omic/borzoi + +COPY ncbiRefSeq_bigger.csv . +COPY ncbiRefSeq_subset.pickle . + +#copy MANE data +COPY prot_bigger.csv . +COPY prot_subset.pickle . +COPY TPM_NO_MUTATIONS.csv . 
+# test diff --git a/Dockerfile_cibersortx b/Dockerfile_cibersortx new file mode 100644 index 0000000..1d1b6c3 --- /dev/null +++ b/Dockerfile_cibersortx @@ -0,0 +1,126 @@ +# Start with Ubuntu base +FROM ubuntu:22.04 + +USER root +SHELL ["/bin/bash", "-c"] + +# Set environment variables +ENV DEBIAN_FRONTEND=noninteractive \ + TZ=Etc/UTC \ + PATH="/opt/conda/bin:/opt/conda/condabin:/opt/conda/envs/ecotyper/bin:$PATH" \ + R_LIBS="/opt/conda/envs/ecotyper/lib/R/library:/ecotyper/R/library" \ + ECOTYPER_HOME="/ecotyper" + +# Install system dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + wget \ + ca-certificates \ + git \ + libcurl4-openssl-dev \ + libssl-dev \ + libxml2-dev \ + build-essential \ + r-base \ + r-base-dev \ + libcairo2-dev \ + libxt-dev \ + libgraphicsmagick1-dev \ + default-jdk \ + xvfb \ + xauth \ + xfonts-base \ + && rm -rf /var/lib/apt/lists/* + +# Set up R environment +RUN echo "options(repos = c(CRAN = 'https://cloud.r-project.org'))" > /root/.Rprofile + +# Install Conda +RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh && \ + mkdir -p /opt && \ + bash miniconda.sh -b -p /opt/conda && \ + rm miniconda.sh && \ + ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ + echo ". 
/opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ + echo "conda activate base" >> ~/.bashrc && \ + find /opt/conda/ -follow -type f -name '*.a' -delete && \ + find /opt/conda/ -follow -type f -name '*.js.map' -delete && \ + /opt/conda/bin/conda clean -afy + +ENV PATH=/opt/conda/bin:$PATH + +# Update conda and create environment +RUN conda update -y -n base -c defaults conda && \ + conda create -n ecotyper -c conda-forge -c bioconda \ + python=3.8 \ + r-base=4.1.3 \ + r-matrix \ + r-mass \ + r-nmf \ + r-rcpp \ + r-data.table \ + r-reshape2 \ + r-plyr \ + r-stringr \ + r-ggplot2 \ + r-rcolorbrewer \ + r-circlize \ + r-cowplot \ + r-viridis \ + r-gridextra \ + r-ggpubr \ + r-cluster \ + r-matrixtests \ + r-doparallel \ + r-foreach \ + r-optparse \ + r-argparse \ + r-config \ + r-colorspace \ + r-rjson \ + bioconductor-biobase \ + bioconductor-complexheatmap \ + bioconductor-genomicranges \ + -y && \ + conda clean -afy + +# Clone EcoTyper and setup directory structure +RUN git clone --depth 1 https://github.com/digitalcytometry/ecotyper.git /ecotyper && \ + cd /ecotyper && \ + mkdir -p EcoTyper/Carcinoma/Carcinoma_Fractions/Analysis/rank_selection && \ + mkdir -p EcoTyper/Lymphoma/Lymphoma_Fractions/Analysis/rank_selection && \ + mkdir -p EcoTyper/Carcinoma/Carcinoma_Fractions/Cell_States/recovery && \ + mkdir -p EcoTyper/Carcinoma/Carcinoma_Fractions/Ecotypes/recovery && \ + mkdir -p EcoTyper/Lymphoma/Lymphoma_Fractions/Cell_States/recovery && \ + mkdir -p EcoTyper/Lymphoma/Lymphoma_Fractions/Ecotypes/recovery && \ + chmod -R +x pipeline/*.R && \ + chmod -R +x *.R + +# Setup EcoTyper permissions +RUN cd /ecotyper && \ + ln -sf /ecotyper/pipeline pipeline && \ + chmod -R 755 /ecotyper/EcoTyper && \ + chmod -R 755 /ecotyper/pipeline + +# Setup CIBERSORTx +RUN mkdir -p /src +WORKDIR /src + +# Copy CIBERSORTx files +COPY ./CIBERSORTx-fractionshires/ /src/ + +# Install R dependencies for CIBERSORTx +RUN R -e "if (!require('BiocManager', quietly=TRUE)) 
install.packages('BiocManager', repos='https://cloud.r-project.org/')" && \ + Rscript /src/install_R_dependencies.R + +# Add src to PATH +ENV PATH="/src:${PATH}" + +# Setup final environment +RUN echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ + echo "conda activate ecotyper" >> ~/.bashrc + +#Add pandas +RUN conda install -n ecotyper anaconda::pandas + +# Set default command +CMD ["/bin/bash"] diff --git a/Dockerfile_corto b/Dockerfile_corto new file mode 100644 index 0000000..212e875 --- /dev/null +++ b/Dockerfile_corto @@ -0,0 +1,49 @@ +FROM r-base:latest + +WORKDIR /usr/src/app + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + git \ + libcurl4-gnutls-dev \ + libssl-dev \ + libxml2-dev \ + procps # Adding procps for the 'ps' command + +RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh \ + && bash miniconda.sh -b -p /opt/conda \ + && rm miniconda.sh \ + && ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh \ + && echo ". 
/opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc \ + && echo "conda activate base" >> ~/.bashrc \ + && find /opt/conda/ -follow -type f -name '*.a' -delete \ + && find /opt/conda/ -follow -type f -name '*.js.map' -delete \ + && /opt/conda/bin/conda clean -afy + +ENV PATH /opt/conda/bin:$PATH +RUN conda update -y -n base -c defaults conda + +# main conda env (rpy2) +RUN conda create -n rpy2 python=3.9 +ENV PATH "$PATH:/opt/conda/envs/rpy2/bin" +RUN echo "source activate rpy2" >> ~/.bashrc +RUN conda clean --all -f -y + +RUN conda install -y -n rpy2 -c anaconda pandas +RUN conda install -y -n rpy2 -c anaconda numpy + +# Install R packages +RUN Rscript -e "install.packages('remotes', dependencies=TRUE)" \ + && Rscript -e "remotes::install_github('federicogiorgi/corto')" \ + && Rscript -e "install.packages('data.table', dependencies=TRUE)" \ + && Rscript -e "install.packages('BiocManager')" \ + && Rscript -e "BiocManager::install('DESeq2')" + +# Test commands to validate installation +RUN Rscript -e "library(corto)" + +# Clone the corto repository (optional, based on your needs) +RUN git clone https://github.com/federicogiorgi/corto.git + +CMD ["/bin/bash"] + diff --git a/Dockerfile_rna2proteinexpression b/Dockerfile_rna2proteinexpression new file mode 100644 index 0000000..4134e6e --- /dev/null +++ b/Dockerfile_rna2proteinexpression @@ -0,0 +1,79 @@ +ARG CUDA=11.7 +FROM nvidia/cuda:${CUDA}.1-cudnn8-devel-ubuntu22.04 + +USER root +SHELL ["/bin/bash", "-c"] + +WORKDIR /home +RUN mkdir -p /home/omic +WORKDIR /home/omic + +ARG DEBIAN_FRONTEND=noninteractive +RUN apt-get update -y && apt-get install -y --no-install-recommends \ + build-essential \ + cmake \ + curl \ + git \ + wget \ + ca-certificates \ + hmmer \ + kalign \ + tzdata \ + python3-pip \ + python3-dev \ + python3-opencv \ + libglib2.0-0 \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Add the NVIDIA GPG key directly +RUN wget 
https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb +RUN dpkg -i cuda-keyring_1.0-1_all.deb && rm cuda-keyring_1.0-1_all.deb +RUN apt-get -y update && \ + apt-get install -y --no-install-recommends cuda-command-line-tools-11-7 + +RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh \ + && bash miniconda.sh -b -p /opt/conda \ + && rm miniconda.sh \ + && ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh \ + && echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc \ + && echo "conda activate base" >> ~/.bashrc \ + && find /opt/conda/ -follow -type f -name '*.a' -delete \ + && find /opt/conda/ -follow -type f -name '*.js.map' -delete \ + && /opt/conda/bin/conda clean -afy + +ENV PATH /opt/conda/bin:$PATH +RUN conda update -y -n base -c defaults conda + +# main conda env (rna2protexpresson) +RUN conda create -n rna2protexpresson python=3.9 +ENV PATH "$PATH:/opt/conda/envs/rna2protexpresson/bin" +RUN echo "source activate rna2protexpresson" >> ~/.bashrc +RUN conda clean --all -f -y + +# Refresh apt package index and CA certificates +RUN apt-get -y update && apt-get install -y ca-certificates && update-ca-certificates + +# Upgrade pip +RUN python3 -m pip install --upgrade pip + +# Install PyTorch and torchvision +RUN /opt/conda/envs/rna2protexpresson/bin/python3 -m pip install torch torchvision torchaudio -f https://download.pytorch.org/whl/cu111/torch_stable.html +#Install packages +RUN /opt/conda/envs/rna2protexpresson/bin/python3 -m pip install axial_attention thefuzz +RUN conda install -n rna2protexpresson -c anaconda pandas numpy scikit-learn joblib +RUN /opt/conda/envs/rna2protexpresson/bin/python3 -m pip install argparse + +# make rna2protexpresson +RUN mkdir -p /home/omic/rna2protexpresson +WORKDIR /home/omic/rna2protexpresson + +# Package into python script for running in nextflow +COPY rna2protexpression.py /home/omic/rna2protexpression/rna2protexpression.py +RUN chmod +x 
/home/omic/rna2protexpression/rna2protexpression.py + +#COPY model & preprocessing +COPY tissue2number.joblib . +COPY ensg2number.joblib . +COPY go_df_work.csv . +COPY go_term_protein_expression_model.pth . diff --git a/Dockerfile_synthea b/Dockerfile_synthea new file mode 100644 index 0000000..8ac6295 --- /dev/null +++ b/Dockerfile_synthea @@ -0,0 +1,64 @@ +FROM ubuntu:latest + +ARG DEBIAN_FRONTEND=noninteractive +USER root + +# Install gradle and JDK +RUN apt -y update && apt-get -y update +RUN apt install -y build-essential openjdk-8-jdk wget git python3 python3-pip tabix + +# install conda +RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh && \ + mkdir -p /opt && \ + sh miniconda.sh -b -p /opt/conda && \ + rm miniconda.sh && \ + ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ + echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ + echo "conda activate base" >> ~/.bashrc && \ + find /opt/conda/ -follow -type f -name '*.a' -delete && \ + find /opt/conda/ -follow -type f -name '*.js.map' -delete && \ + /opt/conda/bin/conda clean -afy + +ENV PATH /opt/conda/bin:$PATH + +# main conda env (synthea) +RUN conda create -n synthea +ENV PATH "$PATH:/opt/conda/envs/synthea/bin" +RUN echo "source activate synthea" >> ~/.bashrc + +# Install pandas and numpy +RUN python3 -m pip install --upgrade pip +RUN python3 -m pip install pandas numpy fuzzywuzzy python-Levenshtein requests + +# Install synthea from latest release +RUN wget https://github.com/synthetichealth/synthea/archive/refs/tags/master-branch-latest.tar.gz +RUN tar xzf master-branch-latest.tar.gz && mv synthea-master-branch-latest synthea +WORKDIR /synthea + +ARG PATH $PATH:/synthea +ENV PATH $PATH:/synthea + +RUN conda config --add channels bioconda +RUN conda config --add channels conda-forge + +# Install samtools +RUN conda install -n synthea -c bioconda samtools + +# Install UCSC Liftover +RUN conda install -n synthea -c bioconda 
ucsc-liftover +RUN conda install -n synthea -c bioconda gatk4 +RUN conda install -n synthea -c bioconda bedtools +RUN apt-get -y install vcftools + +# DNA to Protein Sequence code +WORKDIR /synthea +RUN git clone https://github.com/prestevez/dna2proteins.git +RUN mv dna2proteins/dna2proteins.py . +RUN rm -r dna2proteins +RUN apt install -y 2to3 +RUN 2to3 -w dna2proteins.py + +#Test +# RUN ./gradlew build check test +RUN ./run_synthea -p 10 --exporter.csv.export true +# use -a for age range e.g. 30-40 and -g for gender M/F diff --git a/Dockerfile_vcf2prot b/Dockerfile_vcf2prot new file mode 100644 index 0000000..ac91d7c --- /dev/null +++ b/Dockerfile_vcf2prot @@ -0,0 +1,96 @@ +#FROM nvidia/cuda:12.2.2-cudnn8-runtime-ubuntu22.04 +FROM rust:1.61 + +USER root + +SHELL ["/bin/bash", "-c"] +#SHELL [ "/bin/sh", "-s", ".", "$HOME/.cargo/env" ] +WORKDIR /home +RUN mkdir -p /home/omic +WORKDIR /home/omic +ARG DEBIAN_FRONTEND=noninteractive + +RUN apt-get update -y && apt-get install -y software-properties-common --no-install-recommends \ + build-essential \ + cmake \ + curl \ + git \ + wget \ + ca-certificates \ + libopenblas-dev \ + libgsl-dev \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Get Rust +#RUN curl https://sh.rustup.rs -sSf | bash -s -- --default-toolchain nightly -y +#RUN source $HOME/.cargo/env +#RUN echo 'source $HOME/.cargo/env' >> $HOME/.bashrc + +# Install vcf2prot +RUN git clone https://github.com/ikmb/vcf2prot.git +WORKDIR /home/omic/vcf2prot + +RUN cargo build --release + +WORKDIR /home/omic/vcf2prot/target/release + +RUN export DEBUG_CPU_EXEC=TRUE +RUN export INSPECT_TXP=TRUE +RUN export INSPECT_INS_GEN=TRUE + +#Test +RUN ./vcf2prot -h + +WORKDIR /home/omic/vcf2prot +RUN gunzip examples/reference_sequences.fasta.gz + +COPY reference.fasta . 
+ +#install bcftools +WORKDIR /home/omic/ + +# install conda +RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh && \ + mkdir -p /opt && \ + bash miniconda.sh -b -p /opt/conda && \ + rm miniconda.sh && \ + ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ + echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ + echo "conda activate base" >> ~/.bashrc && \ + find /opt/conda/ -follow -type f -name '*.a' -delete && \ + find /opt/conda/ -follow -type f -name '*.js.map' -delete && \ + /opt/conda/bin/conda clean -afy + +ENV PATH /opt/conda/bin:$PATH + +## update conda +RUN conda update -y -n base -c defaults conda + +# main conda env (vcf2prot) +RUN conda create -n vcf2prot python=3.9 +ENV PATH "$PATH:/opt/conda/envs/vcf2prot/bin" +RUN echo "source activate vcf2prot" >> ~/.bashrc + +RUN conda install -y -n vcf2prot -c bioconda openssl +RUN conda install -y -n vcf2prot -c conda-forge libgcc-ng +RUN conda install -y -n vcf2prot -c bioconda bcftools==1.20 + +#Install beagle +#RUN wget https://faculty.washington.edu/browning/beagle/beagle.28Jun21.220.jar + +#Get data +RUN wget https://ftp.ensembl.org/pub/release-112/gff3/homo_sapiens/Homo_sapiens.GRCh38.112.gff3.gz +RUN gunzip Homo_sapiens.GRCh38.112.gff3.gz +#RUN wget http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz +#RUN gunzip hg38.fa.gz +RUN wget https://ftp.ensembl.org/pub/release-112/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.toplevel.fa.gz +RUN gunzip Homo_sapiens.GRCh38.dna.toplevel.fa.gz + +WORKDIR /home/omic/vcf2prot + + +RUN conda install -y -n vcf2prot numpy pandas + + +COPY MANE_transcipts_reference.fasta . 
diff --git a/Dockerfile_vep b/Dockerfile_vep new file mode 100644 index 0000000..dfc402d --- /dev/null +++ b/Dockerfile_vep @@ -0,0 +1,2 @@ +FROM ensemblorg/ensembl-vep:release_115.2 + diff --git a/Download_write_healthy_m_f_txt_file.ipynb b/Download_write_healthy_m_f_txt_file.ipynb new file mode 100644 index 0000000..1866a4d --- /dev/null +++ b/Download_write_healthy_m_f_txt_file.ipynb @@ -0,0 +1,302 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 10, + "id": "e2d83e35-e69f-456e-a2fb-77a05b42f43a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import io\n", + "import os\n", + "import pandas as pd\n", + "import gzip\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "900349f6-d73f-4584-b949-42c1f770d696", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "#create names of all chromosome\n", + "chr_list = [str(i+1) for i in range(22)]\n", + "chr_list.extend(['X','Y'])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "0bca1990-db14-4f99-a57a-1132515fa21f", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2024-11-04 13:33:40-- https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr15.vcf.bgz\n", + "Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.211.251, 142.251.33.91, 142.250.217.91, ...\n", + "Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.211.251|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 16904431253 (16G) [application/octet-stream]\n", + "Saving to: ‘gnomad.genomes.v4.1.sites.chr15.vcf.bgz.1’\n", + "\n", + "gnomad.genomes.v4.1 100%[===================>] 15.74G 78.2MB/s in 4m 34s \n", + "\n", + "2024-11-04 13:38:15 (58.9 MB/s) - ‘gnomad.genomes.v4.1.sites.chr15.vcf.bgz.1’ saved [16904431253/16904431253]\n", + "\n", + "--2024-11-04 13:38:15-- https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr16.vcf.bgz\n", + "Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.217.123, 142.251.215.251, 172.217.14.251, ...\n", + "Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.217.123|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 18918703003 (18G) [application/octet-stream]\n", + "Saving to: ‘gnomad.genomes.v4.1.sites.chr16.vcf.bgz’\n", + "\n", + "gnomad.genomes.v4.1 100%[===================>] 17.62G 77.1MB/s in 4m 7s \n", + "\n", + "2024-11-04 13:42:23 (73.1 MB/s) - ‘gnomad.genomes.v4.1.sites.chr16.vcf.bgz’ saved [18918703003/18918703003]\n", + "\n", + "--2024-11-04 13:42:23-- https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr17.vcf.bgz\n", + "Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.215.251, 172.217.14.251, 142.250.69.219, ...\n", + "Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.215.251|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 17412317144 (16G) [application/octet-stream]\n", + "Saving to: ‘gnomad.genomes.v4.1.sites.chr17.vcf.bgz’\n", + "\n", + "gnomad.genomes.v4.1 100%[===================>] 16.22G 67.9MB/s in 4m 1s \n", + "\n", + "2024-11-04 13:46:24 (68.9 MB/s) - ‘gnomad.genomes.v4.1.sites.chr17.vcf.bgz’ saved [17412317144/17412317144]\n", + "\n", + "--2024-11-04 13:46:25-- https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr18.vcf.bgz\n", + "Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.33.123, 142.251.211.251, 142.251.33.91, ...\n", + "Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.33.123|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 14430177524 (13G) [application/octet-stream]\n", + "Saving to: ‘gnomad.genomes.v4.1.sites.chr18.vcf.bgz’\n", + "\n", + "gnomad.genomes.v4.1 100%[===================>] 13.44G 72.8MB/s in 3m 19s \n", + "\n", + "2024-11-04 13:49:45 (69.0 MB/s) - ‘gnomad.genomes.v4.1.sites.chr18.vcf.bgz’ saved [14430177524/14430177524]\n", + "\n", + "--2024-11-04 13:49:45-- https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr19.vcf.bgz\n", + "Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.69.219, 142.251.33.123, 142.251.211.251, ...\n", + "Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.69.219|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 13679473477 (13G) [application/octet-stream]\n", + "Saving to: ‘gnomad.genomes.v4.1.sites.chr19.vcf.bgz’\n", + "\n", + "gnomad.genomes.v4.1 100%[===================>] 12.74G 74.2MB/s in 3m 1s \n", + "\n", + "2024-11-04 13:52:46 (72.0 MB/s) - ‘gnomad.genomes.v4.1.sites.chr19.vcf.bgz’ saved [13679473477/13679473477]\n", + "\n", + "--2024-11-04 13:52:47-- https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr20.vcf.bgz\n", + "Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.211.251, 142.251.33.91, 142.250.217.91, ...\n", + "Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.211.251|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 11838569571 (11G) [application/octet-stream]\n", + "Saving to: ‘gnomad.genomes.v4.1.sites.chr20.vcf.bgz’\n", + "\n", + "gnomad.genomes.v4.1 100%[===================>] 11.03G 79.9MB/s in 2m 38s \n", + "\n", + "2024-11-04 13:55:26 (71.2 MB/s) - ‘gnomad.genomes.v4.1.sites.chr20.vcf.bgz’ saved [11838569571/11838569571]\n", + "\n", + "--2024-11-04 13:55:26-- https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr21.vcf.bgz\n", + "Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.211.251, 142.251.33.91, 142.250.217.91, ...\n", + "Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.211.251|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 7758981978 (7.2G) [application/octet-stream]\n", + "Saving to: ‘gnomad.genomes.v4.1.sites.chr21.vcf.bgz.1’\n", + "\n", + "gnomad.genomes.v4.1 100%[===================>] 7.23G 72.4MB/s in 1m 50s \n", + "\n", + "2024-11-04 13:57:16 (67.2 MB/s) - ‘gnomad.genomes.v4.1.sites.chr21.vcf.bgz.1’ saved [7758981978/7758981978]\n", + "\n", + "--2024-11-04 13:57:16-- https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr22.vcf.bgz\n", + "Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.215.251, 172.217.14.219, 142.250.69.219, ...\n", + "Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.215.251|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 8731339280 (8.1G) [application/octet-stream]\n", + "Saving to: ‘gnomad.genomes.v4.1.sites.chr22.vcf.bgz’\n", + "\n", + "gnomad.genomes.v4.1 100%[===================>] 8.13G 81.2MB/s in 2m 0s \n", + "\n", + "2024-11-04 13:59:16 (69.7 MB/s) - ‘gnomad.genomes.v4.1.sites.chr22.vcf.bgz’ saved [8731339280/8731339280]\n", + "\n", + "--2024-11-04 13:59:17-- https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chrX.vcf.bgz\n", + "Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.33.123, 142.251.211.251, 142.251.33.91, ...\n", + "Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.33.123|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 22908335319 (21G) [application/octet-stream]\n", + "Saving to: ‘gnomad.genomes.v4.1.sites.chrX.vcf.bgz’\n", + "\n", + "gnomad.genomes.v4.1 100%[===================>] 21.33G 76.7MB/s in 5m 6s \n", + "\n", + "2024-11-04 14:04:23 (71.5 MB/s) - ‘gnomad.genomes.v4.1.sites.chrX.vcf.bgz’ saved [22908335319/22908335319]\n", + "\n", + "--2024-11-04 14:04:23-- https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chrY.vcf.bgz\n", + "Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.14.219, 142.250.69.219, 142.251.33.123, ...\n", + "Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.14.219|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 599108494 (571M) [application/octet-stream]\n", + "Saving to: ‘gnomad.genomes.v4.1.sites.chrY.vcf.bgz’\n", + "\n", + "gnomad.genomes.v4.1 100%[===================>] 571.35M 78.3MB/s in 8.9s \n", + "\n", + "2024-11-04 14:04:32 (64.4 MB/s) - ‘gnomad.genomes.v4.1.sites.chrY.vcf.bgz’ saved [599108494/599108494]\n", + "\n" + ] + } + ], + "source": [ + "#download gnomad vcf v4\n", + "for i in chr_list: \n", + " !wget https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr{i}.vcf.bgz\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "68ba24a1-d60d-4097-8827-7599a1cc705d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "#rewrite files to only extract alleles with frequency above freq_filter\n", + "\n", + "def rewrite(chr_name, freq_filter):\n", + " file_neme = f'gnomad.genomes.v4.1.sites.chr{chr_name}.vcf.bgz'\n", + " file_male_name=f'gnomad.genomes.v4.1.sites.chr{chr_name}.male.txt'\n", + " file_female_name=f'gnomad.genomes.v4.1.sites.chr{chr_name}.female.txt'\n", + " \n", + " #delete old male female files if they exist\n", + " try:\n", + " os.remove(file_male_name)\n", + " except OSError:\n", + " pass\n", + 
" try:\n", + " os.remove(file_female_name)\n", + " except OSError:\n", + " pass\n", + "\n", + " with open(file_male_name, 'a') as m_file, open(file_female_name, 'a') as f_file:\n", + " m_file.write('#CHROM,POS,ID,REF,ALT,QUAL,AF_XY\\n')\n", + " f_file.write('#CHROM,POS,ID,REF,ALT,QUAL,AF_XX\\n')\n", + " with gzip.open(file_neme, 'r') as f:\n", + " for l in f:\n", + " if not str(l)[2:3]=='#':\n", + " vcf_pos = str(l)[2:-1]\n", + " pos = vcf_pos.split('\\\\t')\n", + " info = pos[-1].split(';')\n", + " pos = pos[:6]\n", + " af_XX = list(np.array(info)[[i[:5]=='AF_XX' for i in info]])\n", + " af_XY = list(np.array(info)[[i[:5]=='AF_XY' for i in info]])\n", + " #some positions don't have allele frequency\n", + " if len(af_XX)==1:\n", + " assert af_XX[0][:5]=='AF_XX'\n", + " af_XX = af_XX[0].split('=')[1] \n", + " if float(af_XX)>=freq_filter:\n", + " f_file.write((',').join(pos)+','+af_XX+'\\n')\n", + " if len(af_XY)==1:\n", + " assert af_XY[0][:5]=='AF_XY'\n", + " af_XY = af_XY[0].split('=')[1] \n", + " if float(af_XY)>=freq_filter:\n", + " m_file.write((',').join(pos)+','+af_XY+'\\n')\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2fdac0b3-f8f3-4540-b050-408672fee807", + "metadata": {}, + "outputs": [], + "source": [ + "freq_filter = 0.005\n", + "[rewrite(i, freq_filter) for i in chr_list]" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "fb349e47-38d4-4fb2-af02-9a522c64b3fc", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "#concat all chr to one file\n", + "def concat_all_chr_to_m_f(chr_list):\n", + " file_male_name = f'gnomad.genomes.v4.1.sites.male.txt'\n", + " file_female_name = f'gnomad.genomes.v4.1.sites.female.txt'\n", + " \n", + " #delete old male female files if they exist\n", + " try:\n", + " os.remove(file_male_name)\n", + " except OSError:\n", + " pass\n", + " try:\n", + " os.remove(file_female_name)\n", + " except OSError:\n", + " pass\n", + " \n", + " with open(file_male_name, 
'a') as m_file, open(file_female_name, 'a') as f_file:\n", + " m_file.write('#CHROM,POS,ID,REF,ALT,QUAL,AF_XY\\n')\n", + " f_file.write('#CHROM,POS,ID,REF,ALT,QUAL,AF_XX\\n')\n", + " for chr_n in chr_list:\n", + " file_male=f'gnomad.genomes.v4.1.sites.chr{chr_n}.male.txt'\n", + " with open(file_male) as infile:\n", + " next(infile)\n", + " for line in infile:\n", + " m_file.write(line)\n", + " if chr_n != 'Y':\n", + " file_female=f'gnomad.genomes.v4.1.sites.chr{chr_n}.female.txt'\n", + " with open(file_female) as infile:\n", + " next(infile)\n", + " for line in infile:\n", + " f_file.write(line)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e170368a-9738-49af-9898-ac85bbe72385", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "concat_all_chr_to_m_f(chr_list)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Notes.txt b/Notes.txt new file mode 100644 index 0000000..2f8aac6 --- /dev/null +++ b/Notes.txt @@ -0,0 +1,7 @@ +add rna splice step + +fix borzoi - mutation step (main_borzoi;PREDICT_EXPRESSION;create_mut()) only POINT MUTATIONS add INDEL MUTATIONS + +Borzoi can have alternative chromosomes and chrUn_* as well, VCF2PROT CAN NOT have them. This will cause discrepancies in the code. 
+ +Name output files from borzoi ==> need to have patients name/id diff --git a/README.md b/README.md new file mode 100644 index 0000000..d3970c1 --- /dev/null +++ b/README.md @@ -0,0 +1,210 @@ +# Digital Patient and Drug Response Pipeline - Comprehensive Implementation Plan + +## Pipeline Overview +```mermaid +flowchart TB + subgraph Patient["Patient Profile Generation"] + A1[1. Medical Records Creation] --> A2[2. Disease-specific Genome] + A2 --> A3[3. Disease-specific Protein Variants] + A3 --> A4[4. Disease-specific Transcriptome] + A4 --> A5[5. Disease-specific Proteome] + A5 --> A6[6. Disease-specific Metabolome] + A6 --> A7[7. Disease-specific Immunome] + end + + subgraph Drug["Drug Analysis & Modeling"] + B1[8. Drug-Target PK] --> B1a[8a. Binding Site Prediction] + B1a --> B1b[8b. Drug-Target Docking] + B1b --> B2[9. Drug-Proteome Screening] + B2 --> B3[10. Off-target Analysis] + B3 --> B4[11. Drug-Compound Screening] + B4 --> B5[12. Drug-Genome Sensitivity] + end + + subgraph Response["Response Prediction"] + C1[13. Transcriptomic Changes] --> C2[14. Disease Stage Evaluation] + C2 --> C3[15-16. Proteomic & Metabolomic Changes] + C3 --> C4[17-19. Biological & Immune Response] + C4 --> C5[20-21. ADMET & Toxicity] + end + + Patient --> Drug + Drug --> Response +``` + +## Part 1: Digital Patient Generation + +### Implementation Status Overview + +| Step | Status | Tool/Method | Input | Output | Location | Validation Data | Dependencies/Notes | +|------|--------|-------------|--------|---------|-----------|-----------------|-------------------| +| 1. Medical Records | ✓ | Synthea | - | Demographics, records | /Workspace/next/registry/tools/synthea | Target: 1000 patients/disease | - | +| 2. Disease Genome | ✓ | Omic-UKBB | Alleles, positions, frequencies | VCF (hg38) | Part of Synthea repo | - | Only storing variants | +| 3. Protein Variants | ✓ | vcf2prot | VCF | Protein fasta | Part of Synthea repo (tbc) | - | Multi-tissue support needed | +| 4. 
Transcriptome | 🚧 | borzoi | Genomic sequences | RNAseq (TPM) | - | ENCODE, GTEx (E-MTAB-6814) | NIH ENCODE standards | +| 5. Proteome | 🚧 | clei2block | RNAseq (log2-FC) | Fold-change | github.com/stasaki/clei2block | CellModelPassport, TCGA | Requires GTEx training | +| 6. Metabolome | ⏳ | corto | RNAseq (TPM) | Metabolite profiles | github.com/federicogiorgi/corto | CCLE, NCI-60 | - | +| 7. Immunome | ⏳ | Ecotyper | RNAseq | Cell type profiles | - | SPICA30, SPICA17 | - | + +### Active Implementation Tasks + +#### Transcriptome Generation +Current goal: Establish accurate transcriptome prediction pipeline +- [x] Implement and evaluate primary models: + * ~~enformer~~ + * ~~basenji~~ + * borzoi ~~for RNAseq profiles~~, built on basenji and enformer +- [x] Add SequenceModelBenchmark ridge regression - built into borzoi (tbc) +- [ ] Validate against ENCODE standards +- [ ] Implement GTEx validation pipeline + +#### Multi-omic Integration +Current goal: Create robust data transformation pipeline +- [ ] Proteome prediction (clei2block): + * Implement GTEx training pipeline + * Add multi-tissue support + * Create validation framework against CellModelPassport +- [ ] Metabolome generation (corto): + * Setup CCLE data integration + * Implement NCI-60 validation +- [ ] Immunome profiling: + * ~~Evaluate Ecotyper vs CIBERSORTx~~ CIBERSORTx incorporated within Ecotyper + * Integrate SPICA datasets + * Setup immune cell validation pipeline + +## Part 2: Drug Discovery and Response + +### Drug Development Tools +Current goal: Establish comprehensive drug analysis pipeline +- [ ] Molecule Processing: + * SELFIES library for biologics/peptides conversion + * Implement molecule validation checks + * Setup standardization pipeline +- [ ] Structure Analysis: + * DreamDock + ConPlex score pipeline + * LightDock for membrane binding + * Validation framework with crystal structures + +### Binding Site Prediction +Current goal: Create consensus model for binding site prediction +- 
[ ] Benchmark tools: + * DiffDock implementation and testing + * Qvina2 evaluation + * P2Rank integration + * FPocket analysis +- [ ] Specific considerations: + * Allosteric site detection + * Multiple binding site handling + * Protein flexibility modeling +- [ ] Validation: + * BindingDB integration + * Crystal structure comparison pipeline + * Edge case testing suite + +### Drug-Target Analysis +Current goal: Robust docking and interaction prediction +- [ ] Primary docking pipeline: + * Uni-mol integration + * DreamDock implementation + * Path4Drug integration for pathways +- [ ] Molecule type-specific handling: + * Small molecule pipeline + * Biologics pathway + * PROTACs specific analysis + * Prodrug processing +- [ ] Interaction analysis: + * Agonist vs antagonist classification + * Protein-protein interaction integration + * Chemical_checker for bioactivity signatures + +### Chemical Property Prediction +Current goal: Comprehensive property prediction system +- [ ] Model implementation: + * Chemprop evaluation + * Soltrannet integration + * Custom ADMET model development +- [ ] Property coverage: + * Solubility prediction + * BBB penetration + * Chemical stability + * Metabolic processing + +### Toxicity Prediction Pipeline +Current goal: Multi-faceted toxicity assessment system +- [ ] Core modules: + * Cardiotoxicity (ion channel) prediction + * Hepatotoxicity (Phase 1/2 proteins) + * Nephrotoxicity assessment + * Lung toxicity prediction + * Neurotoxicity (BBB criteria) + * Inflammatory response modeling + * Bleeding/clotting risk analysis +- [ ] Integration components: + * Human Protein Atlas tissue proportion estimation + * Reactome pathway analysis + * Industry model benchmarking + +### Drug Response Analysis +Current goal: Integrated response prediction system +- [ ] Transcriptomic response: + * LINCS data integration + * Expression change prediction + * Tissue-specific effects +- [ ] Multi-omic response: + * Proteomic change modeling + * Metabolomic 
adjustment prediction + * Immune response profiling +- [ ] Special cases: + * Multi-drug combinations + * Time-dependent effects + * Population-specific responses + +## Critical Dependencies & Requirements + +| Category | Component | Status | Notes | +|----------|-----------|---------|--------| +| **External Data** | BindingDB | ✓ Available | Binding affinities | +| | LINCS | ✓ Available | Compound effects | +| | PharmGKB | ⏳ Pending | Variant annotations | +| | Human Cell Atlas | ⏳ Pending | Tissue-specific data | +| **Compute** | GPU Cluster | 🚧 Scaling | For enformer/basenji | +| | Storage | ✓ Configured | For variant data | +| | Distribution | ⏳ Planned | For processing | + +## Validation Framework + +| Dataset | Usage | Status | Notes | +|---------|--------|---------|--------| +| ENCODE | Transcriptomics | ✓ Ready | Primary validation | +| GTEx | Tissue-specific | ✓ Ready | E-MTAB-6814 | +| CCLE/GDSC2 | Cell lines | 🚧 In Progress | Cancer validation | +| TDC | ADMET | ⏳ Planned | Benchmark data | +| Cross-species | Conservation | ⏳ Planned | Evolutionary validation | +| Time-series | Metabolics | ⏳ Planned | Kinetic validation | + +## Edge Cases & Special Considerations + +### Complex Scenarios +| Scenario | Implementation Status | Handling Strategy | +|----------|---------------------|-------------------| +| Rare variants | 🚧 In Progress | Population frequency weighting | +| Multi-drug combinations | ⏳ Planned | Interaction matrix modeling | +| Time-dependent effects | ⏳ Planned | PK/PD time series modeling | +| Population specificity | 🚧 In Progress | Demographic stratification | + +### Special Drug Classes +| Class | Special Requirements | Status | +|-------|---------------------|---------| +| Biologics | Membrane modeling, immunogenicity | ⏳ Planned | +| Prodrugs | Metabolite prediction, activation | 🚧 In Progress | +| Combination therapy | Interaction prediction, timing | ⏳ Planned | +| PROTACs | Protein degradation modeling | ⏳ Planned | + +## Case 
Studies & Validation Examples + +| Drug | Outcome | Learning Points | Implementation Status | +|------|---------|----------------|----------------------| +| Amcenestrant | Efficacy failure | Target validation importance | ✓ Integrated | +| Flupirtine | Liver toxicity | Metabolite prediction crucial | 🚧 In Progress | +| Ranitidine | NDMA formation | Chemical stability prediction | ⏳ Planned | +| Multi-drug Examples | Variable | Interaction modeling needed | ⏳ Planned | \ No newline at end of file diff --git a/app_filter/README.md b/app_filter/README.md new file mode 100644 index 0000000..2689848 --- /dev/null +++ b/app_filter/README.md @@ -0,0 +1,5 @@ +Run command + +``` +vep --config vep.ini -i ${input_vcf} -o ${input_vcf.simpleName}_cosmic.vcf +``` \ No newline at end of file diff --git a/app_filter/filter_transcriptome.py b/app_filter/filter_transcriptome.py new file mode 100644 index 0000000..1922537 --- /dev/null +++ b/app_filter/filter_transcriptome.py @@ -0,0 +1,56 @@ +from argparse import ArgumentParser +import pandas as pd +import numpy as np +from pathlib import Path + +def make_argparser(): + parser = ArgumentParser( + description="Filter Transcriptome using mutated and non-mutated TPM values, and MANE dataset" + ) + parser.add_argument( + "--mutated", + type=Path, + required=True, + help="The .csv file with mutated TPM." 
+ ) + parser.add_argument( + "--non-mutated", + type=Path, + required=True, + help="The .csv file with base (non-mutated) TPM" + ) + parser.add_argument( + "--mane-data", + type=Path, + required=True, + help="The .tsv file with MANE data" + ) + return parser + +def main(mutated : Path, non_mutated : Path, mane_data : Path): + + name = mutated.stem + + df_mut = pd.read_csv(mutated) + df_nomut = pd.read_csv(non_mutated) + + df_mane = pd.read_csv(mane_data, sep="\t") + df_mane["ENSG"] = [i.split('.')[0] for i in df_mane["Ensembl_Gene"]] + df_mane_map = df_mane[['symbol','ENSG']] + + df_nomut_mapped = df_mane_map.merge(df_nomut, left_on='symbol', right_on='Unnamed: 0').dropna().drop_duplicates(subset=['symbol']).drop(["symbol", "Unnamed: 0"],axis=1) + + df_merge = df_nomut_mapped.merge(df_mut, on="ENSG", suffixes=("_nomut","_mut")) + + cols = df_mut.columns[1:] + + df_res = pd.DataFrame({"ENSG": df_merge["ENSG"]} | {col:np.log2(df_merge[col + "_mut"] / df_merge[col + "_nomut"]) for col in cols}) + + df_res.to_csv(f"{name}_log2fc.csv", index=None) + + +if __name__ == "__main__": + parser = make_argparser() + args = parser.parse_args() + + main(mutated=args.mutated, non_mutated=args.non_mutated, mane_data=args.mane_data) diff --git a/app_filter/vep.ini b/app_filter/vep.ini new file mode 100644 index 0000000..488c51b --- /dev/null +++ b/app_filter/vep.ini @@ -0,0 +1,32 @@ +cache 1 +dir_cache /data +assembly 'GRCh38' +offline 1 +force_overwrite 1 +custom file=/opt/vep/.vep/Cosmic_GenomeScreensMutant_v103_GRCh38.vcf.gz,short_name=COSMIC,format=vcf,type=exact,fields=LEGACY_ID%CNT + +# Functional impact predictions +sift b # CALCULATE 1 - score +polyphen b # THESE TWO CAN BE AVERAGED + +plugin REVEL,/opt/vep/.vep/REVEL/new_tabbed_revel_grch38.tsv.gz +# TODO: CALCUALTE THEIR L2 NORM +# plugin CADD,/opt/vep/.vep/CADD/GRCh38/whole_genome_SNVs.tsv.gz,/opt/vep/.vep/CADD/GRCh38/InDels.tsv.gz + + +# Clinical significance +check_existing +clinvar + +# Allele frequencies +af +af_1kg 
+af_gnomade +af_gnomadg +max_af + +# Additional useful flags +numbers +hgvs +symbol +canonical \ No newline at end of file diff --git a/create_LM22_sourceGEP_ref_file.py b/create_LM22_sourceGEP_ref_file.py new file mode 100644 index 0000000..eb4b065 --- /dev/null +++ b/create_LM22_sourceGEP_ref_file.py @@ -0,0 +1,11 @@ +### Create reference file for borzoi-immun cibersortx step +import pandas as pd + +MANE_data = pd.read_csv("MANE.GRCh38.v1.3.update.tsv", sep = '\t') +LM22_sourceGEP = pd.read_csv("LM22_sourceGEP.txt", sep = '\t') + +MANE_data['ENSG'] = [i.split('.')[0] for i in MANE_data['Ensembl_Gene']] +mane_map = MANE_data[['symbol','ENSG']] +LM22_sourceGEP_ensg = mane_map.merge(LM22_sourceGEP, left_on='symbol', right_on='genesinput').dropna().drop_duplicates(subset=['symbol']) +LM22_sourceGEP_ensg = LM22_sourceGEP_ensg.drop(['symbol', 'genesinput'],axis = 1) +LM22_sourceGEP_ensg.to_csv('LM22_sourceGEP_ensg.txt', sep = '\t', index=False) diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..dc51233 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,48 @@ +services: + borzoi: + build: + context: . + dockerfile: Dockerfile_borzoi + image: harbor.cluster.omic.ai/omic/digital-patients/borzoi:latest + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + vcf2prot: + build: + context: . + dockerfile: Dockerfile_vcf2prot + image: harbor.cluster.omic.ai/omic/digital-patients/vcf2prot:latest + rna2protexpression: + build: + context: . + dockerfile: Dockerfile_rna2protexpression + image: harbor.cluster.omic.ai/omic/digital-patients/rna2protexpression:latest + corto: + build: + context: . + dockerfile: Dockerfile_corto + image: harbor.cluster.omic.ai/omic/digital-patients/corto:latest + ecotyper: + build: + context: . + dockerfile: Dockerfile_ecotyper + image: harbor.cluster.omic.ai/omic/digital-patients/ecotyper:latest + synthea: + build: + context: . 
+ dockerfile: Dockerfile_synthea + image: harbor.cluster.omic.ai/omic/digital-patients/synthea:cudf + vep: + build: + context: . + dockerfile: Dockerfile_vep + image: harbor.cluster.omic.ai/omic/digital-patients/vep:1.0.0 + volumes: + - ./app_filter/sample:/sample + - /data/vep_data:/data + +# export IMAGE_NAME=synthea:cudf && docker tag $IMAGE_NAME harbor.cluster.omic.ai/omic/digital-patients/$IMAGE_NAME && retry_command.sh docker --config ~/.config/bugra push harbor.cluster.omic.ai/omic/digital-patients/$IMAGE_NAME diff --git a/documentaion/README.md b/documentaion/README.md new file mode 100644 index 0000000..b6f184a --- /dev/null +++ b/documentaion/README.md @@ -0,0 +1,1241 @@ +# Digital Patient Pipeline: Complete Documentation + +## Table of Contents + +1. [Introduction](#introduction) +2. [Pipeline Overview](#pipeline-overview) +3. [Biological Background](#biological-background) +4. [Pipeline Components](#pipeline-components) +5. [Workflow Execution](#workflow-execution) +6. [Technical Architecture](#technical-architecture) +7. [Outputs and Applications](#outputs-and-applications) + +--- + +## Introduction + +This document provides an exhaustive explanation of a **Digital Patient Pipeline** - a sophisticated bioinformatics workflow that generates synthetic patient data and predicts multiple layers of molecular biology from genomic variants. The pipeline is implemented using **Nextflow**, a workflow orchestration system, and integrates multiple cutting-edge computational biology tools to simulate how genetic mutations affect gene expression, protein production, immune cell composition, and metabolic activity. 
+ +### Purpose + +The Digital Patient Pipeline serves several critical purposes: + +- **Synthetic Patient Generation**: Creates realistic but synthetic patient profiles with genetic variants associated with specific diseases +- **Multi-Omic Prediction**: Predicts gene expression (RNA), protein abundance, immune cell composition, and metabolic activity from DNA sequence alone +- **Clinical Research**: Enables researchers to study disease mechanisms without accessing sensitive patient data +- **Personalized Medicine**: Models how individual genetic variants affect molecular phenotypes + +--- + +## Pipeline Overview + +```mermaid +flowchart TD + Start([Start Pipeline]) --> Decision{Patient Type?} + Decision -->|Disease| Synthea[Synthea: Generate Disease Patients] + Decision -->|Healthy| Healthy[Synthea: Generate Healthy Patients] + + Synthea --> VCF[VCF Files: Genetic Variants] + Healthy --> VCF + + VCF --> FilterVCF[Filter VCF: Extract Coding Variants] + VCF --> VCF2Prot[VCF2Prot: Generate Mutated Proteins] + + FilterVCF --> Borzoi[Borzoi: Predict RNA Expression] + + Borzoi --> RNA2Prot[RNA2Protein: Predict Protein Expression] + Borzoi --> CORTO[CORTO: Predict Metabolome] + Borzoi --> CIBERSORTx[CIBERSORTx: Predict Immune Cells] + + RNA2Prot --> Output1[Protein Expression Profiles] + CORTO --> Output2[Metabolic Activity Profiles] + CIBERSORTx --> Output3[Immune Cell Composition] + VCF2Prot --> Output4[Mutated Protein Sequences] + + Output1 --> End([Complete Digital Patient]) + Output2 --> End + Output3 --> End + Output4 --> End + + style Start fill:#90EE90 + style End fill:#FFB6C1 + style Borzoi fill:#87CEEB + style Synthea fill:#DDA0DD +``` + +### Pipeline Flow Summary + +1. **Patient Generation**: Synthea generates synthetic patients with realistic genetic variants +2. **Variant Processing**: VCF files containing genetic mutations are filtered and processed +3. **RNA Expression Prediction**: Borzoi predicts how mutations affect gene expression +4. 
**Downstream Analysis**: Multiple tools analyze predicted RNA to generate comprehensive molecular profiles +5. **Integration**: Results are combined to create a complete "digital patient" + +--- + +## Biological Background + +To understand this pipeline, we need to understand the central dogma of molecular biology and how genetic information flows through biological systems. + +### The Central Dogma: DNA → RNA → Protein + +```mermaid +flowchart LR + DNA[DNA: Genetic Code] -->|Transcription| RNA[RNA: Message] + RNA -->|Translation| Protein[Protein: Function] + Protein --> Phenotype[Cellular Phenotype] + + style DNA fill:#FFE4B5 + style RNA fill:#E0FFFF + style Protein fill:#FFE4E1 + style Phenotype fill:#F0E68C +``` + +#### 1. DNA (Deoxyribonucleic Acid) + +**DNA** is the blueprint of life, containing the genetic instructions for all cellular functions. DNA consists of: + +- **Four nucleotide bases**: Adenine (A), Thymine (T), Guanine (G), Cytosine (C) +- **Double helix structure**: Two complementary strands wound together +- **Genes**: Specific segments of DNA that encode instructions for proteins + +**Example DNA Sequence**: `ATGCGATCCGTA` + +#### 2. RNA (Ribonucleic Acid) + +During **transcription**, DNA is copied into RNA: + +- **RNA polymerase** enzyme reads DNA and creates a complementary RNA strand +- RNA uses **Uracil (U)** instead of Thymine (T) +- The RNA carries the genetic message from the nucleus to protein-making machinery + +**Example RNA Sequence**: `AUGCGAUCCGUA` (from DNA above) + +#### 3. 
Proteins + +During **translation**, RNA is decoded to build proteins: + +- **Ribosomes** read RNA in groups of three bases called **codons** +- Each codon specifies one **amino acid** +- Amino acids chain together to form proteins + +**Example**: `AUG` → Methionine, `CGA` → Arginine, `UCC` → Serine, `GUA` → Valine + +**Protein**: Methionine-Arginine-Serine-Valine + +### Gene Structure + +Genes in eukaryotes (organisms with nuclei, like humans) have a complex structure: + +```mermaid +flowchart LR + subgraph Gene Structure + Promoter[Promoter: -1000bp] --> UTR5[5' UTR] + UTR5 --> Exon1[Exon 1] + Exon1 --> Intron1[Intron 1] + Intron1 --> Exon2[Exon 2] + Exon2 --> Intron2[Intron 2] + Intron2 --> Exon3[Exon 3] + Exon3 --> UTR3[3' UTR] + end + + Exon1 -.->|Splicing| mRNA[Mature mRNA] + Exon2 -.-> mRNA + Exon3 -.-> mRNA + + style Exon1 fill:#90EE90 + style Exon2 fill:#90EE90 + style Exon3 fill:#90EE90 + style Intron1 fill:#FFB6C1 + style Intron2 fill:#FFB6C1 + style Promoter fill:#FFD700 +``` + +**Key Components:** + +- **Promoter**: Regulatory region upstream of gene (-1000 bp) that controls when the gene is turned on +- **5' UTR (Untranslated Region)**: Beginning of RNA that isn't translated into protein +- **Exons**: Segments that ARE kept in the final RNA and code for protein +- **Introns**: Segments that are REMOVED during RNA processing +- **3' UTR**: End region of RNA that isn't translated + +**Why this matters**: The Borzoi model in this pipeline predicts which parts of genes will be transcribed into RNA, including both exons and introns, before they're processed. + +### Genetic Variants and Their Effects + +```mermaid +flowchart TD + Variant[Genetic Variant] --> Type{Type?} + + Type -->|SNP| SNP[Single Nucleotide
Polymorphism:
A->G] + Type -->|Insertion| INS[Insertion:
ATCG->ATCGGG] + Type -->|Deletion| DEL[Deletion:
ATCG->A] + + SNP --> Effect1{Location?} + INS --> Effect1 + DEL --> Effect1 + + Effect1 -->|Promoter| E1[Changes expression level] + Effect1 -->|Exon| E2[Changes protein sequence] + Effect1 -->|Splice site| E3[Changes splicing pattern] + Effect1 -->|Intron| E4[May affect regulation] + + style Variant fill:#FFB6C1 + style E1 fill:#87CEEB + style E2 fill:#87CEEB + style E3 fill:#87CEEB + style E4 fill:#87CEEB + + +``` + +**Variants** are differences in DNA sequence between individuals: + +- **SNP (Single Nucleotide Polymorphism)**: Single base change (e.g., A→G) +- **Insertion**: Extra bases added +- **Deletion**: Bases removed +- **Structural Variant**: Large-scale DNA rearrangements + +These variants can affect: + +- **Gene expression**: How much RNA is made +- **Protein sequence**: Which amino acids are in the protein +- **Splicing**: Which exons are included in mature RNA + +### VCF Format: Storing Genetic Variants + +The **VCF (Variant Call Format)** is a standardized text file format that stores genetic variants: + +``` +#CHROM POS ID REF ALT QUAL FILTER INFO +chr1 12345 rs123 A G 100 PASS DP=50 +chr2 67890 rs456 TC T 95 PASS DP=45 +``` + +**Columns explained:** + +- **CHROM**: Chromosome where variant is located (chr1, chr2, etc.) +- **POS**: Position on the chromosome (base pair number) +- **ID**: Database identifier (often from dbSNP database) +- **REF**: Reference base(s) at this position +- **ALT**: Alternate base(s) - the variant +- **QUAL**: Quality score for the variant call +- **FILTER**: Whether variant passed quality filters +- **INFO**: Additional information (e.g., read depth) + +--- + +## Pipeline Components + +### 1. Synthea: Synthetic Patient Generator + +```mermaid +flowchart LR + Input[Input Parameters] --> Synthea{Synthea Engine} + + subgraph Input Parameters + Disease[Disease Type:
schizophrenia, cancer, etc.] + Demographics[Demographics:
Age, Gender, Location] + N[Number of Patients] + end + + Synthea --> UKBB[UK Biobank
Genetic Database] + UKBB --> Disease_Variants[Disease-Associated
Genetic Variants] + + Disease_Variants --> VCF_Out[VCF Files
Patient_001.vcf
Patient_002.vcf] + + style Synthea fill:#DDA0DD + style VCF_Out fill:#90EE90 +``` + +**What is Synthea?** + +Synthea™ is an open-source synthetic patient generator that creates realistic but completely fake patient data. It models: + +- Medical history +- Demographics (age, gender, location) +- Health conditions +- Medications and treatments +- Genetic variants + +**How it works in this pipeline:** + +1. **User specifies disease**: For example, "schizophrenia" or "healthy" +2. **Statistical analysis**: Synthea analyzes the UK Biobank database (a large collection of genetic data from ~500,000 individuals) to find variants statistically associated with the disease +3. **Probability-based sampling**: Variants are selected based on their frequency in diseased vs. healthy populations +4. **VCF generation**: Creates a VCF file for each synthetic patient containing their unique set of genetic variants + +**Parameters:** + +```nextflow +params.disease = 'schizophrenia' // Disease to model +params.n_pat = 10 // Number of patients to generate +params.percent_male = 0.5 // Gender distribution +``` + +**For Healthy Patients:** + +If generating healthy controls, Synthea samples from pre-computed reference genomes representing the genetic diversity of healthy populations. + +**Output Example:** + +``` +Patient_001_variants.vcf +Patient_002_variants.vcf +... +Patient_010_variants.vcf +``` + +### 2. Borzoi: RNA-seq Prediction from DNA + +```mermaid +flowchart TD + DNA[DNA Sequence
524,288 bp] --> Borzoi[Borzoi Neural Network] + + subgraph Borzoi Architecture + Conv1[Convolutional Layers:
Learn local patterns] + Conv1 --> Attention[Self-Attention Layers:
Learn long-range interactions] + Attention --> Upsample[Upsampling Layers:
Increase resolution] + Upsample --> Output_Layer[Output: RNA Coverage
32 bp resolution] + end + + Borzoi --> Tissues[Predictions for 89 Tissues/Cell Types] + + Tissues --> TPM[TPM Values:
Transcripts Per Million] + + style Borzoi fill:#87CEEB + style TPM fill:#90EE90 +``` + +**What is Borzoi?** + +Borzoi is a deep learning model developed at Calico Life Sciences that predicts **RNA-seq coverage** (how much RNA is produced from each part of the genome) directly from DNA sequence. This is revolutionary because it: + +- Predicts gene expression without actually doing wet-lab RNA sequencing +- Accounts for multiple layers of regulation (transcription, splicing, polyadenylation) +- Provides tissue-specific predictions + +**How does it work?** + +1. **Input**: 524,288 base pairs of DNA sequence (524 kb) +2. **Neural Network Processing**: + - **Convolutional layers**: Learn local DNA patterns (e.g., transcription factor binding sites) + - **Self-attention layers**: Learn long-range interactions between regulatory elements + - **Upsampling layers**: Increase resolution from 128 bp to 32 bp +3. **Output**: RNA coverage at 32 bp resolution for 89 different tissues/cell types + +**Key Concept: RNA-seq Coverage** + +RNA-seq coverage shows how many RNA molecules were sequenced at each position in the genome: + +``` +Position: 1000 1100 1200 1300 1400 +Exon 1: ████████████████ +Intron: ░ +Exon 2: ████████████ +Coverage: 2.5 3.1 0.1 2.8 3.0 +``` + +**TPM (Transcripts Per Million)** + +Borzoi outputs are converted to **TPM** values: + +``` +TPM = (Number of reads mapped to transcript / Transcript length in kb) + × (1,000,000 / Sum of reads-per-kb over all transcripts in sample) +``` + +**Why TPM?** + +- Normalizes for gene length (longer genes generate more reads) +- Normalizes for sequencing depth (accounts for total number of reads) +- Comparable across genes within a sample + +**Pipeline Implementation:** + +The pipeline has two Borzoi processes: + +**Process 1: FILTER_VCF** + +```python +# Extract variants in coding regions + 1000 bp upstream regulatory regions +# Create filtered VCF containing only protein-coding variants +``` + +**Process 2: PREDICT_EXPRESSION** + +```python +# For each 
protein-coding gene with mutations: +# 1. Extract DNA sequence (reference + mutations) +# 2. Run Borzoi to predict RNA coverage +# 3. Calculate TPM by summing coverage over exons +# 4. Generate TPM matrix: Genes × Tissues +``` + +**Example Output:** + +```csv +Gene,Adipose_Tissue,Brain_Cortex,Heart,Liver,Muscle +BRCA1,12.5,8.3,5.2,15.7,7.9 +TP53,45.2,52.1,38.9,42.3,35.6 +APOE,8.7,125.3,6.1,78.2,5.4 +``` + +This table shows predicted RNA expression (TPM) for each gene in each tissue. + +**MANE Dataset** + +The pipeline uses the **MANE (Matched Annotation from NCBI and EMBL-EBI)** dataset: + +- Contains reference transcript sequences for all human protein-coding genes +- Provides consensus between RefSeq and Ensembl/GENCODE annotations +- Includes exon/intron boundaries needed for TPM calculation + +### 3. VCF2Prot: DNA Variants to Protein Sequences + +```mermaid +flowchart TD + VCF[VCF File:
Genetic Variants] --> Annotate[BCFtools CSQ:
Annotate Variants] + Reference[Reference Genome
GRCh38] --> Annotate + GFF[Gene Annotations
GFF3 Format] --> Annotate + + Annotate --> Annotated_VCF[Annotated VCF:
Functional Consequences] + + Annotated_VCF --> VCF2Prot[VCF2Prot Tool] + MANE_Ref[MANE Reference
Transcripts] --> VCF2Prot + + VCF2Prot --> Mutated_Proteins[Mutated Protein
Sequences FASTA] + + style VCF2Prot fill:#FFB6C1 + style Mutated_Proteins fill:#90EE90 +``` + +**What is VCF2Prot?** + +VCF2Prot is a tool that translates DNA variants into their effects on protein sequences. It: + +- Takes variants from VCF files +- Maps them to gene transcripts +- Predicts how variants change the protein sequence +- Outputs mutated protein sequences + +**Process Flow:** + +1. **Variant Annotation (BCFtools CSQ)** + + - Maps variants to genes and transcripts + - Determines functional consequence: + - Missense: Changes one amino acid + - Nonsense: Creates premature stop codon + - Frameshift: Shifts reading frame + - Synonymous: No change to amino acid + +2. **Protein Sequence Prediction (VCF2Prot)** + + - Loads reference protein sequences from MANE + - Applies variants to generate mutated sequences + - Handles complex variants (insertions, deletions) + +**Example:** + +``` +Reference DNA: ATG GCT AAA TGC +Reference RNA: AUG GCU AAA UGC +Reference Prot: Met-Ala-Lys-Cys + +Variant: Position 4, G→T + +Mutant DNA: ATG TCT AAA TGC +Mutant RNA: AUG UCU AAA UGC +Mutant Prot: Met-Ser-Lys-Cys + ^^^ + Changed amino acid! +``` + +**Output Format: FASTA** + +``` +>Patient_001_ENST00000357654_BRCA1_p.G1738R +MSLQSQLFKQRQYLSIKTKRSTKEVLDATLIHQSITGLYETRIDLSQLGGD... +>Patient_001_ENST00000269305_TP53_p.R273H +MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIE... +``` + +Each sequence shows: + +- Patient ID +- Transcript ID +- Gene name +- Protein variant notation +- Full mutated protein sequence + +### 4. RNA2ProteinExpression: RNA to Protein Prediction + +```mermaid +flowchart TD + RNA_TPM[RNA TPM Values
from Borzoi] --> Model[Deep Learning Model] + + subgraph Neural Network + Input[Input Layer:
RNA Expression] + Hidden1[Hidden Layer 1:
Tissue Context] + Hidden2[Hidden Layer 2:
Translation Efficiency] + Output[Output Layer:
Protein Abundance] + + Input --> Hidden1 + Hidden1 --> Hidden2 + Hidden2 --> Output + end + + Model --> Protein_Expr[Protein Expression
Log2 Scale] + + style Model fill:#FFE4E1 + style Protein_Expr fill:#90EE90 +``` + +**What is RNA2ProteinExpression?** + +This is a custom deep learning model that predicts protein abundance from RNA expression levels. It's trained on: + +- RNA-seq data (transcript levels) +- Mass spectrometry data (protein levels) +- Gene Ontology (GO) annotations + +**Why is this needed?** + +RNA levels don't perfectly correlate with protein levels because: + +- **Translation efficiency** varies between genes +- **Protein stability** varies (some proteins are rapidly degraded) +- **Post-transcriptional regulation** (microRNAs, RNA-binding proteins) + +Typical RNA-protein correlation: **r = 0.4-0.6** (not 1.0!) + +**Model Architecture:** + +The neural network learns: + +- Which genes have high vs. low translation efficiency +- Tissue-specific effects on protein production +- GO term enrichments that affect protein stability + +**Input:** + +``` +Gene_ID, Tissue, RNA_TPM +ENSG00000012048, Brain_Cortex, 125.3 +ENSG00000012048, Liver, 78.2 +``` + +**Output:** + +``` +Gene_ID, Tissue, Protein_Expression_log2 +ENSG00000012048, Brain_Cortex, 8.5 +ENSG00000012048, Liver, 7.2 +``` + +**Log2 scale**: Protein expression in log2 transformed units (easier to interpret fold-changes) + +### 5. CORTO: Metabolome Prediction + +```mermaid +flowchart TD + RNA_TPM[RNA TPM Matrix] --> CORTO[CORTO Algorithm] + Regulon[Regulon Data:
TF-Gene Relationships] --> CORTO + + subgraph CORTO Algorithm + Correlation[1. Calculate Correlations:
TF <-> Metabolic Genes] + DPI[2. Data Processing Inequality:
Remove Indirect Edges] + Bootstrap[3. Bootstrap:
Assess Robustness] + MRA[4. Master Regulator Analysis:
Identify Key TFs] + + Correlation --> DPI + DPI --> Bootstrap + Bootstrap --> MRA + end + + CORTO --> Metabolome[Metabolome Predictions:
Metabolic Activity] + + style CORTO fill:#F0E68C + style Metabolome fill:#90EE90 +``` + +**What is CORTO?** + +CORTO (Correlation Tool) is an R package that infers gene regulatory networks and identifies master regulators controlling metabolic activity. It predicts: + +- Activity of metabolic pathways +- Transcription factors (TFs) controlling metabolism +- Metabolite production levels + +**How it works:** + +1. **Input Regulon**: Pre-defined relationships between transcription factors (TFs) and their target genes (including metabolic enzymes) + +2. **Correlation Analysis**: Calculate how TF expression correlates with target gene expression + +3. **Data Processing Inequality (DPI)**: Remove indirect relationships + + - If TF1 → TF2 → Gene, remove direct TF1 → Gene edge + - Keeps only direct regulatory relationships + +4. **Bootstrap**: Test robustness by resampling data + +5. **Master Regulator Analysis (MRA)**: Identify TFs whose target genes are significantly enriched in metabolic pathways + +**Example:** + +``` +TF: PPARG (master regulator of fat metabolism) +Target Genes: FABP4, LPL, ADIPOQ, CD36, SCD (all involved in lipid metabolism) +Metabolome Prediction: High lipid synthesis activity +``` + +**Output:** + +```csv +Pathway,Activity_Score,P_value +Glycolysis,-1.5,0.001 +TCA_Cycle,2.3,0.0001 +Fatty_Acid_Synthesis,1.8,0.002 +``` + +- **Activity Score**: Positive = pathway activated, Negative = pathway suppressed +- **P-value**: Statistical significance + +### 6. CIBERSORTx: Immune Cell Deconvolution + +```mermaid +flowchart TD + RNA_TPM[Bulk RNA TPM
Mixed Cell Types] --> Signature[Signature Matrix:
Cell-Specific Genes] + + subgraph CIBERSORTx + Fractions[Step 1: Fractions
Estimate Cell Proportions] + HiRes[Step 2: HiRes
Cell-Specific Expression] + end + + RNA_TPM --> Fractions + Signature --> Fractions + + Fractions --> Proportions[Cell Type Proportions] + + Proportions --> HiRes + RNA_TPM --> HiRes + Signature --> HiRes + + HiRes --> Cell_Specific[Cell-Type-Specific
Gene Expression] + + style CIBERSORTx fill:#DDA0DD + style Proportions fill:#90EE90 + style Cell_Specific fill:#90EE90 +``` + +**What is CIBERSORTx?** + +CIBERSORTx is a computational tool for **immune cell deconvolution**. When you sequence RNA from a tissue sample, you get a mixture of RNA from all cells in that tissue. CIBERSORTx: + +- Estimates what proportion of cells are each immune cell type +- Infers cell-type-specific gene expression profiles + +**Why is this important?** + +Immune cells play crucial roles in: + +- Fighting infections +- Cancer immunotherapy response +- Autoimmune diseases +- Inflammation + +Understanding immune composition helps interpret disease mechanisms. + +**How it works:** + +**Step 1: Signature Matrix** + +A reference matrix showing genes specifically expressed in each cell type: + +``` +Gene T_cells B_cells Macrophages NK_cells +CD3D HIGH low low low +CD19 low HIGH low low +CD68 low low HIGH low +NKG7 low low low HIGH +``` + +**Step 2: CIBERSORTx Fractions** + +Uses **Support Vector Regression (SVR)** to solve: + +``` +Bulk_Expression = Σ (Proportion_i × Signature_i) +``` + +Where: + +- Bulk_Expression = measured RNA in tissue +- Proportion_i = fraction of cell type i +- Signature_i = expression pattern of cell type i + +**Step 3: CIBERSORTx HiRes** + +After knowing proportions, infer gene expression within each cell type by: + +- Modeling tissue expression as weighted sum of cell-type contributions +- Deconvolving to separate cell-type-specific signals + +**Example Output:** + +**Fractions:** + +```csv +Sample,CD8_T_cells,CD4_T_cells,B_cells,NK_cells,Monocytes +Patient_001_Brain,0.05,0.08,0.02,0.01,0.15 +Patient_001_Liver,0.12,0.15,0.08,0.03,0.22 +``` + +**HiRes:** + +```csv +Tissue,Cell_Type,CD3D,CD19,CD68 +Brain_Patient_001,CD8_T_cells,HIGH,low,low +Brain_Patient_001,B_cells,low,HIGH,low +``` + +**Pipeline Implementation:** + +1. **CONVERT_TO_TXT**: Convert CSV to tab-delimited format (CIBERSORTx input format) + +2. 
**CIBERSORTx_FRACTIONS**: Estimate cell proportions + +3. **CIBERSORTx_HIRES**: Infer cell-specific expression + +4. **ADD_TISSUE_NAMES**: Add tissue annotations to output + +--- + +## Workflow Execution + +### Nextflow: Workflow Orchestration + +```mermaid +flowchart TD + Config[nextflow.config:
Configuration] --> NF[Nextflow Engine] + Params[params.json:
Parameters] --> NF + + subgraph Nextflow Engine + Parse[Parse Workflow DSL] + Schedule[Schedule Processes] + Execute[Execute in Docker/Singularity] + Monitor[Monitor & Checkpoint] + + Parse --> Schedule + Schedule --> Execute + Execute --> Monitor + end + + NF --> Channels[Data Channels:
Pass Files Between Processes] + Channels --> Processes[Execute Processes] + + style NF fill:#87CEEB +``` + +**What is Nextflow?** + +Nextflow is a workflow orchestration system specifically designed for data-intensive computational pipelines. It: + +- Manages dependencies between analysis steps +- Handles parallel execution +- Provides automatic checkpointing (resume failed runs) +- Supports multiple execution platforms (local, HPC clusters, cloud) + +**Key Concepts:** + +1. **Processes**: Individual computational tasks (e.g., "PREDICT_EXPRESSION") +2. **Channels**: Data streams that connect processes +3. **Operators**: Manipulate channels (e.g., `mix`, `flatten`, `collect`) + +**Example Process Definition:** + +```nextflow +process PREDICT_EXPRESSION { + container "${params.container_borzoi}" // Docker image + memory 4.GB // Memory requirement + accelerator 1 // GPU requirement + + input: + path vcf_filtered // Input file + path MANE // Reference data + + output: + path "*_TPM.csv" // Output file pattern + + script: + """ + #!/opt/conda/envs/borzoi/bin/python + # Python script here + """ +} +``` + +**Channel Example:** + +```nextflow +// Mix male and female patient VCFs +txt_ch = f_var.mix(m_var).flatten() + +// This creates a channel with all VCF files: +// [Patient_001.vcf, Patient_002.vcf, ...] +``` + +### Complete Workflow + +```mermaid +flowchart TD + Start([Start]) --> CheckDisease{Disease or Healthy?} + + CheckDisease -->|Disease| GetStats[get_disease_stats_no_patients:
Analyze UK Biobank] + CheckDisease -->|Healthy| LoadHealthy[Load Pre-computed
Healthy Genomes] + + GetStats --> GenM[generate_m_variants_cudf:
Male Patients] + GetStats --> GenF[generate_f_variants_cudf:
Female Patients] + + LoadHealthy --> LoadM[Load Male
Reference] + LoadHealthy --> LoadF[Load Female
Reference] + + GenM --> MakeVCF[make_vcfs:
Generate VCF Files] + GenF --> MakeVCF + LoadM --> MakeVCF + LoadF --> MakeVCF + + MakeVCF --> FilterVCF[FILTER_VCF:
Extract Coding Variants] + MakeVCF --> VCF2Prot[VCF2PROT:
Generate Mutated Proteins] + + FilterVCF --> PredictExpr[PREDICT_EXPRESSION:
Borzoi RNA Prediction] + + PredictExpr --> RNA2Prot[RNA2PROTEXPRESSION:
Protein Prediction] + PredictExpr --> CORTO[CORTO:
Metabolome Prediction] + PredictExpr --> Convert[CONVERT_TO_TXT:
Format Conversion] + + Convert --> CiberFrac[CIBERSORTx_FRACTIONS:
Cell Proportions] + CiberFrac --> CiberHires[CIBERSORTx_HIRES:
Cell-Specific Expression] + CiberHires --> AddTissue[ADD_TISSUE_NAMES_TO_CIBERSORTX:
Annotate Results] + + RNA2Prot --> End([Complete
Digital Patient]) + CORTO --> End + AddTissue --> End + VCF2Prot --> End + + style Start fill:#90EE90 + style End fill:#FFB6C1 + style PredictExpr fill:#87CEEB +``` + +### Execution Example + +**1. Configuration (params.json)** + +```json +{ + "disease": "schizophrenia", + "n_pat": 10, + "percent_male": 0.5, + "container_borzoi": "harbor.cluster.omic.ai/omic/digital-patients/borzoi:latest" +} +``` + +**2. Launch Pipeline** + +```bash +nextflow run test.nf -params-file params.json +``` + +**3. Nextflow Execution** + +``` +N E X T F L O W ~ version 21.04.0 +Launching `test.nf` [amazing_babbage] - revision: 1a2b3c4d + +[Synthea] Submitted process > get_disease_stats_no_patients +[Synthea] Submitted process > generate_m_variants_cudf (1) +[Synthea] Submitted process > generate_f_variants_cudf (1) +[Stage] Completed process > make_vcfs (10 files) +[Borzoi] Submitted process > FILTER_VCF (10) +[Borzoi] Submitted process > PREDICT_EXPRESSION (10) +... +Pipeline completed successfully! +``` + +**4. Directory Structure** + +``` +/outdir/ +├── vcf2prot/ +│ ├── Patient_001_transcript_id_mutations.fasta +│ └── Patient_002_transcript_id_mutations.fasta +├── borzoi/ +│ ├── Patient_001_TPM.csv +│ └── Patient_002_TPM.csv +├── rna2protein/ +│ ├── Patient_001_Protein_Expression_log2.csv +│ └── Patient_002_Protein_Expression_log2.csv +├── corto/ +│ ├── Patient_001_metabolome.csv +│ └── Patient_002_metabolome.csv +└── ecotyper/ + ├── fractions/ + │ └── Patient_001_CIBERSORTx_Results.txt + └── hires/ + └── Patient_001_immune_cells.csv +``` + +--- + +## Technical Architecture + +### Docker Containers + +Each pipeline component runs in an isolated Docker container with specific dependencies: + +```mermaid +flowchart LR + subgraph Docker Images + Synthea_Img[Synthea Container:
- Java JDK
- Python
- BCFtools
- GATK] + + Borzoi_Img[Borzoi Container:
- TensorFlow
- PyTorch
- Baskerville
- Python packages] + + VCF2Prot_Img[VCF2Prot Container:
- BCFtools
- vcf2prot binary
- Reference genomes] + + RNA2Prot_Img[RNA2Protein Container:
- PyTorch
- Deep learning model
- GO annotations] + + CORTO_Img[CORTO Container:
- R
- corto package
- Regulon data] + + CIBERSORTx_Img[CIBERSORTx Container:
- Python
- R
- CIBERSORTx binaries
- Signature matrices] + end + + Registry[Container Registry:
harbor.cluster.omic.ai] --> Synthea_Img + Registry --> Borzoi_Img + Registry --> VCF2Prot_Img + Registry --> RNA2Prot_Img + Registry --> CORTO_Img + Registry --> CIBERSORTx_Img +``` + +**Why Docker?** + +- **Reproducibility**: Same environment every run +- **Isolation**: Avoid dependency conflicts +- **Portability**: Run anywhere (laptop, cluster, cloud) + +**Example Dockerfile (Borzoi):** + +```dockerfile +FROM tensorflow/tensorflow:2.12.0-gpu + +# Install conda +RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh +RUN bash Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda + +# Create borzoi environment +RUN conda create -n borzoi python=3.9 +RUN conda install -n borzoi tensorflow-gpu pandas numpy pysam + +# Install baskerville (Borzoi's framework) +RUN git clone https://github.com/calico/borzoi.git /home/omic/borzoi +RUN pip install -e /home/omic/borzoi + +# Download pre-trained models (MODEL_URL is a placeholder build argument for the weights location) +ARG MODEL_URL +RUN mkdir -p /home/omic/borzoi/saved_models/f0 +RUN wget -O /home/omic/borzoi/saved_models/f0/model0_best.h5 "${MODEL_URL}" + +# Set entrypoint +CMD ["/bin/bash"] +``` + +### Data Flow Architecture + +```mermaid +flowchart TD + subgraph Input Data + UKBB[UK Biobank:
Genetic Variants] + MANE[MANE Transcripts:
Reference Sequences] + RefGenome[Reference Genome:
GRCh38] + Regulon[Regulon Database:
TF-Gene Networks] + LM22[LM22 Signature Matrix:
Immune Cell Markers] + end + + subgraph Processing + Synthea --> |VCF| Filter + Filter --> |Filtered VCF| Borzoi + Filter --> |Filtered VCF| VCF2Prot + Borzoi --> |TPM CSV| RNA2Prot + Borzoi --> |TPM CSV| CORTO + Borzoi --> |TPM CSV| CIBERSORTx + end + + UKBB --> Synthea + MANE --> Borzoi + MANE --> VCF2Prot + RefGenome --> VCF2Prot + Regulon --> CORTO + LM22 --> CIBERSORTx + + subgraph Output Data + RNA2Prot --> |Protein Expression| Results + CORTO --> |Metabolome| Results + CIBERSORTx --> |Immune Cells| Results + VCF2Prot --> |Mutated Proteins| Results + end +``` + +### Computational Requirements + +| Process | CPU | RAM | GPU | Time (per patient) | +| ------------------ | ------- | ---- | -------- | ------------------ | +| Synthea | 2 cores | 4 GB | No | ~5 min | +| FILTER_VCF | 4 cores | 4 GB | No | ~2 min | +| PREDICT_EXPRESSION | 8 cores | 4 GB | Yes (1x) | ~30-60 min | +| VCF2PROT | 2 cores | 2 GB | No | ~10 min | +| RNA2PROTEXPRESSION | 4 cores | 2 GB | Yes (1x) | ~5 min | +| CORTO | 2 cores | 1 GB | No | ~3 min | +| CIBERSORTx | 4 cores | 4 GB | No | ~15 min | + +**Total time per patient: ~70-90 minutes** + +**Parallelization:** + +Nextflow automatically parallelizes patient processing: + +- 10 patients with 4 GPUs → 3 waves of GPU-bound prediction, roughly 2-3.5 hours total (vs. ~12-15 hours when run one patient at a time) +- Patients are processed independently + +--- + +## Outputs and Applications + +### Complete Digital Patient Profile + +```mermaid +flowchart LR + Patient[Digital Patient:
Patient_001] --> Genome[Genomic Profile] + Patient --> Transcriptome[Transcriptomic Profile] + Patient --> Proteome[Proteomic Profile] + Patient --> Metabolome[Metabolomic Profile] + Patient --> Immune[Immune Profile] + + Genome --> G1[Genetic Variants:
SNPs, Indels] + Genome --> G2[Disease-Associated
Mutations] + + Transcriptome --> T1[RNA Expression:
20,000+ genes] + Transcriptome --> T2[Tissue-Specific
Expression] + + Proteome --> P1[Protein Abundance:
10,000+ proteins] + Proteome --> P2[Mutated Protein
Sequences] + + Metabolome --> M1[Pathway Activity:
Metabolism] + Metabolome --> M2[Master Regulator
TFs] + + Immune --> I1[Cell Composition:
T-cells, B-cells, etc.] + Immune --> I2[Cell-Specific
Expression] + + style Patient fill:#FFD700 +``` + +### Example Application: Cancer Research + +```mermaid +flowchart TD + Question[Research Question:
How do BRCA1 mutations
affect breast cancer?] + + Question --> Generate[Generate 100 synthetic
patients with BRCA1
mutations] + + Generate --> Analyze{Analyze Digital Patients} + + Analyze --> RNA[RNA Analysis:
Find genes
co-expressed with BRCA1] + Analyze --> Protein[Protein Analysis:
Identify altered
pathways] + Analyze --> Metabolome[Metabolome Analysis:
Detect metabolic
shifts] + Analyze --> Immune[Immune Analysis:
Characterize immune
infiltration] + + RNA --> Insight[Insights into
Disease Mechanisms] + Protein --> Insight + Metabolome --> Insight + Immune --> Insight + + Insight --> Drug[Drug Target
Discovery] + Insight --> Biomarker[Biomarker
Identification] + + style Question fill:#FFB6C1 + style Insight fill:#90EE90 +``` + +### Output Files Summary + +**1. Genetic Variants (VCF)** + +- **File**: `Patient_001_variants.vcf` +- **Content**: All genetic variants for patient +- **Use**: Understand genetic basis of disease + +**2. RNA Expression (TPM)** + +- **File**: `Patient_001_TPM.csv` +- **Content**: Gene expression across 89 tissues +- **Use**: Identify dysregulated genes, tissue-specific effects + +**3. Protein Expression** + +- **File**: `Patient_001_Protein_Expression_log2.csv` +- **Content**: Predicted protein abundance +- **Use**: Understand functional consequences of RNA changes + +**4. Mutated Proteins (FASTA)** + +- **File**: `Patient_001_transcript_id_mutations.fasta` +- **Content**: Protein sequences with mutations +- **Use**: Study structural changes, predict drug binding + +**5. Metabolome** + +- **File**: `Patient_001_metabolome.csv` +- **Content**: Pathway activity scores +- **Use**: Understand metabolic reprogramming + +**6. Immune Cells** + +- **File**: `Patient_001_immune_cells.csv` +- **Content**: Cell type proportions and expression +- **Use**: Characterize immune microenvironment + +### Research Applications + +1. **Disease Mechanism Discovery** + + - Generate patients with specific mutations + - Compare to healthy controls + - Identify molecular changes caused by mutations + +2. **Drug Target Identification** + + - Find genes/proteins consistently altered across patients + - Prioritize targets for therapeutic intervention + +3. **Biomarker Discovery** + + - Identify molecular signatures distinguishing diseased from healthy + - Develop diagnostic tests + +4. **Precision Medicine** + + - Model individual patient molecular profiles + - Predict treatment response + - Personalize therapy + +5. **Clinical Trial Simulation** + + - Generate virtual patient cohorts + - Test hypotheses before expensive trials + - Power calculations and study design + +6. 
**Education and Training** + + - Teach students about multi-omics analysis + - No patient privacy concerns + - Unlimited data generation + +--- + +## Conclusion + +This Digital Patient Pipeline represents a cutting-edge integration of: + +- **Synthetic data generation**: Realistic patient simulation +- **Deep learning**: RNA expression prediction from DNA +- **Bioinformatics**: Multi-omic data integration +- **Workflow engineering**: Scalable, reproducible pipelines + +By combining these technologies, researchers can: + +- Study disease mechanisms without accessing sensitive patient data +- Generate unlimited data for hypothesis testing +- Model personalized molecular phenotypes +- Accelerate drug discovery and precision medicine + +The pipeline produces comprehensive molecular profiles spanning: + +- **Genomics** (DNA variants) +- **Transcriptomics** (RNA expression) +- **Proteomics** (protein abundance) +- **Metabolomics** (metabolic activity) +- **Immunomics** (immune cell composition) + +This multi-omic integration provides unprecedented insight into how genetic variants cascade through biological systems to produce disease phenotypes. 
+ +--- + +## Glossary of Terms + +**Bioinformatics Terms:** + +- **TPM**: Transcripts Per Million - normalized measure of RNA abundance +- **VCF**: Variant Call Format - standard format for genetic variants +- **FASTA**: Text format for representing DNA/protein sequences +- **SNP**: Single Nucleotide Polymorphism - single base DNA variant +- **Indel**: Insertion or Deletion - adding or removing DNA bases +- **Exon**: Protein-coding segment of gene +- **Intron**: Non-coding segment removed during RNA processing +- **Transcription**: DNA → RNA +- **Translation**: RNA → Protein +- **Gene Expression**: Process of making protein from gene + +**Machine Learning Terms:** + +- **Neural Network**: Computer model inspired by brain structure +- **Convolutional Layer**: Detects local patterns in sequences +- **Attention Layer**: Learns long-range relationships +- **Training**: Teaching model from example data +- **Prediction**: Using model on new data + +**Pipeline Terms:** + +- **Process**: Individual computational step +- **Channel**: Data stream connecting processes +- **Container**: Isolated environment with dependencies +- **Workflow**: Series of connected processes +- **Nextflow**: Workflow orchestration system + +**Biological Terms:** + +- **Genome**: Complete set of DNA in organism +- **Transcriptome**: Complete set of RNA molecules +- **Proteome**: Complete set of proteins +- **Metabolome**: Complete set of metabolites +- **Phenotype**: Observable characteristics of organism + +--- + +## References + +### Pipeline Components + +1. **Nextflow**: Di Tommaso et al. (2017). Nextflow enables reproducible computational workflows. *Nature Biotechnology*, 35:316-319. + +2. **Synthea**: Walonoski et al. (2018). Synthea: An approach, method, and software mechanism for generating synthetic patients and the synthetic electronic health care record. *JAMIA*, 25(3):230-238. + +3. **Borzoi**: Linder et al. (2025). 
Predicting RNA-seq coverage from DNA sequence as a unifying model of gene regulation. *Nature Genetics*, 57:949-961. + +4. **CIBERSORTx**: Newman et al. (2019). Determining cell-type abundance and expression from bulk tissues with digital cytometry. *Nature Biotechnology*, 37:773-782. + +5. **CORTO**: Mercatelli et al. (2020). corto: a lightweight R package for gene network inference and master regulator analysis. *Bioinformatics*, 36(12):3916-3917. + +### Biological Background + +6. **Gene Structure**: Lim et al. (2018). The exon-intron gene structure upstream of the initiation codon. *Nucleic Acids Research*, 46(5):2232-2244. + +7. **TPM Normalization**: Zhao et al. (2020). Misuse of RPKM or TPM normalization when comparing across samples and sequencing protocols. *RNA*, 26(8):903-909. + +8. **VCF Format**: Danecek et al. (2011). The variant call format and VCFtools. *Bioinformatics*, 27(15):2156-2158. + +9. **MANE**: Morales et al. (2022). A joint NCBI and EMBL-EBI transcript set for clinical genomics and research. *Nature*, 604:310-315. + +--- + +*This documentation was created to provide comprehensive understanding of a complex bioinformatics pipeline for researchers without extensive biological background. 
For questions or clarifications, please consult the cited references or contact the pipeline maintainers.* diff --git a/ensg2number.joblib b/ensg2number.joblib new file mode 100644 index 0000000000000000000000000000000000000000..814ca0c3c975cbb873be2dd1d6c54734d13bc874 GIT binary patch literal 124367 zcmZtPRlprZlxSgGg1ft0_)mM|?(QCv5FijcJHg!v1P|`+?(XjH?(S}PH8b~N{$BHN z?@WC_a=NRlcI{o&ec%KqPdDy3|NYOH*+=d=WKjPY)^|h<8WF<>4jMUf@UWfw_8ZcF zz@Q;x$4tIX^Z&0pY~b)62F2Ku#x8Qu*!{*XJZ6%CL;8;#IcQ|Rv16v&W%z(0g9r2- zw%gDVd-WSJcp(41v16uLulYY$jOf4D=Fxw|i2w61kKAp*fB*j2F%$mxZ;c%@@vt5G zH~%bu!k7t1?%aRGpt0-jKHXvbAH$>hgU5`!GXMJyqxJ%Jp8Ws+Z~VjKtzV5DedG~i zHyXR}+T)KIci*w=t+(D<|MEZo{Xc6jKWglNF@5?E88Up2enW@$A2D|HnA!g4FZ$mf z*RTJO!8;8bI%wFaegpOzHE876wWr=~z=7ip{O=#>J8*am>N|9|A)^NW_Ye2!7lQ^4 zA2xDS>^5*zpYAyS^H25p-@kdE?zn4@b69u0f&H3GHE?8~LH}K|KHc%RXfE}D?gU#* z#Q#5}|ER&kI^78m?@qK)cj6NEK|ND>a+nuWS&+JY;VDz@#X%^mMD*l~S*V}mYp8x%yEHAs$wp)vy?sUCd zI~lFhOKYb)L%X#uyEC?0i=^(()Vnpwv${KTtF_FE?kug=x>I*&ZMUXrcedWGX-{2r zXK%G8{AqKvTGKl1&e>|sd+P37y<4+RT6gDew`NIqo>psG*WGzrtp!(dzE*2l6y5pT zt!2?&pw(KXRd>O5E7xoxX|1}gJD>ODb$5~8|7_mr>FF*ityOoic59h;7jL&#b$5w& zYn^rbwpx>(qPt|fHA}lat=257x}A1wQFoJeYn66WX)U^0t2OVW-MrOWb$Ysmv?krM zcWcqpQ+BI%Ym#^Cb}PMPsa9*JQ+1bawI)^4U8dEVCS`ZoR%@20-Q}dU?k?YI&GWpw zLaVi)KdjhlEjvkfC27sOE4N#-th~SZ?z`;eb;EUrd8Hm zv)x)J-L+b+dD7EeyWN`Q-E~^6b&_`1?cG{6cWAwKYf^RBZ?~pB-3{8UjEEbyT9dNu zZq#Z``8#YZtyy=Ib}PMP(^hNVDY~1rTFW%=Zr*CGc!sv<-CCDL-rcg@${pIO-CFl_ zw{EpoT!(Git@M&@Tdj58)7`FDYp18f-=SZtH7R?#+qYY}4*lD$b>1D&YE79%2DV#y zvUX^<7M<>(-mRSuW7LlA*1GEM)M`!mXLoM5=2drayOo(?mv$>7?XK-se%p{%Ys!2v zwAGp=MR!=cHSctXOKa90(QYlPZnxE%(^I2&YtkvR?#NbaGwY3NwN^>e-K}?P%B}D2 z-fB&%s=G(KmAQM*R%^=NVXt;8Csf_r>O@75x%JKaNDtyPhA59{4pCftj|d$(3uPt`r5 z)mktY9@%asvKigGweIvJ-J{yAJo87lTC*zej%l?Lm~@Zn)tdBlh#Ze?w^p6*ajn)Q zFS^I~ZtbK*HYc=N%OdHX*t<1pM&*-Qt$9&)$M$Ybd$LaV~v3QwWiD>r?y)2 zGU=YyYAu*APH(kV^oKKAtu>pmCtXrR$TQ9dbbuuo^&s4wU&%V7xiwf5}wVATdgU5@{-=IHIG2|(q66HcjD^H zdbf6Z=sTCUS___`D|)vw8g;r?wp*Fsu4=V55%ksVR<8Or?N-LrYkRk53BTgHR%^~! 
za((aCywl0MH?&*nlQ*_nGoIL+TCD}+!_BSMk}>L*-mOJXo^@|+wKnUw+j_SW;U(SM zTdfIyvpZU?8DsUGt=1wfyLYu(OIECR_in8c`olfaT6gblxALdm*J@3fkMD1_=8Vb@ z^lq)2(f+|!Ycm%<)NW;@eYjU^){`)!KhkPVxjK)wS_?+y$J(v*hsS%jb}B}KCt9sF z^X8MiTUmT1-KW~E{IgHDT9c~iKGSMVJ7xFTR%=FF{ao)>R^w^+`BrNaN4?N$%{y85 z#on#_XY1}uy<6$NW%uP)Yso!+rPW%oHhr~sYuVu!zt(C^c)DM2x6;?&Xtk!qsc*Jg zbDr+Eq_yt8-D)k8PWPQwYfbNaw|6VIxaz*wYVB}0-*2_1JcS>$TbV^ZY_(?0ydSk& znaMtGwdTy|pR`&FuFj{e){^z{XRX$XU-5aXwPyVNqIYYZ5OaOmYAu*eziPFX^pdaJ ztwq)SMq2Ccx4l~P|5hE}wOX4D(f6&^=9>M`ZYA3OvDI4Aqkd|)5>@=%yOl|#?*7th z&BtwEV4(bNul z-`c$Dq%plg-6>-Rg}f(b6d(xciJ27YGG;a)oR3)y@~Vhg1&6g zSjZqRidfj7$dXvZprl_ds*u&Om;%>oae+=!rm=)US!dB#An5>08jy#L9)ld{DAZLX z1_gaLH7FUcG69wzdE^G2qK-nLETS|Z_KwOSkJ=zjl32>1EIP5YK*st%mQmn$EUQr0 zv7ABDsbYBpLdjUcAmgcB(IDqBSjm8eXsm2Nf+SW^;Hh2Jpyqe1CXn|OJ+Zn$QdF^q z0TrfLQ-MKdErq%hYa8TDck38XwTN{UxKZmVFh{L#KtLKBIMlJB0uRnc1|;8OV+Cfl zO$1mQWwEJ2r^;h9gM>@6xk6b-Q$X!Cc@;(T7;I@k^cPzx@NaBwkaGvOQDCm!)}Y8c zv7JK}{R~P5AI7cclv5a!a zVmE`5D13JXo~S(xDq@B`6?oM4G9XbBdkc`S?1_C0Iy^Z0DpXbMXVCP-{T0eQ4lqaw zqYgC47?ut)U>`snY*5oh4{1XaKla3-21zH2!xWg$4>w2&qmD2jQ5Z)m5CM!iY`##>RxC-MJFyYNQ*2kHpm$IE-`2pqn9c$*Is7ObkWNVs-85iP++});_Gj1o?r zK&1bg0s9Ezb%TOI{|yC(j5iIa(#Bf`6|>sg1~qZwI|4j6X}qgY^~8Gyove)a6ifu`=~o{sFj9Y_!0-6fpm}OP6R3K)h@UG|o%q6_Lk99ogC?u?l|hBt= zsh2XS`CBcmz;w5a0EIQ`9}1+^mosRF@Z}AfA$$b|CYBWy_#G=b)UmPwyXImQgQmBy zs=!}&HG_hx%<2jxw$?D<&_S$eP!aX5CBP1oBGy)5Hdx1?L-BrH1v6>VuylTX}A zfY~mOtql^Mk8KQ6;`VJ7NDgeLz^(0P(ERf46PH^^DU4KOHp&IcMa^}!t+${3_T zZ{N|NqM)#o0)NKN3MA+TI~1{t0v9xHOe84-4qD6cXy!vVbH8U_EaEd*h_)Pa&HA5oP7*Zy6CmHRcPuT3dBCgE3iH31cQp@>xl{s;U^h1S)8#7 zWNS}$$m0|R{)|%vnmvngngU()bb}5f!x;)Be$Q0km!D<8R^&L_AR_~LjsnlexeA2N z=NUBV=<^j=t6gAFu;9H=f#>`pg{J;t&}^-{#GoP~yVRiOalTA}es#G((v)SdP~fk7 zr2;F+s}!h)U+qxDH3n=fjcXN{4X!iD$fRDcK!>|QfywJe1NL9XO$HSg@n!{*PPaIu zajOE4+HD3k1?k%rsJP!Dz*;nkI}JK?hPwnZHuJ{a1~nt~Jqo0}?-gJ}QxW$Wbf^j6 zZ@?Lzc)+2K2Msu(84oFtTz=RgjYkxiaUM0`{8>C^&{P^8S74-mLV^F(lLiGt##06* 
ziIk@mn7p1bXm-jztH8+coIuee2cB2pBEFzN4ELf#8ZQ}?{H@SD6sX;OtiYP{69e+<@u@=@pDD1K_*{Y6;0p!%)t3%geANO%i5`ZGuN4Tkzj4Uo zTZcNnQ=ne^y#lxP2L*QN{iwhX{K+AYpB?J>#eg3lzdB^`n?oMIJJj)q0)zga3dD(j zIn?pD0)yp03e24UD$sq#8NYX)fuG%B%gwkBWsGN#5OR*MKze5ahax6)sA3`of})8X z@|eV-j7b%U6ZDloP$ zX}~e~=yAxSqrkOF9MVV?7!xyxDslzF^Fo2(u5?JFQlLs!JJhk1K|x%yv;xohG7d#7 z>%jRB1>*MQ4NAuL6%@GaD=ILStfatRgOwGCMOQJX7%W#+pt`i00>Q-U3RGp*Fld%! zYbp@EuBFhN|8SuGAwVv(j&&6XAlFl%FtWY^tD+4YIRBx*@7Ty8i;Wei8*F0GVXMlf z3KZNnQy{IrxdK0M3x_JURA4OG%7ODA3ZxCUaj0Wk1s=}r9O~$2kkEa$S0Iw5YYtS433kA|tBNYgvMmbcmn*x_)cZWRoa42I>1sFd!deP*Wv4R)JyZI0d#k9 zfe870hcYfuAbED7K}IjRNP*|$Vg-KSB?e9L{!$0(9}29zE;lGyM_r-7cy*;i8doVW z09@_RiEA7<|6x$keXdg=zk9s`1)duexF0tQKaE4rM&9z+C%;0+Gy<2F)R}rxeJ^JnfLhGYUL|&nhtJKc_$`<#`1r`WFm% z^IyEEz_9d^LlrM8u%drOf#30}Ll&m7$W z-c?|bdCwt>_YHXMbbO$|ivB|f&VMNIaDJ>n4&W07dg7-Fbn4Fx3LfXr4cLbmUnuYc zzf>TS`N|=QuMH~V#BUt(_*Q{D!*>QXg~RU^sPO)vKtcLP1y*-I3DDot_*sE2`ilY! z-d_!x4a&bMP(S}27_{*J-MUC_fH$zlANucb5C2pk!~2&4lhNM>&7tFe6qs86b*N*U z3ECA9PIPkuM1ktvcm~Z;&+!!)_9t+tV?qTYtceu3RTCRD>(ogU7)K{n;Ij8opj~4rNSXkW-SL(x7=^*HjK^Oszn!Xc`69+tVuW5|QZ)N^aEj3KX$tFsR5I%;-?W zObT4YnH7lKXK~15RtHXiI8-sa0wvHn3~ENjITd(5=2Bq!Gq*z#^C%D&&FfIcd z^DFS?s0AGASWtnhyO02{GEQS*hbk6P;0`XTz#zVuL5C@FaRoZu5)NhbRp7EOsla-o zM}cK^$ALFs88qi>Qin1!hdOcv_JtJ+bcWI)k4k}+RINY=xs*d4ODi-dKn$95zsovs z0>ps|hypii1qHT8uBgDQwvs~;D=Y9AtfIjDysAOQpud^|$>P-&D8HCmvvWTM0^98!s_3u4WgnnG7agd; zW#7S}ia`n-quEh`nR6$HG6#&Q>5`IY*#ujys&|(24UD z=!xem(BUpnVBvnDL9=n;A_bn>iw&AL%3Pwrpns_X3-`+uSX5tbz!wPO3I)QCD;2m= zR~a;++SLk#=hrx7ajikKar!z1mP^+=a00}jWZ%n;3OoikITUfTLlw6ua7k`8s2JgH zbKnhF4t3n2!020_Ea+6V5@MbPp&nygukq#X}18 z_J}iK0o>3qod)A;iAn}|6 zf!Fg2tZ`mY;5mQMpxLJLk^%|omkl!hjaL--TfM5lR=L*fs)%l3ZzB;RUm*I zXTo;CqNV!sV7xn^y%Zk8?X$T zmmW>7K$2q$gPJnOlnN~SrfNfT@`n0{0)f{w3Ovr!Dv*JkPJu~ydWSS-a42F%1q%K% zIpi_31783!Xrl009m<%^f%=C*^ID!c6zJ4*Do`7p%OE9AoLhmda`PB8xz~9e>X=V~ zO5FSoMJ%8|4q!nA!oh_Maz^Tf4fqsjETTYAw5S0eE{eqzc+M9$XwJnip}^zZSAmgw zNrx(W44Ur`bsUOF6c{X10XF1!B2!@dSFXUCvoL7RVV4SAt4e{yZ|%Sd5QFAO_tFaV 
z_GJ`UU@hxV#&QZo`pYYD2Ujp?UR$-I0t4Af3KT|GR^ZQA#ULY}xT*qU`)UR`EBe(H znAO%$VBx-|LG!kkwG`;>YddfPM1i<{U57N*Qy`dF-+}sv0(G?w9je$!fu6XrL9@4J z69pceO%;fcH*+Xsa|hmlrNCUfr2@O7wo;%oY;91nMBYY$1l_g@{NdX<@C6VB;`Z$o z74rLstz@UG;LB%gW!6Atg z9ryx>0*i*R1~rAMlLhFy)ISV5tb|WhpkJM4(7dPMbOrW*p5ai%nF_pU_bdhSyJs6T zFXul;f!W|(hdRzvpxSr70u`(a9E!NmAmb0eNP+utu|dxI;}Qjy8J8+h`MJ!YjLQw0 z_ds2tz;2N%708!dWzbX_u2x{>b&UeE+O-a8TxU=*b6)R|#0?7kb#GMQQM<{YCYgD& z0uk~p0-Wus<5mT(?rjcf+^#@g^bUg#;ozMLOcQrGlySE~^98AU44S>Q_Zl?ceY;PA z@ce!SB7g@J2-O}`V0e8?PUc*&Q}z8&R;cX%4e@B z5aPV9z+C%=1N9FD3es;0WISp@^>>>iAlLNdFrHzRexqI`9P$hbq2T zU|9OWA&(yws5}4Uz!yLinCO2oXx{7it3w{YDKNbLuD~JJKNQ$t_NM_~){egvcs~Ai zp#Gsizxr2!sBfH!TJ?`k-h8fgTm|a2<0)_v$9G6$0)v7-V?qTILK7)4<4kPOeC}cr zhfYlD!1)gYK8GEXDUi9F+@Xpo6d13jG^pt%Qz#RvnMHxYa#jTb`q>Pc(@wK1FjCLqkj9(}Jm+&MkYSly zfhc1h1)`;S6`_ash)TIk2Dt)5JmwEUFhaXx=cohyrQcMHQ$`EapJ{ z!=Tx=u!I7gp|1mPz;Y;~$DxXj0+&5epkJj1&1Zl!1-1_53d~W30+Vp*z!yLiD2&tw z&6~EDQee7UT7f@&83iJMWgR&Gp+JPZyg|*1eg%c*4Oj|1I4dboX;|5T^B)S_!BqwL zARJ!+G3byyTU~+86l*9jN3E&A!?~6N^$&+U)=?lKw5~xy^>sZ3*3Ro2q@?sVP~b7x z(4hHL{YDBD^)^=EMr~q{li%G`fs*NF3e=r9SKtnAp};)2r9&B885G>}trZBPwsGM6 zhXT>kb`ELuQ=p4(?@&g6gOVca00sV20}U!7%N-Qx41*l1*inJ0XeWc3*?DIHHZEo{ zSb>;f7X_BDyBc(uqlOqXb%UV_EW?K>FkTHeXuh2_LV@wBt3Xc-1HO0_BNgaZqZFvP z@1{V^u)6{y^&ScY+j}|`v6n&f&8fW|s@O+?1jN1u1r^2p44TU5{t67Q2N+ZYEC(uZ zNe(h-HaZ-vKumpz0&9aq9XfHC1Lr>!=%Pmm@R!QsNCmp+XoKbxq(>Pv`x%Z_An+RF zK>foZk7Es*6QIXA@CGae?%)XuJkBQ?WTcQza^MT-3gnPaR$#?)iURliR0U$6(-c@i zo^Fs+7&$|M2j@%$hK#cu(l}dzk@_43B7k!hxVq;#)N#H+!N)5uP$21ap#qDzixdcJ zFIFIKzr=y_9}0xEml^O9{}_bP`pu6C&78U?z~wGL%m=TOD<3Us&| z3@XZ*H!84vy-9%|c(Vd)wObU30B&{Q{D%YQKNJWJ?l5Qu`?{sC65_&NI*QUz*zEx z0{8q$gM^6eDFv1?Pdo4iEQcbVRp1YQPJz0?^A2^qV9*@%cu|1?`6UP5fTh4Tl2;t~ z0*C@Bi`NuL(7kSuaR=XUp#GsikouNEGZnq9z=QLS0_C%J6_~c)bExBe1-j@53T!F( z(4b&q`AC5f=VJxdIG-r6O8wNKh|d(5AU}8D4Oj*xSNBT=^1EL-l<~C!SN9u(<})ha zDlmk9r$86|UV-TK2ZQEY%0DXbJAP6i7X8_wieD7SE&VFMMt$lZ3gi=iS73YV9|q0G z4F7bX{-Hp5;BSM3veQ2fo%mOQ$9bHITlJ5m(|iVaTm{}dIGzG)gYgZTV%P)@d;vs( 
zz-uA}R$dbu{P+GV1tR206^O|C807Sl$rP9%CwHi03WH`3%ajTPEK@1)1E)498J4C| zU{lDn3Tz~u&Y;=KHN65|bOr@h+cP>)|1hXo56-N>Qf3wfa!a!+kocWVfxqtT3Y<8c zLx6~d^B)QfWOF%i{zHMPu2E^Lsp z#bXf#0+vM$GQz0E6!=dquE2w{gaYZEz7AO|slfBmV~~@2=_rsxP867(Qw2t!%%O~2 zfx)s+VEaX>Kr*v3C&q_q5_fS zN(%hIl@-Xnu3}KJO?OoVX0_EE(pcS~`Ihn;3QQAg3J{%Vv6cg0Kv&?Nuj7!%x(3Zx zS=Un_5MEz_P0$-CFym~fKs{fZrf#1~Bp+GRPw*n=IeH2)E?W;g`a6bp?9|{b8 z2RLy4!-4Z32F<&74t7Z65C_hGC=kgUra(I3aEB_6P@uOTslaxz(F#BQV#*$+lvN+BlBS<~op^g(2SeTrsz{2Dt1yUAc4VoD4WCiNyrzns$IMpGK z(-a6lPFJ8)pW#6LLxJCMmO;TCJlmm)a}>C>=PD3ooM%w-XPmFVf9e7SR$dn>(04C# zsN-S<=D|x8h<7h_$m22xzJP8}F&kWAP_sV2Qh<*GmT{E=L-^GO9s2Gy3OqR18YH9> zu5&2kdV}Ws**7S#48PGKi<=Z!(cf&4ka!@C6VD z&VM*?{=?_H3!arC@_D#;gH0e4tcz#K&O6NfjIFUgXY^n?<#N+-&3Ga^*+%0QbUs0 z9I5y~{~dnfhYEyuA35alu>%zl2Tp(p6wUjrK2u<<{@kEhJ%8a)#+L?7h43o{0>ZBq znhJ;l9q3z!BEEB|<9h{G=sy^wT=pLgGQy&t6sUCmtiXNu0MDI#I-6-6>X=r6Wy5p|{Eq1rsQ1mFz@Fb39nzS|pjph!tiUCiMS($PRtG8|3haoR z-64-T9EzCJfiHj<6fEQBHYf?C=20NcF|R>IHe)^oc3{n~!2MXjpym!P=#a%i3T&KS z*ntX&0Q-|Vv8X|ZttX2qF#0U6z(~D>K~u2lt3YpGQh^7jM}f=UQD6W_6bL6$gQh5y zDX<43H^|7V6$;$JQh~BXWzc+rtaeCaDFvdqr5%b`MuET8vI=awUCx0Dhyo+S3JP4S z6&;FL$)MohSlOYDRSZg2wyP@8;Z`$fUShbq0(G)A6jh{c5m76}u>~PTf_3fozBZHLRfq1?87v3Pi}m6paSB+2@nN3^@$EuoTNZ4Zma_p5C`6X<-i-T z6bNxnQ=rm$x1u{jKD=>3jp}=f#r9tzlud5t7akT>X{2GTcu2o>j zxK4rk()A8$+@L_5c%uTDqMIDDxY>a(fEYAcX z-yw|;6zFgt8kEGlA1M&Meyl(#>Fij{*gre-)Uc#+kHL|45VO)r#Xf@CGae5-H<5WHEsP zq4R_eSxn?m#Ka2p_DLN00*C^SS|5ihCR1RroZO(vSx%wA9h_2u8D}a5I`z~FOf1tV zFz!yPK#puWgJ%Ed^bXWN6bPecbfEsBz>qPs17AQ_po`AxP{(WvFQ6+B+@%gxWD4BDT!BPmA;24-`2vUn zYtBl6^iJ)-8?X$T9P-i*SuEpF#Ig!h!=UPXb# z)~W^xE0)z9idfx&FQ6;1##vK=8?}}}N(8XB18=}`C}LfQGS*X|cDKGk#xuBq0(qYe z6*x_{kwMOYys-kM#!UM#Y?28S!~)E;4w5%nFZK=&E#kj7C8jJroWa00}jc~$Z;4pkhh zK=J-K1(F=c8#JdHPEcT^K2d@E)kz8@$;UcS|8SuGVbHwI;#37H4W}t^qfU3I;tU1G z-7_7ie<+Y=INO0YU>Q_g-E$rI0*C_n#Pbz6Gjf3f$(jot_yW2?v#fh}pb8Z=q8+Z0Fw-mXC4b%#S8cPcPP-KD^_x?6#O|LD`MfAHGuGR9RP0vOMsj`0;3kS9=J^qJ70nVlz6U>!BF0)O2}6j)76s?gLw 
z9QXpdLmiVVkQ|u8p@=CJ*uyfFL33VqY6UhfPNP5$d0GY5wbL17ESIKNptsN9P{)i4 zq+VuHV6{E70>RxZ4oS>v(7d{OHU*}l*%g>^=1}1InA4!yNII7SQ|jCbqz&g$pkz9) z1784fC}VzyIuw#ImRY6XaqFbhyP0n$_JB3QUlF9qL$8 zfh0$d0ufoqf%=C6@1{r{>c|urOL7G=!i55tq*Ne%5u+UjT8aVhsfz=QSOw zSj(Wp4_w=U`iBCs=(-Mk0Yrh&d3^`Ye<(2BZRk+OMh4A`t~Pey3+M`rKASr51#|@h zugw*RueMO&mv8A%$5smD0Je6Z{$bGU@!D2_@MAj%zJRX4jI+H1^$!J3a}01uW1s@P zeFp{l?jVEaQ!P6>@C9@S-hidRB^j*1?7WKtq1vtrBpQb(@B@cBWHHR3;P}vR1@^;@ zP~c4yT?M9zP+$lj>5#-IhdOpMXy)_X6{r>M;ZVe$4rT16K)>2sf%$wNha&bhXil>3 z=TOA{3KaDYaNrGC4tX4;z>smULlTELWO1khA2xSR#*hDB>uC znm_z#hb+b@aBGhd;62oQ0bPNZ;W!0$U?1;L#0d(71}7>|1vp88%`IaUNCKX0&@7Qp zQDC`rssgw6GzaP*3Ium&C~ybQRAAY6mI7;>vmN*XxfcFK)9&VgJ!yWL4iWQKgO3XIgRJMaZ`1y-qVDlo{r<-qw51p?uB6qq^RHE0%b?>X=V z5Cz8F4-{C}eyG6j_{gD-j}-`=KQTzD_I>J*#b*k1hR+pvIKOb9{-HpJ`$~Zb`D=sb zkia(ztc1T+U}5r|0$2BYhcbRJ$l2rdqXLD)pA;x({;WWT<)Fz zp+E#Mr$ICB&gH=Q4+VA=&*M?b*Zlconc7@=Gq1oX27p|Fi&!Wy z(U%4#35ZI8&QKdvEc=#HAal300vB-^1@@OLYfv}!4+VmW`momGHU-P4v2+ z0(qbH75GnW;K2K@960}BkW!r7Sb^>Mn<&t)Hg%|CGlPs>-kUq*v4sQgzfvH$+e(3G zX={ffwsGM6hXVa-JBK3rDNuabUV(&Ae}m@L)dL)|80b*M4hr;=K?>{_-BE!+cqaui z6FWO3G1!6g9}acws=zX1h(i%W9qJgSz(haXpx~E}P+%F}RbW*V4tb1JV8$8cK>fp@ zq>JvZz_`1I1N9Gsiaf(!1~m(;y%i|%?4v-4v#$cFm;DqNefAe%9HaiBK+JHU0$H_# z6sWi#Y`{DbhbRzu9cs|L8SgL!p7X;MNI)FnP{)x5DIIRK0yEB02F=Sfk9MeIi~_;- zF%G={N`d32$0=|Jk2lDPmQGNh@1E$8#Yql)0mPw>lNGohrx+CEEKgM+dOgh{kJA-c zd7Ys^P;{n47H26C0i5l?`40u+l5-uXe>jwJz5;jf0)vtqz=aAt=NB1N41E_H)TDPV zQJ`O4s=y_=On~=jaQ;JqOwknz>{Yu`f%xhw1bOsVaPWQwLYxO2I`Nn<8h$%rG_j`3Adln ze}^gZNdpe$##0Ww0ZV~(>N5&Fh0iJwNImC}#`6k{?k_lS0>q(;mlU`pFDozvyy8&D zs|L-BU0+k+m%pyS?|8!@jW-pDc;9lM0;0eo`5l8MoAIs!ao>9iBwyZFpj&^SKoaRg zgOX?JBL(K1j~$BmM1k1yQw8qeXAWh2Zcy=+qc0Q~n7<2-sH&JW*de)bWi1nV)YB zY8GYRDG-2s?~uh03Os5*Dsb6;#xZfg;4D27JUe`Y3QoCR3myHMs*{08t=$Kczz%Qz@{)b!rE`fo{;uYSTIt zF`WY8-Si5i8fQ@8iJH-Y3W!4X;VJi9^jMu0gK=vQ+puoj+6 zffU5t3S=7QQ6LVVSAn-D%%{L$Iln^@3pkXqph3a#x{v}taA5^T(M25cSk!@UpgYvD zxB}~eo<50%B3KX~2Q($6QUx94K1`6!v+)#lV 
zwUGid=f)15*u)@XKHpScBV94Vn|^hbs_e9N|#JkqX?X(GI);OMx}c(F&w2#yIc=5CtC2V-?uKf1Cqf z05M4UPo3b9#EA;zOHOj20-`{v>STlFNZu(5Jm;r6WO1595vMCK8=RrQ{Ww#BIqEEf zoI(F=1@hBIaG160=sc8aY*A*hb%5r zVEu8q0%@u%44PQl_m3zLw?FE@H_#pU0*C^8Yo0L38Lysn$m1!8 zBA!-Y{qc+fm7HfCig-?evHf`kX6F|Sn)m6xsK7GgB?ZQkmmTVOMS)rERRsp**Btl) zhyo+^8xD26X;6|EeaoPt1o5^5UjR|yF?d&j!0SDOnh@uG0p6um#|H`|AU;$e3jfHU zL%{N}0zdE*hblf*;Oc&6kdW*9T!G*5g#sD+FBMn`e`Sy|kbUit#WxOo0mPwO)a602J1)|r# z9g_IRA&q|(NIH!(MZ5k{4Nfj3|oG#}#J zMS)3pSBE@?IB)_)fw5$m0?)^A1=h7A3=&pFT?K~lP+&pNY6NdF80_R1G= zrUC)vSq?>=K=+;)rAHLL&il8bzH2#{Bel_m7hx; zlDJHPrRC)YDR=M+1@a|VDzL`6%7Hgv88q*LzQ%zsfG7|SUguE8^$L_yZcyNt->AS` zdy_$PIP_)(HUiz^P{yqajP186a1n2J-~@<4Gs4}ez~AaF1y&PxD=IrrUF~J-ZE&0rMDHB&);z<;$4HBVEa7<27vb!2<|>GXkPvEp#o*V zj}*v}eQeNdR{cbQA>&gA&VMLS2K?NhX0H9hfiIvd@B_b6;4%1GfpG8}1s>;b9XS7? z!0i0JK-NQW_k%+jKPoT@|D?do`LhBK&Myi)IKL_|P5h=nuJ3n)CieM5fy@4<0?Vbp z9J2V^f%6{!mK)CzRCX$&$J?$au;>t#9x z2IT1#$QsO`z+*6@0#EHs2F?4EW>z4qokf8TH>(1p=xhc};5EAfmwgV0JmxegnbqcU zsAFyg2K{*yn4{)ZAdH$%fo0$P4rwf4(A4f0RA4EykOObPQXq_4M1c(Cq6*xP#SCf& zvc(;E1C|2QU0(%4oFx@V*7PVaEOit(Zk-5_N2wxJpkHMUMdS)pcnbwq+ob~Myeb9e z^ICz4WhsN^_1H@*@H>`q$YWUrhNa~Un(W~63jDwo9I{x^p@@|fn1okW;MT69Kn8ME zgOsp#H3goJ)fFguui;R~nhJE$wG>EsudTpfxsE}`NWHE>PSR;T1^)2$6&U(9aNzuh z0`cxf1_dMB#tyszOMxAYn<~)TH!~>dSDPykmu#Ux0%A)Ae#ce{q!YF_sL0T7qd-Y$ zTLso?+bNKz(6+2fj3|oG!>KG z6`1b!P#}8UQ-Q47UJ4Wx_BLovZSA8#=5AkyI`%VY4&LssKzwz80)xzf3M||Ya>(Og z1yV1EC@^y#YETeH9j3tk-NO~=)JG^#+dk4EkI@Q*QAZh+%xXs~@b-%_1{F)oV-&cE z$0`tPALmfT@d`w*Cpgq`q5^e;lMHH>OJf!I>z=H@{Beo`fWeK{kzEFV`%S8(Gl8Y7C znR$sp#x!xM0t4A)3d{zV8#Hy!D-;+@u2f)}xJrS*>uQ7MTN~FX(0#5|ARBd^0x65@ z6-Y}Cbh>bEGcmFrf6iV){E1u{jqJCt#U0$GDQ9ryx>0t4CI z1~q@Hdla~c_bTwq?^9qMb-x1P-~$2-qI?0}A&rL=m<=9QAV_^gfe`0W1u`s;DKJeu zZqOl;dBTA&pewNWc}js_{#a9jn(#PJkZxQ}npWTPff zVBDQhp{ai;uB22D9LQD6W_6<8u?3Ou#B zL36C2aLA%mU=dd-Fv8Uioc~bZue-DZ=RXwayUQBnT&v|2NC+*jz~W~G1s={79XS7? 
zz$$fRgMv9~6$j3LC@>(erofoEx&!aOGT@8tv8Dn+(OL?`3~M_Sv5o?R<+=*Q?dv)4 z1rUR#GP;2RWBY~*6kj${;8ELHfujqXC=hIKYEUy|Y^K03-&}!ddkY2n?v@JVmbP+8 zV`~9k#MX&z9E#Xhff#N(1^)1U1|53h_6E(l?fwd6qXszCF;Ibg`wj*vdALCaIccgL z6(~FHq`-5&vqKew6`0j_F=!6U?W#cRGem(U@=yg55W^hm7_LBv8{v>eSAn=B960}> zz-1rh!1)h@W`VW40{vGvoFp24FX zIRBwQG3*!xy3er&2`jJT6qvk@S0KJR!60SyInkktlN7jCV;#yk*`bP46o@iTRiH3( znnNc}ci{b31{vYUnGRK)r9h?OY=<(=QQ)$l>yX8H3aoL?S0Eg`K!E}ILW3q1c98;4 z?ZpmBT%y2$e5pekmpSBdxdJ(|D-@`!U8z79y~>~|b6oAviE9)Xkgs*%3+M_gWv&NW zUuxhjn`HHG(0`|S1C~P_Hz{y^Z+1xI76mE?w&*t9!o#-#~ZZ1c(Al#fKCK2p?7;m->hTo4Fo!;0;&`Bqtv? zsF=QSm=Jg>kG#TN{kvo9|y@UXq)P{zv& zOscOq@C|f_I$l%YTD|U2#~TV{W!_XEwtUMWiMJh!ct?S-=v@VNf4nEash>`~?~ud? z4pcxC2s%GfV4?A`Lmi(ekfQrkfqCsS1uo*}4txPbfjrKa4rzR4(BV=0+M$SV6v)?p ztH65uI|X8Z?;SV+qQKbxqXQKXha!Go$YX1VGPY5mPPVNAIhO4VYH}?79J1J6fki`q1&Y4|9C!nkLlrwX z@C|eYhVUI7irC4ajGYzOFEUtxVQCi!-hkzh#SjIasG$N`bBJ%40)xzOhfa)8U{T$5 z$RZS&Yezb80>q(;-5l!JU4hm19u8^j>A(pP1+w~kJ5;fc0-5T49jJgf@CGc0G7eB6 zbUx656Ce(J0mPw-Lma4pC=gj5=0F9+pm_)X5e``#=}^UJ1(r)kIplG)0x`oF1@fZD zIAn3G0{!YZ1(G$#J7jT!Llq}F@CGae?)g}UBu;k7;}nN7PIcf7SPBG?r#o;0#2}$! zd!_>O`B@4C^k*xuv+o=Qg1d7SsMwyTK!kk00-gE-2Pz;2&6$yl9E!Nup^Qr$s<_ml zj>{A%dta`=fP95R8CN>+4Ri%Y>Z=`+xW<7BhyxW61?tGxD=_Zfpg=Hjqd`-0yUBq! 
zU^(P*ivqQ{TNMb^Zc`wIbh|?ycR26`5CyvET@F>;t-vzl9)sp6^t}rF!22AMxZfd< z2NZZW#e)iDiXL((;$a2W29GH4a6W3#97cRhfppvB3e?q}P$0N_(xHr}9QXpd0x8yK z9H@UNaIKzm$m4khR;e#2aIIc+;0quQyaCIgWZHg3fs)Xx3S=N(Q=szvx&mvQHx$^J z@uoq=(D#-C>)N*!=U=MmX@D5 zr17Z(CqNXa8+`7N#TO1me5pWC^p!&zUn_9%<{Jf;uirW(@ts2v-z)HZ{2;&y$s&GK zVCMYEfiHk4(0zVUARPQvfs=Q?DbRg>H~8-jSPpsoslZeFmjhowcc|hY1@@Bvt3ak` zoN3zikG#VMjByos&c{=rwmrTADenm!_yULmCAx_mvY6PQ2~sCfVE&j?fjmPWhbksh zpwu|I0=4Za9MYK5fj3|oP?wFV9ryyeLR0@xV1YHALmJaN`#QNTWxAUea;M zB2i$kL8?IJE_3Ka?!Xs76c{o}hbk%s#**5hilr3D4lZrbeCT5t1;+Me9m-fvfhTHt z1^&7#C@_St=#a-s4t1<-z(?6)6^A5Nb;x2h1@fY+JMaY%ha%Q=sADY!qSv(*=+x^d z5M`|Ez!%UJIBT)KLm3+=&`UOS-~@<4&BU^?Ll&DTFobWaz*w@G18=}mV365Df!f5D z3ItwT3Glinz5t@YRJ4r)^$!JBMcX;l(NBSQg>0|D9qg~bU^&2`dF8=C1@dq^I8-r6 zfe3j=2kIXVRqU+5l5DU;8oM~;v8w_(nIR5o3{@Z&9p+HPaECfZC@@lY9nuH|LY$Ef zRg7}r4Oj{cmb*LDv4=sze7>gwU34!6rn|izI02$SDr{c`Ci?vx^4MR2_jnwjz%M`0 zA&-L$QueDKtU%WJ5QicTRUq~`Oo0jVa0gC+DDc!C>5#=}1Ku$lM=3CW9Ie3oF~*^e zV-)BN$2#x@5Cw7o$2%l(f@u&mmKNN_$ZUV0Kw|4dgXR^gA1P21 z{#bzxv7ac=OFnhr3+M`TpU(|)W`i#r^7vALgy~lf)ISs`pM9gi(S>gnSP6fpz=HRC z1@6ZW3WNqf8Z-wUe{#s;X9bevzbFt!{i;CO={JWges|yvSPn(}sX+DhF9inuza6sp z$DxRS6-XzHGi|&6Q5Ia?aUD4SVNf!bjPF4GLxG2LLIt9Xi5xmHu>#j>5(nxZ3e@iU zC{Vkb%z^rc1N9FF-hidRv9zfSDztw7y*8U@DPX%)DG(-|}`FqmF}foui^j&seZ zz|1+50?)_H3ZyJ%amZs<1$z5z4rR>lz#Fg>h%D!H$YL&sGUgT_I`4^j9C!nk0#|oF z1*!n^D-b6xpujy}&>@e76u5{BJCw1A1Lr>!*w3(-L5Do`;tE8TODM3w>Z?F#u%rWD z05M1iaXJd56A}fMeW?TIKNJYha|K=&R4CBfO9$#73Ivd~Lmo>hkV#$Ip^9Y`SOP3- z&{SWSbLhnK4r#2Qz|~z*fwl8W3armpR$w4o#epw?7&M=dTTOw#)#?ts0ZW02WlaTQ zxV01*WY#vw2~yWlAf>mi0t3K$3iPY>9ryyeLme9`aM?FfAhO)ppkON6#DMp%$EFT^ z0o{Q&U@1_W*g}CJV@n18R$D0$>2K}87tj?5Ik#0HTf3b>#jWkyKdy?57^C zKxTV{17849;0K079wP-f3sl4?1(q4RDUf2_-64xT9Lm^Jfq;H5hcfmy=x{)C9|i8m zz7AFFr@#twe}^ItaH!)z1-7Ieq(Iv6U(`oa}|h|&QoBOdcFd4 z)CCUIKMb0;t6$`h$HfkHT%tg4ztn;A9|~+Xx?F*h&=n48TwtpAJ;1|w%?#YuJ1;JiuK@43M6H3R$z5^ivwQ(ao`Ie3IyA?8}JJExI=+; z?VS!)+@-+pxLbjMQIXF3cRb|6nOCq1_{IVMFrBO zOA3^!E-R3jTv1@;yy{Si-wg^BovtZxs_bzmti30h@ 
zp9WuV08t>O{!4-VQGY8?b$;qliDwFwik~Zx>c23^j>UMXKx25NKo#e;1M?pWJkI|Z zWa~xWIHd7bfoYC^6-bxfITYi)12=#uFwp;}Kq2LW0w3_BL6vX!lLFz#X9c3ZFAmjc z)c(8qk92sI{G+i0^B)Qf^i3SN0YrdV@?tbq;E8JHkVbO_hKUvq+<E35iL22j)K%=u0{%(2;d?C`Km*DifU* z$nLs0a09vm?W(ImmU(qkU{zRm2j)K<%JHKDp;}LaY?E>?2hP7zU_9@wK!@DNp%i@; z=#cvNkPTi#6^Ki63Up+{6sTnkcgRQHAf=v| zI8>vcz;9kuppsfrAZ{-^q>(DH`MqLLA#SfK5VwzTD8xvIQjAhy`fRjAF~&Hs|3iWO zwPOvkF>0Iw>C$)wmPSlapaYnwK(sW;f%C5%sxeuB!{F1HqQDZtsS31`X$pM1(-nx8 zW*FodSY|3Pe#}y!1DLHqWI4wniMa*|L(x2kV$4@yif(}drI&>c+<J3Y-V9)PWn&75D|0Ik5l3A&nXZ`m5y*Nvu#{X+*6-ktb@U0t3q`hjP?8 zRARLPf$$mya<#P%+yJ6LoqD|jvr8Kkn3mmWP~w=YpA}esQLjLnu}Oi{a!r2XL>aH_IIjn gO~Zzbbt|I2W_w*UtlLz7`c%_=u9QnoZ>!Dy4-t7#cK`qY literal 0 HcmV?d00001 diff --git a/k8s/README.md b/k8s/README.md new file mode 100644 index 0000000..141eb9c --- /dev/null +++ b/k8s/README.md @@ -0,0 +1,15 @@ +# access the current workspace +kubectl exec -it -n bioinformatics $(kubectl get pod -l app=digital-patient-nextflow -n bioinformatics | grep Run | awk '{ print $1 }') -- bash +nextflow run test.nf -profile k8s + +# DEPRECATE: cleanup error pods in bioinformatics +kubectl get pod -n bioinformatics | grep -E "Pending|Error" | awk '{print $1}' | xargs -P 10 -I {} kubectl delete pod -n bioinformatics {} + +# sync data (/data/bugra/similarity-search/utility/vec_db/nf_fingerptint) from node to workspace +kubectl delete -f k8s/job-copy-node-to-pvc.yaml; kubectl apply -f k8s/job-copy-node-to-pvc.yaml + +# run the actual nextflow jobs +kubectl apply -f k8s/job-nextflow-digital-patient.yaml + +# [WARNING] destroy the curent nextflow job +kubectl delete -f k8s/job-nextflow-digital-patient.yaml \ No newline at end of file diff --git a/k8s/deployment-worksplace.yaml b/k8s/deployment-worksplace.yaml new file mode 100644 index 0000000..cfe3473 --- /dev/null +++ b/k8s/deployment-worksplace.yaml @@ -0,0 +1,72 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: digital-patient-nextflow + namespace: bioinformatics +spec: + 
progressDeadlineSeconds: 600 + replicas: 1 + revisionHistoryLimit: 10 + selector: + matchLabels: + app: digital-patient-nextflow + strategy: + rollingUpdate: + maxSurge: 25% + maxUnavailable: 25% + type: RollingUpdate + template: + metadata: + creationTimestamp: null + labels: + app: digital-patient-nextflow + spec: + containers: + - command: + - sleep + - infinity + image: nextflow/nextflow:25.04.6 + imagePullPolicy: IfNotPresent + name: nextflow + workingDir: /mnt/dreamdock-data/digital-patient-data + resources: + limits: + cpu: "2" + memory: 4Gi + requests: + cpu: "1" + memory: 2Gi + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - mountPath: /mnt/Avatar + name: avatar-new-volume + - mountPath: /mnt/dreamdock-data/ + name: dreamdock-volume + dnsPolicy: ClusterFirst + restartPolicy: Always + schedulerName: default-scheduler + securityContext: {} + serviceAccount: nextflow-sa + serviceAccountName: nextflow-sa + terminationGracePeriodSeconds: 30 + volumes: + - name: avatar-new-volume + persistentVolumeClaim: + claimName: avatar-new # CHANGE ME, RELATE TO nextflow.params.input + - name: dreamdock-volume + persistentVolumeClaim: + claimName: dreamdock-data # CHANGE ME, RELATE TO nextflow.params.input +# --- +# apiVersion: v1 +# kind: PersistentVolumeClaim +# metadata: +# name: digital-patient-data +# namespace: bioinformatics +# spec: +# accessModes: +# - ReadWriteMany +# resources: +# requests: +# storage: 6000Gi +# storageClassName: truenas-nfs \ No newline at end of file diff --git a/k8s/gpu-test.yaml b/k8s/gpu-test.yaml new file mode 100644 index 0000000..90c599b --- /dev/null +++ b/k8s/gpu-test.yaml @@ -0,0 +1,18 @@ +apiVersion: v1 +kind: Pod +metadata: + name: gpu-test-pod + namespace: bioinformatics +spec: + restartPolicy: Never + nodeSelector: + gpu: 'yes' + gpu-type: 'geforce-rtx-3090' + containers: + - name: cuda-test + # Using a standard, lightweight NVIDIA CUDA image to test the runtime + image: 
nvidia/cuda:11.8.0-base-ubuntu22.04 + command: ["nvidia-smi"] + resources: + limits: + nvidia.com/gpu: 1 diff --git a/k8s/job-copy-node-to-pvc.yaml b/k8s/job-copy-node-to-pvc.yaml new file mode 100644 index 0000000..40772fb --- /dev/null +++ b/k8s/job-copy-node-to-pvc.yaml @@ -0,0 +1,54 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: job-data-copy-digital-patient-node-to-pvc + namespace: bioinformatics +spec: + backoffLimit: 6 + completionMode: NonIndexed + completions: 1 + manualSelector: false + parallelism: 1 + suspend: false + template: + spec: + containers: + - args: + - | + apk add --no-cache rsync + mkdir -p /target + rsync -av --exclude .git/ --exclude .git-old/ --exclude work/ /source/ /target/digital-patient-data + command: + - /bin/sh + - -c + image: alpine + imagePullPolicy: Always + name: data-copy + resources: {} + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - mountPath: /source + name: k8s-node + - mountPath: /target + name: pvc-volume + dnsPolicy: ClusterFirst + nodeSelector: + kubernetes.io/hostname: k8s-node23 + restartPolicy: Never + schedulerName: default-scheduler + securityContext: {} + terminationGracePeriodSeconds: 30 + tolerations: + - effect: NoExecute + key: omic-app + operator: Equal + value: similarity-search + volumes: + - name: pvc-volume + persistentVolumeClaim: + claimName: dreamdock-data + - hostPath: + path: /data/bugra/digital_patient ## CHANGE ME + type: "" + name: k8s-node \ No newline at end of file diff --git a/k8s/job-nextflow-digital-patient.yaml b/k8s/job-nextflow-digital-patient.yaml new file mode 100644 index 0000000..34f4d72 --- /dev/null +++ b/k8s/job-nextflow-digital-patient.yaml @@ -0,0 +1,52 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: job-nextflow-digital-patient + namespace: bioinformatics +spec: + backoffLimit: 1 + completionMode: NonIndexed + completions: 1 + manualSelector: false + parallelism: 1 + podReplacementPolicy: TerminatingOrFailed + 
suspend: false
+  template:
+    spec:
+      containers:
+      - command:
+        - /bin/bash
+        - -c
+        - cd /mnt/dreamdock-data/digital-patient-data && nextflow run test.nf -profile k8s
+        image: nextflow/nextflow:25.04.6
+        imagePullPolicy: IfNotPresent
+        name: nextflow
+        resources:
+          limits:
+            cpu: "4"
+            memory: 8Gi
+          requests:
+            cpu: "2"
+            memory: 4Gi
+        terminationMessagePath: /dev/termination-log
+        terminationMessagePolicy: File
+        volumeMounts:
+        - mountPath: /mnt/Avatar
+          name: avatar-nas-volume
+        - mountPath: /mnt/dreamdock-data/
+          name: dreamdock-volume
+      dnsPolicy: ClusterFirst
+      restartPolicy: Never
+      schedulerName: default-scheduler
+      securityContext: {}
+      serviceAccount: nextflow-sa
+      serviceAccountName: nextflow-sa
+      terminationGracePeriodSeconds: 30
+      volumes:
+      - name: avatar-nas-volume
+        hostPath:
+          path: /mnt/Avatar-NAS
+          type: Directory
+      - name: dreamdock-volume
+        persistentVolumeClaim:
+          claimName: dreamdock-data # CHANGE ME, RELATE TO nextflow.params.input
diff --git a/main_borzoi.nf b/main_borzoi.nf
new file mode 100644
index 0000000..0c9af62
--- /dev/null
+++ b/main_borzoi.nf
@@ -0,0 +1,448 @@
+nextflow.enable.dsl=2
+
+// FILTER_VCF: keep only the variants that fall inside a MANE transcript and
+// group them per gene.
+//   input : MANE          tab-separated table (columns used: chrom, chr_start, chr_end, chr_strand, symbol)
+//           vcf_filtered  plain-text VCF of one patient, named <patient>_variants.vcf
+//   output: <patient>_rna_coding_vcf_with_mutations.vcf
+//           NOTE: despite the .vcf extension this is a Python pickle of
+//           [[symbol, strand, start_, end_], [vcf rows...]] entries, sorted by
+//           window length; PREDICT_EXPRESSION unpickles it.
+process FILTER_VCF {
+    memory 4.GB
+
+    container "${params.container_borzoi}"
+    containerOptions "${params.containerOptions}"
+    debug true
+    // maxForks 1
+
+    input:
+    path MANE
+    path vcf_filtered
+
+    output:
+    path "*.vcf" , emit: expression_output
+
+    script:
+    """
+    #!/opt/conda/envs/borzoi/bin/python
+    #Filter VCF to extract only mutations in coding sequences plus 1000 upstream regulatory elements
+    import numpy as np
+    import pandas as pd
+    import pickle
+
+    #If ncbiRefSeq data need to be filtered in different manner change commented part
+    #This part is needed if we want to change ncbiRefSeq filtering
+    #column_names = [
+    #    "bin", "name", "chrom", "strand", "txStart", "txEnd",
+    #    "cdsStart", "cdsEnd", "exonCount", "exonStarts", "exonEnds",
+    #    "score", "name2", "cdsStartStat", "cdsEndStat", "exonFrames"
+    #]
+
+    #ncbiRefSeq = pd.read_csv("{ncbiRefSeq}", sep='\t', names=column_names) #dollar sign is missing
+
+    ##Get onyl protein coding genes (ncbiRefSeq['cdsStartStat'] != 'none')
+    #ncbiRefSeq = ncbiRefSeq[ncbiRefSeq['cdsStartStat'] != 'none']
+
+    ##Remove duplicates
+    #ncbiRefSeq = ncbiRefSeq.drop_duplicates()
+
+    ##Filter out isoforms for longest DNA sequenc. Filter by protein name and chromosom(differnet version of some shromosoms)
+    #ncbiRefSeq_filtered = []
+    #for i in np.unique(ncbiRefSeq['name2']):
+    #    prot = ncbiRefSeq[ncbiRefSeq['name2'] == i]
+    #    for j in np.unique(prot['chrom']):
+    #        prot_chr = prot[prot['chrom'] == j]
+    #        d1 = [prot_chr['txEnd'].astype(int).max(), prot_chr['txStart'].astype(int).min()] #get lowest starting position and maximum stop position
+    #        max_len = np.argmax(prot_chr['txEnd'].astype(int) - prot_chr['txStart'].astype(int))
+    #        position = prot_chr.iloc[max_len]
+    #        position['txEnd'] = str(d1[0]) #end for isoform can be loaded from exonEnds
+    #        position['txStart'] = str(d1[1]) #start for isoform can be loaded from exonStarts
+    #        ncbiRefSeq_filtered.append(position)
+    #
+    #ncbiRefSeq = pd.DataFrame(ncbiRefSeq_filtered).reset_index().drop('index', axis = 1)
+    ##protrin coding genes only 23314, all genes 48420
+
+    #Load ncbiRefSeq filtered
+    MANE_data = pd.read_csv("${MANE}", sep = '\t')
+
+    #Load vcf
+    # splitlines() removes the line terminator only when there is one, so a last
+    # record without a trailing newline keeps its final character.
+    with open("${vcf_filtered}") as f:
+        lines = f.read().splitlines()
+
+    #Remove header/start of vcf file
+    # Drop the '##' meta lines and blank lines; the first remaining line is the
+    # '#CHROM ...' column header.
+    vcf = [line.split('\t') for line in lines if line and line[:2] != '##']
+
+    vcf = pd.DataFrame(vcf[1:], columns=vcf[0])
+
+    # Assuming 'vcf' and 'ncbiRefSeq' are already loaded DataFrames.
+    vcf['POS'] = vcf['POS'].astype(int)
+    vcf['#CHROM'] = vcf['#CHROM'].astype('category')
+    MANE_data['chrom'] = MANE_data['chrom'].astype('category')
+    MANE_data_renamed = MANE_data.rename(columns={'chrom': '#CHROM', 'chr_start': 'Start', 'chr_end': 'End', 'chr_strand': 'strand'})
+    relevant_chromosomes = vcf['#CHROM'].unique()
+    MANE_data_renamed = MANE_data_renamed[MANE_data_renamed['#CHROM'].isin(relevant_chromosomes)]
+
+    dataset = []
+
+    #Filter VCF
+    for chrom in relevant_chromosomes:
+        chrom_MANE = MANE_data_renamed[MANE_data_renamed['#CHROM'] == chrom]
+        # Select this chromosome's variants once instead of re-comparing the
+        # whole VCF for every gene (row order is unchanged).
+        chrom_vcf = vcf[vcf['#CHROM'] == chrom]
+        chrom_pos = chrom_vcf['POS']
+        # Find the variants in `vcf` that match the conditions for the current chromosome
+        for name, strand, start, end in chrom_MANE[['symbol', 'strand', 'Start', 'End']].itertuples(index=False):
+            condition = (chrom_pos >= start) & (chrom_pos <= end)
+            if condition.any():
+                # NOTE(review): the 1000 bp upstream extension is only recorded in
+                # the entry metadata (and used for the length sort below); the
+                # selection mask above still uses the unextended [start, end], so
+                # variants that lie only in the upstream window are not selected
+                # for this gene. Confirm whether that is intended.
+                if strand == '+':
+                    start_ = start - 1000
+                    end_ = end
+                else:
+                    end_ = end + 1000
+                    start_ = start
+                dataset.append([[name, strand, start_, end_], np.array(chrom_vcf[condition]).tolist()])
+    #sort vcf by length
+    lengths = [entry[0][3] - entry[0][2] for entry in dataset]
+    dataset = [dataset[index] for index in np.argsort(lengths)]
+    #save vcf
+    #get name
+    patient_name = "${vcf_filtered}".split('/')[-1].split('_variants.vcf')[0]
+    with open(f"{patient_name}_rna_coding_vcf_with_mutations.vcf", "wb") as fp:
+        pickle.dump(dataset, fp)
+    """
+}
+
+process PREDICT_EXPRESSION {
+    accelerator 1
+    memory 4.GB
+
+    container "${params.container_borzoi}"
+    label 'gpu_process'
+    // containerOptions "${params.containerOptions_borzoi}"
+    // debug true
+    maxForks 1 // TODO: COMMENT THIS IN KUBERNETES
+
+    input:
+    path vcf_filtered
+    path MANE
+    //path ncbiRefSeq_bigger
+    //path ncbiRefSeq_subset
+
+    output:
+    path "*_TPM.csv", emit: expression_output
+
+    script:
+    """
+    #!/opt/conda/envs/borzoi/bin/python
+    #Predict expression based on mutation
+    import json
+    import os
+    import time
+    import 
warnings
+    import pickle
+    from itertools import compress
+
+    import h5py
+    import matplotlib.pyplot as plt
+    import matplotlib.patches as patches
+    import numpy as np
+    import pandas as pd
+    import pysam
+    import pyfaidx
+    import tensorflow as tf
+
+    from baskerville import seqnn
+    from baskerville import gene as bgene
+    from baskerville import dna
+
+    import sys
+    sys.path.append( '/home/omic/borzoi' )
+    from examples.borzoi_helpers import *
+
+    #Load VCF and mutation files
+    #ncbiRefSeq_bigger = pd.read_csv("{ncbiRefSeq_bigger}")
+    #with open("{ncbiRefSeq_subset}", "rb") as fp:
+    #    ncbiRefSeq_subset = pickle.load(fp)
+    prot_bigger = pd.read_csv("/home/omic/borzoi/prot_bigger.csv")
+    with open("/home/omic/borzoi/prot_subset.pickle", "rb") as fp:
+        prot_subset = pickle.load(fp)
+    # The staged input is a pickle (list of [[symbol, strand, start, end], vcf_rows])
+    # even though its file name ends in .vcf. Only unpickle files produced by this pipeline.
+    with open("${vcf_filtered}", "rb") as fp:
+        vcf_file = pickle.load(fp)
+    MANE_data = pd.read_csv("${MANE}", sep = '\t')
+
+    batch_size = 4
+    #Define batch size and stop if batch size is to small for big transcripts
+    #min_batch_size = max(((np.array(ncbiRefSeq_bigger['txEnd']).astype('int')-np.array(ncbiRefSeq_bigger['txStart']).astype('int'))/524288).astype('int')+1)
+    #if min_batch_size > batch_size:
+    #    print('batch size is ',batch_size,'and min_batch_size is ',min_batch_size)
+    #    sys.exit('Batch size has to be bigger or same as number of big trnascripts split into chunks')
+
+    #Model configuration
+    params_file = '/home/omic/borzoi/examples/params_pred.json'
+    targets_file = '/home/omic/borzoi/examples/targets_gtex.txt' #Subset of targets_human.txt
+    n_folds = 1 #4 #To use only one model fold, set to 'n_folds = 1'. To use all four folds, set 'n_folds = 4'.
+    rc = True #Average across reverse-complement prediction
+
+    #Read model parameters
+    with open(params_file) as params_open :
+        params = json.load(params_open)
+        params_model = params['model']
+        params_train = params['train']
+
+    #Read targets
+    targets_df = pd.read_csv(targets_file, index_col=0, sep='\t')
+    target_index = targets_df.index
+
+    #Create local index of strand_pair (relative to sliced targets)
+    if rc :
+        strand_pair = targets_df.strand_pair
+        target_slice_dict = {ix : i for i, ix in enumerate(target_index.values.tolist())}
+        slice_pair = np.array([
+            target_slice_dict[ix] if ix in target_slice_dict else ix for ix in strand_pair.values.tolist()
+        ], dtype='int32')
+
+    #Initialize model ensemble
+    models = []
+    for fold_ix in range(n_folds) :
+        model_file = "/home/omic/borzoi/saved_models/f" + str(fold_ix) + "/model0_best.h5"
+        seqnn_model = seqnn.SeqNN(params_model)
+        seqnn_model.restore(model_file, 0)
+        seqnn_model.build_slice(target_index)
+        if rc :
+            seqnn_model.strand_pair.append(slice_pair)
+        seqnn_model.build_ensemble(rc, [0]) #changed '0' to [0]
+        models.append(seqnn_model)
+    fasta_open = pysam.Fastafile('/home/omic/borzoi/hg38.fa')
+
+    #Column of the one-hot encoding for each base (A, C, G, T order)
+    BASE_TO_INDEX = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
+
+    #Create mutations(s) from WT
+    def create_mut(sequence_one_hot, poses, alts, start):
+        # Return a copy of sequence_one_hot with single-base substitutions applied.
+        #   sequence_one_hot: one-hot array indexed [position, base], bases ordered A, C, G, T
+        #   poses: variant positions (VCF POS values) on the same chromosome as the window
+        #   alts:  ALT allele strings, aligned with poses
+        #   start: genomic coordinate the window was extracted from
+        # The input array is not modified. The copy is taken from the argument;
+        # previously it was taken from the global sequence_one_hot_wt, which only
+        # worked because every caller happened to pass that same variable.
+        sequence_one_hot_mut = np.copy(sequence_one_hot)
+        seq_len = sequence_one_hot_mut.shape[0]
+        for pos, alt in zip(poses, alts) :
+            alt_ix = BASE_TO_INDEX.get(alt)
+            if alt_ix is None :
+                # Not a single A/C/G/T substitution (indel, multi-allelic or symbolic
+                # ALT). The old code left alt_ix at -1 for these and so wrote a T;
+                # keep the reference base instead.
+                continue
+            row = pos-start-1
+            if row < 0 or row >= seq_len :
+                # Outside this window: a negative index would wrap around and edit
+                # the far end of the sequence, so skip it.
+                continue
+            sequence_one_hot_mut[row] = 0.
+            sequence_one_hot_mut[row, alt_ix] = 1.
+        return sequence_one_hot_mut
+
+    #Make predictions/ run model
+    def predict_tracks(models, sequence_one_hot):
+        # Run every model fold on the batch and stack the float16 results on a
+        # new axis 1, giving [batch, fold, ...].
+        predicted_tracks = [
+            model(sequence_one_hot)[:, None, ...].astype("float16") for model in models
+        ]
+        return np.concatenate(predicted_tracks, axis=1)
+
+    #calculate TPM from borzoi
+    def CalTPM(borzoi_data, start, cluster, prot_data, targets_df, plot = False):
+        # Sum the predicted coverage over each gene's exon bins and average the
+        # tracks that share a tissue description.
+        #   borzoi_data: prediction for one window, indexed [fold, bin, track]
+        #   start:       genomic start of the input window
+        #   cluster:     gene symbols to quantify
+        #   prot_data:   table with 'symbol', 'exonStarts', 'exonEnds' (comma-separated coordinates)
+        #   targets_df:  target table with a 'description' column
+        #   plot:        if true, show track 0 of fold 0 with exon boundaries
+        # Returns a DataFrame indexed by gene symbol with one column per description.
+        # Only the first fold (row 0) is used for the averages.
+        # NOTE(review): np.split on the first-occurrence indices assumes the rows of
+        # targets_df are grouped by description in sorted order - confirm.
+        tissues, first_rows = np.unique(targets_df['description'], return_index=True)
+        bins = np.arange(borzoi_data.shape[-2])
+        TPM_list = []
+        #loop over all proteins in cluster
+        for symbol in cluster:
+            gene_rows = prot_data[prot_data['symbol'] == symbol]
+            #get exon start and end
+            # Convert genomic coordinates to 32 bp output bins; the extra 32*16 bp
+            # offset is the buffer at the window start (per the pipeline's own
+            # comment, Borzoi cuts it).
+            ex_st = [(int(p)-(start + (32*16)))//32 for p in np.array(gene_rows['exonStarts'])[0].split(',')]
+            ex_en = [(int(p)-(start + (32*16)))//32 for p in np.array(gene_rows['exonEnds'])[0].split(',')]
+            #exon bool mask
+            # Boolean OR so a bin touched by two exon intervals is still counted once
+            # (the previous additive mask compared with == 1 silently dropped such bins).
+            exon_mask = np.zeros(borzoi_data.shape[-2], dtype=bool)
+            for s,n in zip(ex_st,ex_en):
+                exon_mask |= (bins >= s) & (bins <= n)
+            #protein TPM per person per tissue
+            TPM_per_tissue_replicates = np.sum(borzoi_data[:,exon_mask], axis = 1)
+            #Plot proteins with exon marks if needed
+            if plot:
+                #Will plot only first adipose_tissue borzoi_data[0,:,x] change x for different tissue
+                plt.plot(borzoi_data[0,:,0])
+                plt.vlines(x = ex_st, ymin=0, ymax=3.5, colors='red', ls='--', lw=2, label='vline_multiple - full height')
+                plt.vlines(x = ex_en, ymin=0, ymax=3.5, colors='blue', ls='--', lw=2, label='vline_multiple - full height')
+                plt.xlim(ex_st[0]-100, ex_en[-1]+100)
+                plt.show()
+            #Get average for tissue replicates
+            TPM_per_tissue = [np.mean(i) for i in np.split(TPM_per_tissue_replicates[0], first_rows[1:])]
+            TPM_list.append(TPM_per_tissue)
+        #create DataFrame
+        TPM_dataframe = pd.DataFrame(TPM_list,cluster,tissues)
+        return(TPM_dataframe)
+
+    
#Protein cluster list of lists: one array of gene symbols per precomputed window (third field of each prot_subset entry)
+    protein_clusters = [np.array(i[2]) for i in prot_subset]
+    #Get all proteins from VCF
+    # (the first metadata field of every pickled entry, i.e. the gene symbol)
+    proteins_with_mutations = [i[0][0] for i in vcf_file]
+
+    # Work queue of symbols that still need a prediction. This is an alias, not a
+    # copy; the loop below only rebinds it (slice / compress), so
+    # proteins_with_mutations itself is never modified.
+    proteins_with_mutations_working = proteins_with_mutations
+
+    # Per-batch result DataFrames, concatenated after the loop.
+    TPM = []
+    #run until the expression of all proteins is predicted
+    while len(proteins_with_mutations_working) > 0:
+        TPM_dfs = []
+        sequences_one_hot_muts = []  # mutated windows collected for this batch
+        st = []                      # window start of each collected sequence
+        cl = []                      # cluster(s) belonging to each collected sequence
+        #append proteins to a list until equal to batch size if protein is smaller, if it's big just run borzoi for it (don't append)
+        while len(sequences_one_hot_muts) < batch_size and len(proteins_with_mutations_working) > 0:
+            #get work protein
+            protein = proteins_with_mutations_working[0]
+            #print(protein)
+            #get cluster
+            # mask marks every precomputed window whose symbol list contains this protein
+            mask = [protein in i for i in protein_clusters]
+            cluster = list(compress(protein_clusters, mask))
+            #run borzoi for big proteins
+            # "big" = listed in prot_bigger.csv; these are tiled over several
+            # 524288 bp windows and predicted immediately instead of being batched.
+            if protein in np.array(prot_bigger['symbol']):
+                sequences_one_hot_muts_big = []
+                proteins_with_mutations_working = proteins_with_mutations_working[1:]
+                protein_data = prot_bigger[prot_bigger['symbol'] == protein]
+                # Pad by 16*32 bp on both sides plus 1000 bp on the upstream side
+                # (before the start on '+', after the end on '-').
+                prot_start = np.array(protein_data['chr_start']).astype('int')[0] - (16*32) - np.array(protein_data['chr_strand'] == '+')[0] * 1000
+                prot_end = np.array(protein_data['chr_end']).astype('int')[0] + (16*32) + np.array(protein_data['chr_strand'] == '-')[0] * 1000
+                mutations_ = np.array(list(compress(vcf_file, np.array(proteins_with_mutations) == protein))[0][1])
+                #only use one reference chromosome, in case mutations are mapped to multiple chr / consequence of liftover
+                mutations_ = mutations_[[i[0]== mutations_[0,0] for i in mutations_]]
+                # Columns follow the VCF column order: 0 = CHROM, 1 = POS, 4 = ALT.
+                chrom = mutations_[0,0]
+                all_poses = mutations_[:,1].astype('int')
+                all_alts = mutations_[:,4]
+                star = prot_start
+                st_big = star
+                # Tile the padded gene with 524288 bp windows; consecutive windows
+                # overlap by 32*32 bp (see the update of star at the end of the loop).
+                while star < prot_end:
+                    end = star + 524288
+                    poses = all_poses[(all_poses > star) & (all_poses < end)]
+                    alts = all_alts[(all_poses > star) & (all_poses < end)]
+                    sequence_one_hot_wt = process_sequence(fasta_open, chrom, star, end, seq_len = 524288)
+                    sequence_one_hot_mut = create_mut(sequence_one_hot_wt, poses, alts, star)
+                    sequences_one_hot_muts_big.append(sequence_one_hot_mut)
+                    star = end - (32*32)
+                sequences_one_hot_muts_big = np.array(sequences_one_hot_muts_big)
+                #if number of protein splits is bigger than batch size
+                #print(sequences_one_hot_muts_big.shape)
+                if sequences_one_hot_muts_big.shape[0] > batch_size:
+                    borzoi_pred_list = []
+                    for seq_slice in np.array_split(sequences_one_hot_muts_big, np.ceil(sequences_one_hot_muts_big.shape[0]/batch_size)):
+                        borzoi_pred_list.append(predict_tracks(models, seq_slice))
+                    y_mut = np.concatenate(borzoi_pred_list)
+                else:
+                    y_mut = predict_tracks(models, sequences_one_hot_muts_big)
+                # Stitch the per-window predictions into one long track axis.
+                # NOTE(review): 89 is hard-coded - presumably the number of target
+                # tracks in targets_gtex.txt; this reshape also looks valid only
+                # for n_folds = 1. TODO confirm.
+                y_mut = np.reshape(y_mut, [1,1,-1,89])
+                TPM.append(CalTPM(y_mut[0], st_big, [protein], MANE_data, targets_df))
+                #np.save('expression_predictions_%s.npy' %protein, y_mut)
+            else:
+                #append to a list of proteins to run
+                #get start and end of the cluster
+                # NOTE(review): raises IndexError if the protein is in no precomputed
+                # window (mask all False) - verify that prot_subset covers every
+                # symbol that can appear in the filtered VCF.
+                star, end = (list(compress(prot_subset, mask))[0][:2])
+                #get mutated proteins in the cluster
+                mask = [i in cluster[0] for i in proteins_with_mutations_working]
+                proteins_in_cluster = list(compress(proteins_with_mutations_working, mask))
+                #remove cluster proteins from the protein list
+                proteins_with_mutations_working = list(compress(proteins_with_mutations_working, ~np.array(mask)))
+                #print(proteins_in_cluster)
+                # Collect the VCF rows of every mutated protein that shares this window.
+                mutations_ = [list(compress(vcf_file, [np.array(proteins_with_mutations) == i][0]))[0][1] for i in proteins_in_cluster]
+                mutations_ = np.concatenate([np.array(i) for i in mutations_],0)
+                #only use one reference chromosome, in case mutations are mapped to multiple chr / consequence of liftover
+                mutations_ = mutations_[[i[0]== mutations_[0,0] for i in mutations_]]
+                chrom = mutations_[0,0]
+                poses = mutations_[:,1].astype('int')
+                alts = mutations_[:,4]
+                sequence_one_hot_wt = process_sequence(fasta_open, chrom, star, end, seq_len = 524288)
+                sequence_one_hot_mut = 
create_mut(sequence_one_hot_wt, poses, alts, star)
+                sequences_one_hot_muts.append(sequence_one_hot_mut)
+                st.append(star)
+                cl.append(cluster)
+                ### Test wt
+                #sequences_one_hot_muts.append(sequence_one_hot_wt)
+                ###
+        sequences_one_hot_muts = np.array(sequences_one_hot_muts)
+        #run borzoi for smaller proteins, if list is empty isn't empty(can be empty for last step)
+        if sequences_one_hot_muts.shape != (0,):
+            y_mut = predict_tracks(models, sequences_one_hot_muts)
+            for i in range(len(y_mut)):
+                TPM_dfs.append(CalTPM(y_mut[i], st[i], cl[i][0], MANE_data, targets_df))
+            TPM_dfs = pd.concat(TPM_dfs)
+            TPM.append(TPM_dfs)
+        #np.save('expression_predictions_%s.npy' %protein, y_mut)
+    TPM = pd.concat(TPM)
+
+    #add RNA expression for positions with no mutations
+    #load reference genome expression
+    tpm_no_mut = pd.read_csv('/home/omic/borzoi/TPM_NO_MUTATIONS.csv',index_col=0)
+    #concat missing expression
+    TPM = pd.concat([TPM, tpm_no_mut.loc[list(set(tpm_no_mut.index) - set(TPM.index))]])
+    #change symbol to ENSG
+    TPM.index.names = ['RNA']
+    MANE_data['ENSG'] = [i.split('.')[0] for i in MANE_data['Ensembl_Gene']]
+    mane_map = MANE_data[['symbol','ENSG']]
+    TPM = mane_map.merge(TPM, left_on='symbol', right_on='RNA').dropna().drop_duplicates(subset=['symbol'])
+    #drop duplicates
+    TPM =TPM.drop_duplicates()
+    TPM = TPM.iloc[:,2:].set_index(TPM.iloc[:,1])
+    #save TPM
+    #get name
+    patient_name = "${vcf_filtered}".split('/')[-1].split('_rna_coding_vcf_with_mutations.vcf')[0]
+    TPM.to_csv(f'{patient_name}_TPM.csv')
+
+    gpu_stats = tf.config.experimental.get_memory_info('GPU:0')
+    print(f"Current VRAM usage: {gpu_stats['current'] / 1e9:.2f} GB")
+    print(f"Peak VRAM usage: {gpu_stats['peak'] / 1e9:.2f} GB")
+    """
+}
+
+
+// CREATE_PROTEIN_CLUSTER: pack the MANE transcripts of each chromosome into
+// 524288 bp input windows.
+//   input : MANE  tab-separated table (columns used: chrom, chr_start, chr_end, chr_strand, symbol)
+//   output: prot_bigger.csv     rows of transcripts too long for a single window
+//           prot_subset.pickle  list of [window_start, window_end, symbols] entries
+process CREATE_PROTEIN_CLUSTER {
+    container "${params.container_borzoi}"
+    containerOptions "${params.containerOptions}"
+    debug true
+
+    input:
+    path MANE
+
+    output:
+    path "prot_bigger.csv"
+    path "prot_subset.pickle"
+
+    script:
+    """
+    #!/opt/conda/envs/borzoi/bin/python
+    # Use this if new RNAs transcripts need to be added to list of predicted RNAs
+    import numpy as np
+    import pandas as pd
+    import pickle
+
+    # Length of one model input window in bp.
+    WINDOW = 524288
+
+    def padded_bounds(df):
+        # Return (start, end) arrays for the rows of df, widened by the regulatory
+        # region (1000 bp before the start on the + strand, 1000 bp after the end
+        # on the - strand) and by a 16*32 bp buffer on both sides, because borzoi
+        # will cut them.
+        start = np.array(df['chr_start']).astype('int') - (16*32) - np.array(df['chr_strand'] == '+') * 1000
+        end = np.array(df['chr_end']).astype('int') + (16*32) + np.array(df['chr_strand'] == '-') * 1000
+        return start, end
+
+    data = pd.read_csv("${MANE}", sep='\t')
+
+    data_subset = []
+    data_bigger = []
+    for ch in np.unique(data['chrom']):
+        data_working = data[data['chrom']==ch]
+        # Greedy packing: open a window at the first remaining transcript, record
+        # every transcript that fits completely inside it, drop those, repeat.
+        while len(data_working) > 0:
+            arg = np.argsort(np.array(data_working['chr_start']).astype('int'))
+            data_working = data_working.iloc[arg]
+            prot_start, prot_end = padded_bounds(data_working)
+            st = prot_start[0]
+            en = st + WINDOW
+            mask = (prot_start >= st) & (prot_end <= en)
+            data_subset.append([st, en, data_working[mask]['symbol']])
+            #ncbiRefSeq_subset.append(ncbiRefSeq_working[mask])
+            if not mask[0]:
+                # The transcript that opened the window does not fit into it: it is
+                # longer than one window, so record it separately and drop it as
+                # well so the loop makes progress.
+                data_bigger.append(data_working.iloc[0])
+                mask[0] = True
+            data_working = data_working[~mask]
+
+    prot_bigger = pd.DataFrame(data_bigger)
+
+    prot_bigger.to_csv('prot_bigger.csv')
+    with open("prot_subset.pickle", "wb") as fp:
+        pickle.dump(data_subset, fp)
+    # The GPU memory report that used to follow was removed: this script never
+    # imports tensorflow, so the call raised NameError after the outputs had been
+    # written and made the task fail.
+    """
+}
diff --git a/main_cibersortx.nf b/main_cibersortx.nf
new file mode 100644
index 0000000..4465944
--- /dev/null
+++ b/main_cibersortx.nf
@@ -0,0 +1,260 @@
+nextflow.enable.dsl=2
+
+process CONVERT_TO_TXT {
+    memory 1.GB
+
+    container = "${params.container_ecotyper}"
+    containerOptions = "${params.containerOptions}"
+    publishDir "${params.ecotyper_outdir}/fractions", mode: 'copy'
+    // debug true
+
+    input:
+    path input_expression
+
+    output:
+    path '*.txt', emit: input_expression_txt
+
+    script:
+    """
+    #!/bin/bash
+    name="\$(basename $input_expression .csv)"
+    sed -E 's/("([^"]*)")?,/\\2\\t/g' $input_expression > \${name}.txt
+    """
+}
+
+process CIBERSORTx_FRACTIONS {
+    memory 4.GB
+
+    container = "${params.container_ecotyper}"
+    containerOptions = "${params.containerOptions}"
+    publishDir "${params.ecotyper_outdir}/fractions", mode: 'copy'
+    // debug true
+
+    input:
+    path input_expression, stageAs: 'input.txt'
+    path signature_matrix, stageAs: 'signature.txt'
+
+    output:
+    path "CIBERSORTx_Results.txt", emit: fractions_results
+
+    script:
+    """
+    #!/bin/bash
+    set -ex
+
+    echo "Starting CIBERSORTx Fractions analysis..."
+    mkdir -p /src/data /src/outdir || { echo "Failed to create directories"; exit 1; }
+
+    # Prepare input files
+    echo "Preparing input files..."
+    sed 's/"//g' "input.txt" | tr -d '\\r' > /src/data/mixture.txt || { echo "Failed to prepare mixture file"; exit 1; }
+    sed 's/"//g' "signature.txt" | tr -d '\\r' > /src/data/signature.txt || { echo "Failed to prepare signature file"; exit 1; }
+
+    echo "Running CIBERSORTx Fractions"
+    /src/CIBERSORTxFractions \\
+        --mixture /src/data/mixture.txt \\
+        --sigmatrix /src/data/signature.txt \\
+        --outdir /src/outdir \\
+        --username "${params.cibersortx_username}" \\
+        --token "${params.cibersortx_token}"
+
+    if [ -f /src/outdir/CIBERSORTx_Results.txt ]; then
+        cp /src/outdir/CIBERSORTx_Results.txt .
+ else + echo "Error: CIBERSORTx_Results.txt not found" + ls -la /src/outdir/ + exit 1 + fi + """ +} + +process CIBERSORTx_HIRES { + memory 1.GB + + container = "${params.container_ecotyper}" + containerOptions = "${params.containerOptions}" + publishDir "${params.ecotyper_outdir}/hires", mode: 'copy' +// debug true + + input: + path expression_matrix, stageAs: 'expression.txt' + path fractions_results, stageAs: 'fractions.txt' + path signature_matrix, stageAs: 'signature.txt' + + output: + path "CIBERSORTx_HiRes_Matrix.txt", emit: hires_matrix + + script: + """ + #!/opt/conda/envs/ecotyper/bin/python3 + import pandas as pd + import numpy as np + import os + import glob + import subprocess + from pathlib import Path + + # Create directories + print("Starting CIBERSORTx HiRes analysis...") + for dir_name in ['/src/data', '/src/outdir', '/src/intermediate', '/src/temp']: + Path(dir_name).mkdir(parents=True, exist_ok=True) + + # Read input files + print("=== READING INPUT FILES ===") + mixture_df = pd.read_csv('expression.txt', sep='\t', index_col=0) + signature_df = pd.read_csv('signature.txt', sep='\t', index_col=0) + fractions_df = pd.read_csv('fractions.txt', sep='\t') + + # Data quality checks + print("=== DATA QUALITY CHECKS ===") + def check_data_quality(df, name): + print(f"Checking {name}:") + print(f" NaN values: {df.isna().sum().sum()}") + print(f" Negative values: {(df < 0).sum().sum()}") + print(f" Zero values: {(df == 0).sum().sum()}") + print(f" Sample stats:") + print(df.describe().round(3)) + + check_data_quality(mixture_df, "Mixture") + check_data_quality(signature_df, "Signature") + + # Clean and normalize gene names + print("=== PROCESSING GENE NAMES ===") + def normalize_gene_name(x): + return str(x).strip().upper() + + mixture_df.index = mixture_df.index.map(normalize_gene_name) + signature_df.index = signature_df.index.map(normalize_gene_name) + + # Find common genes with exact matching + print("=== FINDING COMMON GENES ===") + common_genes = 
sorted(set(mixture_df.index) & set(signature_df.index)) + print(f"Number of common genes found: {len(common_genes)}") + + if len(common_genes) == 0: + raise ValueError("No common genes found between mixture and signature files") + + # Create aligned matrices with exact same gene order + mixture_filtered = mixture_df.loc[common_genes].copy() + signature_filtered = signature_df.loc[common_genes].copy() + + # Verify gene order matches exactly + if not (mixture_filtered.index == signature_filtered.index).all(): + raise ValueError("Gene order mismatch after filtering") + + # Replace any NaN values with 0 + mixture_filtered = mixture_filtered.fillna(0) + signature_filtered = signature_filtered.fillna(0) + + # Prepare output files + print("=== PREPARING OUTPUT FILES ===") + + # Save mixture file + mixture_filtered.index.name = 'genesinput' + mixture_filtered.to_csv('/src/data/mixture.txt', sep='\t', float_format='%.6f') + + # Save signature file (no transpose) + signature_filtered.index.name = 'genesinput' + signature_filtered.to_csv('/src/data/signature.txt', sep='\t', float_format='%.6f') + + # Process and save weights file + meta_cols = ['Mixture', 'P-value', 'Correlation', 'RMSE'] + data_cols = [col for col in fractions_df.columns if col not in meta_cols] + weights_df = fractions_df[data_cols].copy() + weights_df.fillna(0, inplace=True) + weights_df.to_csv('/src/outdir/CIBERSORTxGEP_NA_Weights.txt', sep='\t', index=False) + + # Print dimensions and verify + print("=== MATRIX VALIDATION ===") + print(f"Matrix dimensions:") + print(f" Mixture: {mixture_filtered.shape[0]} genes x {mixture_filtered.shape[1]} samples") + print(f" Signature: {signature_filtered.shape[0]} genes x {signature_filtered.shape[1]} samples") + print(f" Weights: {len(data_cols)} cols") + + if len(data_cols) < 1: + raise ValueError(f"Invalid weights columns count: {len(data_cols)}") + + # Additional validations + if not np.isfinite(mixture_filtered.values).all(): + raise ValueError("Non-finite 
values found in mixture matrix") + if not np.isfinite(signature_filtered.values).all(): + raise ValueError("Non-finite values found in signature matrix") + if not np.isfinite(weights_df.values).all(): + raise ValueError("Non-finite values found in weights matrix") + + window_size = len(data_cols) + print(f"Using window size: {window_size}") + + # Run CIBERSORTx + print("=== RUNNING CIBERSORTX HIRES ===") + cmd = [ + '/src/CIBERSORTxHiRes', + '--mixture', '/src/data/mixture.txt', + '--sigmatrix', '/src/data/signature.txt', + '--cibresults', '/src/outdir/CIBERSORTxGEP_NA_Weights.txt', + '--outdir', '/src/outdir', + '--window', str(window_size), + '--username', '${params.cibersortx_username}', + '--token', '${params.cibersortx_token}', + '--QN', 'true' + ] + + result = subprocess.run(cmd, check=True) + + # List all files in output directory + print("=== CHECKING OUTPUT FILES ===") + output_dir = Path('/src/outdir') + print("Files in output directory:") + for f in output_dir.glob('*'): + print(f" {f.name} ({f.stat().st_size} bytes)") + + # Try to find any file that might be our result + gep_files = list(output_dir.glob('*GEP*.txt')) + if not gep_files: + filtered_files = list(output_dir.glob('*Filtered*.txt')) + if filtered_files: + result_file = filtered_files[0] + else: + all_txt_files = list(output_dir.glob('*.txt')) + if all_txt_files: + result_file = max(all_txt_files, key=lambda x: x.stat().st_size) + else: + raise FileNotFoundError("No suitable output files found") + else: + result_file = gep_files[0] + + print(f"Using result file: {result_file}") + subprocess.run(['cp', str(result_file), 'CIBERSORTx_HiRes_Matrix.txt'], check=True) + """ +} + +process ADD_TISSUE_NAMES_TO_CIBERSORTX{ + memory 1.GB + + container = "${params.container_ecotyper}" + containerOptions = "${params.containerOptions}" + publishDir "${params.ecotyper_outdir}/hires", mode: 'copy' +// debug true + + input: + path EXPRESSION_MATRIX + path CIBERSORTx_HIRES + + output: + path 
'*_immune_cells.csv' + + script: + """ + #!/opt/conda/envs/ecotyper/bin/python3 + import pandas as pd + + cibersort_df = pd.read_csv('${CIBERSORTx_HIRES}', sep='\t') + expression_df = pd.read_csv('${EXPRESSION_MATRIX}', sep='\t', index_col=0) + + pat_name = '${EXPRESSION_MATRIX}'.split('_TPM.txt')[0] + tissue_name = [i.split(':')[1] for i in expression_df.columns] + cibersort_df = cibersort_df.set_index(pd.Index(tissue_name, dtype='str')) + cibersort_df.index.name = 'Tissues' + cibersort_df.to_csv(pat_name + '_immune_cells.csv') + """ +} diff --git a/main_corto.nf b/main_corto.nf new file mode 100644 index 0000000..5c042e8 --- /dev/null +++ b/main_corto.nf @@ -0,0 +1,46 @@ +nextflow.enable.dsl=2 + +process CORTO { + memory 1.GB + + container "${params.container_corto}" + containerOptions "${params.containerOptions}" + publishDir "${params.outdir}/${params.project_name}", mode: 'copy' + // debug true + // maxForks 1 + stageInMode 'copy' + + input: + path TPM + path regulon + // path TPM_ref + + // Define output channels + output: + path "*_metabolome.csv", emit: csv_metabol + + script: + """ + #!/usr/bin/Rscript + library(corto) + library(data.table) + + TPM <- as.matrix(fread("$TPM"),rownames=1) + #TPM_ref <- as.matrix(fread("TPM_ref"),rownames=1) #dollar sign is missing + + #try normalization; dont use + #TPM <- scale(TPM) + #TPM_log2 = log2(TPM + 1) + + load("$regulon") + + predicted<-mra(TPM, regulon=regulon) + + #use this if we want to compare patients and reference TPM, data formet output is not csv -> work on data saving if needed + #predicted<-mra(TPM, path TPM_ref, regulon=regulon) + + name = strsplit(strsplit("$TPM", split = "/")[[1]][length(strsplit("$TPM", split = "/")[[1]])], split = "_TPM.csv")[[1]][1] + name = paste(name, "_metabolome.csv", sep="") + write.csv(predicted, name) + """ +} diff --git a/main_filter_outputs.nf b/main_filter_outputs.nf new file mode 100644 index 0000000..d94e1c7 --- /dev/null +++ b/main_filter_outputs.nf @@ -0,0 +1,426 @@ 
+nextflow.enable.dsl=2
+
+// ==================== FILTER VARIANTS ====================
+// Keeps the params.top_n_variants highest-QUAL records of a VCF.
+// Emits <patient>_filtered_variants.vcf (original header + selected records,
+// record text untouched) and <patient>_filtered_variants_summary.csv
+// (CHROM, POS, REF, ALT, numeric QUAL).
+process FILTER_VARIANTS {
+    memory 16.GB
+
+    container "${params.container_synthea}"
+    containerOptions "${params.containerOptions}"
+    publishDir "${params.outdir}/${params.project_name}/filtered", mode: 'copy'
+
+    input:
+    path vcf
+
+    output:
+    path "*_filtered_variants.vcf", emit: filtered_vcf
+    path "*_filtered_variants_summary.csv", emit: variants_summary
+
+    script:
+"""
+#!/opt/conda/envs/synthea/bin/python3
+import sys
+
+import pandas as pd
+
+# The VCF spec fixes the first eight columns; FORMAT and the per-sample columns
+# are optional, so a record is only required to carry these eight.
+N_FIXED_FIELDS = 8
+SUMMARY_COLUMNS = ['CHROM', 'POS', 'REF', 'ALT', 'QUAL']
+
+patient_name = "${vcf}".replace('.vcf', '')
+
+print(f"Processing VCF file: ${vcf}", file=sys.stderr)
+
+# Single pass over the file: header lines are kept verbatim, each record is
+# stored as raw text plus the few fields needed for ranking and the summary.
+header = []
+raw_records = []
+summary_rows = []
+with open("${vcf}") as f:
+    for line_no, line in enumerate(f, start=1):
+        if line.startswith('#'):
+            header.append(line)
+            continue
+        record = line.rstrip('\\r\\n')
+        if not record.strip():
+            # Blank lines (e.g. a trailing newline) are not variants
+            continue
+        fields = record.split('\\t')
+        if len(fields) < N_FIXED_FIELDS:
+            raise ValueError(
+                f"Malformed VCF record at line {line_no}: expected at least "
+                f"{N_FIXED_FIELDS} tab-separated fields, got {len(fields)}"
+            )
+        raw_records.append(record)
+        summary_rows.append((fields[0], fields[1], fields[3], fields[4], fields[5]))
+
+print(f"Header has {len(header)} lines", file=sys.stderr)
+print(f"Total variants: {len(raw_records)}", file=sys.stderr)
+
+# An empty input yields an empty frame, which still produces a header-only VCF
+# and a summary CSV containing just the column names.
+df = pd.DataFrame(summary_rows, columns=SUMMARY_COLUMNS)
+# Non-numeric QUAL (e.g. '.') becomes NaN and sorts after every scored variant
+df['QUAL'] = pd.to_numeric(df['QUAL'], errors='coerce')
+
+# Sort by QUAL score (proxy for clinical significance). mergesort is stable, so
+# ties keep file order and repeated runs select the same variants.
+top_n = min(${params.top_n_variants}, len(df))
+df_filtered = df.sort_values('QUAL', ascending=False, kind='mergesort').head(top_n)
+
+print(f"Selected top {top_n} variants", file=sys.stderr)
+
+# Write filtered VCF: original header followed by the untouched record text,
+# so any number of sample columns survives unchanged.
+print("Writing filtered VCF", file=sys.stderr)
+with open(f"{patient_name}_filtered_variants.vcf", 'w') as out:
+    out.writelines(header)
+    out.writelines(raw_records[i] + '\\n' for i in df_filtered.index)
+
+# Create summary CSV
+print("Writing summary CSV", file=sys.stderr)
+df_filtered.to_csv(f"{patient_name}_filtered_variants_summary.csv", index=False)
+
+print("Done!", file=sys.stderr)
+"""
+}
+
+// ==================== FILTER TRANSCRIPTOME ====================
+process FILTER_TRANSCRIPTOME {
+    memory 2.GB
+
+    container "${params.container_borzoi}"
+    containerOptions "${params.containerOptions}"
+    publishDir "${params.outdir}/${params.project_name}/filtered", mode: 'copy'
+
+    input:
+    path 
tpm_file + + output: + path "*_filtered_TPM.csv", emit: filtered_tpm + + script: +""" +#!/opt/conda/envs/borzoi/bin/python +import pandas as pd +import numpy as np + +patient_name = "${tpm_file}".replace('_TPM.csv', '') + +# Read TPM data +df = pd.read_csv("${tpm_file}", index_col=0) + +# Calculate mean expression across tissues +df['mean_expression'] = df.mean(axis=1) + +# Calculate log2 fold change from median (as reference) +median_expr = df.drop('mean_expression', axis=1).median(axis=1) +df['log2FC'] = np.log2((df['mean_expression'] + 1) / (median_expr + 1)) + +# Filter: |log2FC| > 1.5 +# Note: In production, you'd need proper control samples for statistical testing +df['abs_log2FC'] = abs(df['log2FC']) +df_filtered = df[df['abs_log2FC'] > ${params.transcriptome_log2fc_threshold}] + +# Sort by absolute fold change and take top N +df_filtered = df_filtered.sort_values('abs_log2FC', ascending=False) +df_filtered = df_filtered.head(${params.top_n_genes}) + +# Remove helper columns +df_output = df_filtered.drop(['mean_expression', 'log2FC', 'abs_log2FC'], axis=1) + +# Save filtered results +df_output.to_csv(f"{patient_name}_filtered_TPM.csv") +""" +} + +// ==================== FILTER PROTEOME ==================== +process FILTER_PROTEOME { + memory 2.GB + + container "${params.container_rna2protexpression}" + containerOptions "${params.containerOptions}" + publishDir "${params.outdir}/${params.project_name}/filtered", mode: 'copy' + + input: + path protein_file + + output: + path "*_filtered_Protein_Expression.csv", emit: filtered_proteins + + script: +""" +#!/opt/conda/envs/rna2protexpresson/bin/python3 +import pandas as pd +import numpy as np + +patient_name = "${protein_file}".replace('_Protein_Expression_log2.csv', '') + +# Read protein expression data +df = pd.read_csv("${protein_file}", index_col=0) + +# Calculate mean expression across tissues +df['mean_expression'] = df.mean(axis=1) + +# Calculate 75th percentile threshold +threshold_75 = 
df['mean_expression'].quantile(0.75)
+
+# Filter: Expression > 75th percentile
+df_filtered = df[df['mean_expression'] > threshold_75]
+
+# Sort by expression level and take top N
+df_filtered = df_filtered.sort_values('mean_expression', ascending=False)
+df_filtered = df_filtered.head(${params.top_n_proteins})
+
+# Remove helper column
+df_output = df_filtered.drop('mean_expression', axis=1)
+
+# Save filtered results
+df_output.to_csv(f"{patient_name}_filtered_Protein_Expression.csv")
+"""
+}
+
+// ==================== FILTER IMMUNE CELLS ====================
+// Keeps the params.top_n_immune_cells columns with the highest mean value and
+// writes <patient>_filtered_immune_cells.csv. When the parameter is unset or 0
+// the table is copied through unchanged (the previous behaviour).
+process FILTER_IMMUNE_CELLS {
+    memory 1.GB
+
+    container "${params.container_ecotyper}"
+    containerOptions "${params.containerOptions}"
+    publishDir "${params.outdir}/${params.project_name}/filtered", mode: 'copy'
+
+    input:
+    path immune_file
+
+    output:
+    path "*_filtered_immune_cells.csv", emit: filtered_immune
+
+    script:
+"""
+#!/opt/conda/envs/ecotyper/bin/python3
+import pandas as pd
+
+# CIBERSORTx bookkeeping columns (same list CIBERSORTx_HIRES strips from the
+# fractions table); they are never ranked and never dropped here.
+META_COLS = ['Mixture', 'P-value', 'Correlation', 'RMSE']
+
+patient_name = "${immune_file}".replace('_immune_cells.csv', '')
+
+# 0 (or an unset parameter) disables the limit
+top_n = int("${params.top_n_immune_cells ?: 0}")
+
+# Read immune cell data
+# NOTE(review): presumably rows are tissues and columns are cell types, as
+# CREATE_SUMMARY_REPORT counts len(df.columns) as cell types - confirm.
+df = pd.read_csv("${immune_file}", index_col=0)
+
+if top_n > 0:
+    candidates = df.drop(columns=[c for c in META_COLS if c in df.columns])
+    # Non-numeric columns are left out of the ranking and therefore always kept
+    means = candidates.mean(axis=0, numeric_only=True)
+    # Stable sort: ties keep their original column order; all-NaN columns rank last
+    ranked = means.sort_values(ascending=False, kind='mergesort')
+    df = df.drop(columns=list(ranked.index[top_n:]))
+
+df.to_csv(f"{patient_name}_filtered_immune_cells.csv")
+"""
+}
+// ==================== FILTER METABOLOME ====================
+process FILTER_METABOLOME {
+    memory 1.GB
+
+    container "${params.container_corto}"
+    containerOptions "${params.containerOptions}"
+    publishDir "${params.outdir}/${params.project_name}/filtered", mode: 'copy'
+
+    input:
+    path metabolome_file
+
+    output:
+    path "*_filtered_metabolome.csv", emit: filtered_metabolome
+
+    script:
+"""
+#!/usr/bin/Rscript
+library(data.table)
+
+patient_name <- gsub("_metabolome.csv", "", "${metabolome_file}")
+
+# Read metabolome data
+df <- fread("${metabolome_file}")
+
+# Assuming the data has columns: pathway, activity_score, p_value
+# Adjust based on actual CORTO output format
+
+# Filter: p-value < 0.05 (if p-value column exists)
+if ("P_value" %in% colnames(df)) {
+    df_filtered <- df[df\$P_value < 
${params.metabolome_pvalue_threshold}, ]
+} else if ("p_value" %in% colnames(df)) {
+    df_filtered <- df[df\$p_value < ${params.metabolome_pvalue_threshold}, ]
+} else {
+    df_filtered <- df
+}
+
+# Sort by activity score (absolute value)
+if ("Activity_Score" %in% colnames(df_filtered)) {
+    df_filtered <- df_filtered[order(-abs(df_filtered\$Activity_Score)), ]
+} else if (ncol(df_filtered) > 1) {
+    # If column names are different, sort by second column (assuming it's the score)
+    df_filtered <- df_filtered[order(-abs(df_filtered[[2]])), ]
+}
+
+# Take top N pathways
+df_filtered <- head(df_filtered, ${params.top_n_metabolites})
+
+# Save filtered results
+fwrite(df_filtered, paste0(patient_name, "_filtered_metabolome.csv"))
+"""
+}
+
+// ==================== FILTER MUTATED PROTEINS ====================
+// Copies at most params.top_n_variants records from the mutated-protein FASTA
+// into <patient>_filtered_mutations.fasta, re-wrapping sequences at 60 columns.
+// NOTE(review): the filtered VCF is parsed but does not influence which
+// records are written - see the notes inside the script.
+process FILTER_MUTATED_PROTEINS {
+    memory 2.GB
+
+    container "${params.container_vcf2prot}"
+    containerOptions "${params.containerOptions}"
+    publishDir "${params.outdir}/${params.project_name}/filtered", mode: 'copy'
+
+    input:
+    path fasta_file
+    path filtered_vcf
+
+    output:
+    path "*_filtered_mutations.fasta", emit: filtered_fasta
+
+    script:
+"""
+#!/opt/conda/envs/vcf2prot/bin/python3
+# NOTE(review): 're' is imported but not used anywhere in this script.
+import re
+
+patient_name = "${fasta_file}".replace('_transcript_id_mutations.fasta', '')
+
+# Read filtered VCF to get list of positions to keep
+# Each data line contributes a "CHROM:POS" key built from its first two columns.
+# NOTE(review): filtered_positions is never read after this loop, so the FASTA
+# is not actually restricted to the filtered variants. Matching it would need
+# the FASTA header format, which is not visible here - confirm and implement.
+filtered_positions = set()
+with open("${filtered_vcf}") as f:
+    for line in f:
+        if not line.startswith('#'):
+            parts = line.strip().split('\\t')
+            if len(parts) >= 2:
+                filtered_positions.add(f"{parts[0]}:{parts[1]}")
+
+# Read FASTA and filter based on mutations in filtered VCF
+# Records are keyed by their full header line, so a repeated header keeps its
+# first position in the dict but ends up with the last sequence read.
+sequences = {}
+current_header = None
+current_seq = []
+
+with open("${fasta_file}") as f:
+    for line in f:
+        if line.startswith('>'):
+            # A new header closes the previous record, if there was one
+            if current_header:
+                sequences[current_header] = ''.join(current_seq)
+            current_header = line.strip()
+            current_seq = []
+        else:
+            current_seq.append(line.strip())
+    # Flush the final record, which has no following header to close it
+    if current_header:
+        sequences[current_header] = ''.join(current_seq)
+
+# Write filtered FASTA (keep top N based on filtered variants)
+# In practice this emits the first top_n_variants records in file order.
+with open(f"{patient_name}_filtered_mutations.fasta", 'w') as f:
+    count = 0
+    for header, seq in sequences.items():
+        if count >= ${params.top_n_variants}:
+            break
+        f.write(header + '\\n')
+        # Write sequence in 60 character lines
+        for i in range(0, len(seq), 60):
+            f.write(seq[i:i+60] + '\\n')
+        count += 1
+"""
+}
+
+// ==================== CREATE SUMMARY REPORT ====================
+process CREATE_SUMMARY_REPORT {
+    memory 1.GB
+
+    container "${params.container_synthea}"
+    containerOptions "${params.containerOptions}"
+    publishDir "${params.outdir}/${params.project_name}/filtered", mode: 'copy'
+
+    input:
+    path filtered_tpm
+    path filtered_proteins
+    path filtered_immune
+    path filtered_metabolome
+    path filtered_vcf
+
+    output:
+    path "*_summary_report.json"
+
+    script:
+"""
+#!/opt/conda/envs/synthea/bin/python3
+import pandas as pd
+import json
+
+patient_name = "${filtered_tpm}".replace('_filtered_TPM.csv', '')
+
+summary = {
+    'patient_id': patient_name,
+    'filtering_params': {
+        'top_n_genes': ${params.top_n_genes},
+        'top_n_proteins': ${params.top_n_proteins},
+        'top_n_immune_cells': ${params.top_n_immune_cells},
+        'top_n_metabolites': ${params.top_n_metabolites},
+        'top_n_variants': ${params.top_n_variants}
+    },
+    'filtered_counts': {}
+}
+
+# Count filtered entries
+try:
+    df = pd.read_csv("${filtered_tpm}", index_col=0)
+    summary['filtered_counts']['genes'] = len(df)
+except:
+    summary['filtered_counts']['genes'] = 0
+
+try:
+    df = pd.read_csv("${filtered_proteins}", index_col=0)
+    summary['filtered_counts']['proteins'] = len(df)
+except:
+    summary['filtered_counts']['proteins'] = 0
+
+try:
+    df = pd.read_csv("${filtered_immune}", index_col=0)
+    summary['filtered_counts']['immune_cell_types'] = len(df.columns)
+except:
+    summary['filtered_counts']['immune_cell_types'] = 0
+
+try:
+    df = pd.read_csv("${filtered_metabolome}")
+    
summary['filtered_counts']['metabolic_pathways'] = len(df) +except: + summary['filtered_counts']['metabolic_pathways'] = 0 + +try: + with open("${filtered_vcf}") as f: + count = sum(1 for line in f if not line.startswith('#')) + summary['filtered_counts']['variants'] = count +except: + summary['filtered_counts']['variants'] = 0 + +# Save summary +with open(f"{patient_name}_summary_report.json", 'w') as f: + json.dump(summary, f, indent=2) +""" +} \ No newline at end of file diff --git a/main_no_mutations.nf b/main_no_mutations.nf new file mode 100644 index 0000000..44f7877 --- /dev/null +++ b/main_no_mutations.nf @@ -0,0 +1,208 @@ +nextflow.enable.dsl=2 + +process PREDICT_EXPRESSION_NO_MUTATIONS { + container "${params.container_borzoi}" + containerOptions "${params.containerOptions}" + debug true + maxForks 1 + + input: + path MANE + + output: + path "TPM_NO_MUTATIONS.csv", emit: expression_output + + script: + """ + #!/opt/conda/envs/borzoi/bin/python + #Predict expression of reference genom + import json + import os + import time + import warnings + import pickle + from itertools import compress + + import h5py + import matplotlib.pyplot as plt + import matplotlib.patches as patches + import numpy as np + import pandas as pd + import pysam + import pyfaidx + import tensorflow as tf + + from baskerville import seqnn + from baskerville import gene as bgene + from baskerville import dna + + import sys + sys.path.append( '/home/omic/borzoi' ) + from examples.borzoi_helpers import * + + #Reference protein dna sequence from MANE dataset + prot_bigger = pd.read_csv("/home/omic/borzoi/prot_bigger.csv") + with open("/home/omic/borzoi/prot_subset.pickle", "rb") as fp: + prot_subset = pickle.load(fp) + MANE_data = pd.read_csv("${MANE}", sep = '\t') + + batch_size = 4 + + #Model configuration + params_file = '/home/omic/borzoi/examples/params_pred.json' + targets_file = '/home/omic/borzoi/examples/targets_gtex.txt' #Subset of targets_human.txt + n_folds = 1 #4 #To use only 
one model fold, set to 'n_folds = 1'. To use all four folds, set 'n_folds = 4'. + rc = True #Average across reverse-complement prediction + + #Read model parameters + with open(params_file) as params_open : + params = json.load(params_open) + params_model = params['model'] + params_train = params['train'] + + #Read targets + targets_df = pd.read_csv(targets_file, index_col=0, sep='\t') + target_index = targets_df.index + + #Create local index of strand_pair (relative to sliced targets) + if rc : + strand_pair = targets_df.strand_pair + target_slice_dict = {ix : i for i, ix in enumerate(target_index.values.tolist())} + slice_pair = np.array([ + target_slice_dict[ix] if ix in target_slice_dict else ix for ix in strand_pair.values.tolist() + ], dtype='int32') + + #Initialize model ensemble + models = [] + for fold_ix in range(n_folds) : + model_file = "/home/omic/borzoi/saved_models/f" + str(fold_ix) + "/model0_best.h5" + seqnn_model = seqnn.SeqNN(params_model) + seqnn_model.restore(model_file, 0) + seqnn_model.build_slice(target_index) + if rc : + seqnn_model.strand_pair.append(slice_pair) + seqnn_model.build_ensemble(rc, [0]) #changed '0' to [0] + models.append(seqnn_model) + fasta_open = pysam.Fastafile('/home/omic/borzoi/hg38.fa') + + #Make predictions/ run model + def predict_tracks(models, sequence_one_hot): + predicted_tracks = [] + for fold_ix in range(len(models)): + yh = models[fold_ix](sequence_one_hot)[:, None, ...].astype("float16") + predicted_tracks.append(yh) + predicted_tracks = np.concatenate(predicted_tracks, axis=1) + return predicted_tracks + + #calculate TPM from borzoi + def CalTPM(borzoi_data, start, cluster, prot_data, targets_df, plot = False): + TPM_list = [] + #loop over all protrin in cluster + for i in range(len(cluster)): + #get exon start and end + ex_st = [(int(i)-(start + (32*16)))//32 for i in np.array(prot_data[prot_data['symbol'] == cluster[i]]['exonStarts'])[0].split(',')] + ex_en = [(int(i)-(start + (32*16)))//32 for i in 
np.array(prot_data[prot_data['symbol'] == cluster[i]]['exonEnds'])[0].split(',')] + #exon bool mask + exon_mask = np.zeros(borzoi_data.shape[-2]) + for s,n in zip(ex_st,ex_en): + exon_mask = exon_mask + ((np.arange(borzoi_data.shape[-2]) >= s) & (np.arange(borzoi_data.shape[-2]) <= n)) + #protrin TPM per person per tissue + TPM_per_tissue_replicates = np.sum(borzoi_data[:,exon_mask== 1], axis = 1) + #Plot proteins with exon marks if needed + if plot == True: + #Will plot only first adipose_tissue borzoi_data[0,:,x] change x for different tissue + plt.plot(borzoi_data[0,:,0]) + plt.vlines(x = ex_st, ymin=0, ymax=3.5, colors='red', ls='--', lw=2, label='vline_multiple - full height') + plt.vlines(x = ex_en, ymin=0, ymax=3.5, colors='blue', ls='--', lw=2, label='vline_multiple - full height') + plt.xlim(ex_st[0]-100, ex_en[-1]+100) + plt.show() + #Get average for tissue replicates + TPM_per_tissue = [np.mean(i) for i in np.split(TPM_per_tissue_replicates[0], np.unique(targets_df['description'], return_index=True)[1][1:])] + TPM_list.append(TPM_per_tissue) + #cretae Datafreame + TPM_dataframe = pd.DataFrame(TPM_list,cluster,np.unique(targets_df['description'], return_index=True)[0]) + return(TPM_dataframe) + + #Protrin cluster list of list + protein_clusters = [np.array(i[2]) for i in prot_subset] + + #all proroteins present in dataset + all_reference_proteins = list(pd.concat([pd.concat([i[-1] for i in prot_subset]), prot_bigger['symbol']])) + #use variable names from mutation + proteins_with_mutations = all_reference_proteins + proteins_with_mutations_working = proteins_with_mutations + + TPM = [] + #run until the expression of all proteins is predicted + while len(proteins_with_mutations_working) > 0: + TPM_dfs = [] + sequences_one_hot_muts = [] + st = [] + cl = [] + #append proteins to a list until equal to batch size if protein is smaller, if it's big just run borzoi for it (don't append) + while len(sequences_one_hot_muts) < batch_size and 
len(proteins_with_mutations_working) > 0: + #get work protein + protein = proteins_with_mutations_working[0] + #print(protein) + #get cluster + mask = [protein in i for i in protein_clusters] + cluster = list(compress(protein_clusters, mask)) + #run borzoi for big proteins + if protein in np.array(prot_bigger['symbol']): + sequences_one_hot_muts_big = [] + proteins_with_mutations_working = proteins_with_mutations_working[1:] + protein_data = prot_bigger[prot_bigger['symbol'] == protein] + prot_start = np.array(protein_data['chr_start']).astype('int')[0] - (16*32) - np.array(protein_data['chr_strand'] == '+')[0] * 1000 + prot_end = np.array(protein_data['chr_end']).astype('int')[0] + (16*32) + np.array(protein_data['chr_strand'] == '-')[0] * 1000 + chrom = protein_data.iloc[0]['chrom'].split('_')[0] + star = prot_start + st_big = star + while star < prot_end: + end = star + 524288 + sequence_one_hot_wt = process_sequence(fasta_open, chrom, star, end, seq_len = 524288) + sequences_one_hot_muts_big.append(sequence_one_hot_wt) + star = end - (32*32) + sequences_one_hot_muts_big = np.array(sequences_one_hot_muts_big) + #if number of protein splits is begger than batch size + #print(sequences_one_hot_muts_big.shape) + if sequences_one_hot_muts_big.shape[0] > batch_size: + borzoi_pred_list = [] + for seq_slice in np.array_split(sequences_one_hot_muts_big, np.ceil(sequences_one_hot_muts_big.shape[0]/batch_size)): + borzoi_pred_list.append(predict_tracks(models, seq_slice)) + y_mut = np.concatenate(borzoi_pred_list) + else: + y_mut = predict_tracks(models, sequences_one_hot_muts_big) + y_mut = np.reshape(y_mut, [1,1,-1,89]) + TPM.append(CalTPM(y_mut[0], st_big, [protein], MANE_data, targets_df)) + #np.save('expression_predictions_%s.npy' %protein, y_mut) + else: + #append to a list of proteins to run + #get star and end of the cluste + star, end = (list(compress(prot_subset, mask))[0][:2]) + #get mutated proteins in the cluster + mask = [i in cluster[0] for i in 
proteins_with_mutations_working] + proteins_in_cluster = list(compress(proteins_with_mutations_working, mask)) + #remove cluster proteins from the ptoein list + proteins_with_mutations_working = list(compress(proteins_with_mutations_working, ~np.array(mask))) + chrom = MANE_data[MANE_data['symbol'] == proteins_in_cluster[0]].iloc[0]['chrom'].split('_')[0] + sequence_one_hot_wt = process_sequence(fasta_open, chrom, star, end, seq_len = 524288) + sequences_one_hot_muts.append(sequence_one_hot_wt) + st.append(star) + cl.append(cluster) + ### Test wt + #sequences_one_hot_muts.append(sequence_one_hot_wt) + ### + sequences_one_hot_muts = np.array(sequences_one_hot_muts) + #run borzoi for smaller proteins, if list is empty isn't empty(can be empty for last step) + if sequences_one_hot_muts.shape != (0,): + y_mut = predict_tracks(models, sequences_one_hot_muts) + for i in range(len(y_mut)): + TPM_dfs.append(CalTPM(y_mut[i], st[i], cl[i][0], MANE_data, targets_df)) + TPM_dfs = pd.concat(TPM_dfs) + TPM.append(TPM_dfs) + #np.save('expression_predictions_%s.npy' %protein, y_mut) + TPM = pd.concat(TPM) + TPM.to_csv('TPM_NO_MUTATIONS.csv') + """ +} + diff --git a/main_rna2proteinexpression.nf b/main_rna2proteinexpression.nf new file mode 100644 index 0000000..b76e843 --- /dev/null +++ b/main_rna2proteinexpression.nf @@ -0,0 +1,24 @@ +nextflow.enable.dsl=2 + +process RNA2PROTEXPRESSION { + memory 2.GB + accelerator 1 + + container "${params.container_rna2protexpression}" + label 'gpu_process' + //publishDir "${params.outdir_rna2protexpression}", mode: 'copy' + // debug true + // maxForks 1 + + input: + path borzoi_tpm + + output: + path "*Protein_Expression_log2.csv", emit: protrin_expression_scores + + script: + """ + . 
activate rna2protexpresson
+    python3 /home/omic/rna2protexpression/rna2protexpression.py --borzoi_ouput ${borzoi_tpm}
+    """
+}
diff --git a/main_synthea.nf b/main_synthea.nf
new file mode 100644
index 0000000..c73bb9b
--- /dev/null
+++ b/main_synthea.nf
@@ -0,0 +1,540 @@
+nextflow.enable.dsl=2
+
+process generate_patients {
+    container "${params.container_synthea}"
+    containerOptions "$params.containerOptions_synthea"
+    //publishDir "${params.outdir}/patients", mode: 'copy'
+    // echo true
+    // errorStrategy 'ignore'
+    output:
+    tuple path('*/patients.csv'), path('*/conditions.csv'), emit: patient_tuple
+    path "*"
+    script:
+    """
+    workdir=`pwd`
+    cd /synthea
+    ./run_synthea -p ${params.n_pat} -a ${params.age} --exporter.csv.export true
+    cd /synthea/output
+    cp -r * \$workdir
+    """
+}
+
+// For every disease in params.disease, fuzzy-matches the name against the
+// phenotype metadata and stages the matching imputed GWAS summary statistics
+// as <phenotype_code>_male.tsv and <phenotype_code>_female.tsv. When the
+// sex-specific file is missing, the both_sexes file and then the opposite-sex
+// file are used as stand-ins.
+process get_disease_stats_no_patients {
+    memory 1.GB
+    accelerator 1
+    label 'gpu_process'
+
+    container "$params.container_synthea"
+    containerOptions "$params.containerOptions_synthea"
+    // echo true
+    input:
+    path pheno_store
+    path imputed_store
+    output:
+    tuple path("*_female.tsv"), path("*_male.tsv")
+    script:
+    """
+    #!/opt/conda/envs/synthea/bin/python3
+    import subprocess
+    import sys
+    from os import system
+    from pathlib import Path
+
+    import pandas as pd
+    from fuzzywuzzy import process
+
+    # Debug listing of the staged inputs
+    system("ls -alh")
+    system("ls ${imputed_store} | head")
+    system("ls -alh /mnt/Avatar")
+
+    df_meta = pd.read_csv('${pheno_store}', sep=',', delim_whitespace=False, header=None, names=['phenotype_code', 'phenotype_description', 'sex', 'file'], engine='python')
+
+    imputed_dir = Path('${imputed_store}')
+
+
+    def stage_variants(ukbb_id, out_sex, preference):
+        # Copy, decompress and rename the first available imputed-variant file.
+        # preference lists the sex labels to try, most specific first.
+        # Returns True when <ukbb_id>_<out_sex>.tsv was produced.
+        for sex in preference:
+            stem = f'{ukbb_id}.gwas.imputed_v3.{sex}.tsv'
+            source = imputed_dir / f'{stem}.bgz'
+            if not source.is_file():
+                continue
+            try:
+                subprocess.run(["cp", str(source), "."], check=True)
+                # Only the archive is passed: the stray "." argument made
+                # bgzip treat the working directory as a second input
+                subprocess.run(["bgzip", "-d", f'{stem}.bgz'], check=True)
+                subprocess.run(["mv", stem, f'{ukbb_id}_{out_sex}.tsv'], check=True)
+            except (OSError, subprocess.CalledProcessError) as err:
+                # Still best-effort, but say what failed instead of hiding it
+                print(f'failed to stage {source}: {err}', file=sys.stderr)
+                return False
+            return True
+        print('imputed variants not present')
+        return False
+
+
+    # params.disease may be a single name or a rendered list like "[a, b c]".
+    # Only surrounding whitespace is trimmed: removing the spaces inside a
+    # multi-word name would weaken the fuzzy match against the descriptions.
+    raw_targets = '${params.disease}'.strip().strip('[]')
+    for target in (name.strip() for name in raw_targets.split(',')):
+        if not target:
+            continue
+        # lookup ukbb condition from metadata using fuzzy matching;
+        # extractOne returns None when nothing reaches score_cutoff
+        ukbb_cond = process.extractOne(target, df_meta['phenotype_description'], score_cutoff=80)
+        if not ukbb_cond:
+            raise ValueError(f"No match found for disease '{target}' with score >= 80")
+        ukbb_id = df_meta.at[ukbb_cond[2], 'phenotype_code']
+
+        ## get male imputed variants, replace if not found
+        stage_variants(ukbb_id, 'male', ('male', 'both_sexes', 'female'))
+
+        ## get female imputed variants, replace if not found
+        stage_variants(ukbb_id, 'female', ('female', 'both_sexes', 'male'))
+    """
+}
+
+
+process get_disease_stats_cudf {
+    container "$params.container_synthea"
+    containerOptions "$params.containerOptions_synthea"
+    echo true
+    input:
+    tuple path(pat_csv), path(cond_csv)
+    path pheno_store
+
+    output:
+    tuple path(pat_csv), path(cond_csv), path("*_female.tsv"), path("*_male.tsv")
+    script:
+    """
+    #!/opt/conda/envs/synthea/bin/python3
+
+    #import pandas as pd
+    import cudf as pd
+    from fuzzywuzzy import fuzz
+    from fuzzywuzzy import process
+    import numpy as np
+    import subprocess
+    import os.path
+
+    ################## IMPORT SYNTHETIC PATIENTS ##############################################
+
+    ## conditions, used to find the right imputed variant file
+    df_cond = pd.read_csv('${cond_csv}', delim_whitespace=False)
+
+    ## patients
+    df_pat = pd.read_csv('${pat_csv}', delim_whitespace=False)
+    try:
+        df_pat_m = df_pat.loc[df_pat['GENDER'] == 'M']
+    except:
+        'No men in dataset'
+    try:
+        df_pat_f = df_pat.loc[df_pat['GENDER'] == 'F']
+    except:
+        'No women in dataset'
+    try:
+        df_pat_bs = df_pat.loc[df_pat['GENDER'] == '']
+    except:
+        'No intersex/undefined sex in dataset'
+
+    df_meta = pd.read_csv('${pheno_store}', sep=',', delim_whitespace=False, header=None, names=['phenotype_code', 'phenotype_description', 'sex', 'file'])
+    
print(df_meta) + + ## find variant file associated with condition - need the patient's sex m/f/other or unknown = both_sexes - add sex to condition table based on patient id + + # lookup ukbb condition from metadata using fuzzy matching + ukbb_cond = process.extractOne('${params.disease}', df_meta['phenotype_description'].to_pandas(), score_cutoff=80) + if not ukbb_cond: + raise ValueError(f"No match found for disease '${params.disease}' with score >= 80") + print(ukbb_cond) + ukbb_id = df_meta.at[ukbb_cond[2],'phenotype_code'] + + ## get male imputed variants, replace if not found + try: + if os.path.isfile('${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.male.tsv.bgz'): + ukbb_var_m = '${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.male.tsv.bgz' + ukbb_file_m = ukbb_id + '.gwas.imputed_v3.male.tsv.bgz' + subprocess.run(["cp", ukbb_var_m, "."]) + subprocess.run(["bgzip", "-d", ukbb_file_m, "."]) + subprocess.run(["mv", ukbb_id + '.gwas.imputed_v3.male.tsv', ukbb_id + '_male.tsv']) + elif os.path.isfile('${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.both_sexes.tsv.bgz'): + ukbb_var_m = '${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.both_sexes.tsv.bgz' + ukbb_file_m = ukbb_id + '.gwas.imputed_v3.both_sexes.tsv.bgz' + subprocess.run(["cp", ukbb_var_m, "."]) + subprocess.run(["bgzip", "-d", ukbb_file_m, "."]) + subprocess.run(["mv", ukbb_id + '.gwas.imputed_v3.both_sexes.tsv', ukbb_id + '_male.tsv']) + elif os.path.isfile('${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.female.tsv.bgz'): + ukbb_var_m = '${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.female.tsv.bgz' + ukbb_file_m = ukbb_id + '.gwas.imputed_v3.female.tsv.bgz' + subprocess.run(["cp", ukbb_var_m, "."]) + subprocess.run(["bgzip", "-d", ukbb_file_m, "."]) + subprocess.run(["mv", ukbb_id + '.gwas.imputed_v3.female.tsv', ukbb_id + '_male.tsv']) + else: + print('imputed variants not present') + except: + print('no files exist') + + ## get female imputed 
// Simulate per-patient variant calls for male patients from the staged
// disease summary statistics.
//
// Input:  tuple (dis_var_f, dis_var_m); only dis_var_m is read here.
// Output: one <table stem>_patient_<n>_variants.csv per simulated patient,
//         with columns chr, pos, ref, prediction.
process generate_m_variants_cudf {
    accelerator 1
    memory 6.GB

    container "$params.container_synthea"
    containerOptions "$params.containerOptions_synthea"
    // echo true
    maxForks 1

    input:
    tuple path(dis_var_f), path(dis_var_m)

    output:
    path "*_variants.csv", emit: m_var

    script:
    """
    #!/opt/conda/envs/synthea/bin/python3
    import os
    import subprocess

    import cudf as pd
    import numpy as np


    def process_variants(target_file):
        # Draw one allele per variant for every male patient and write the
        # non-reference calls to a CSV per patient.

        # The table is read from a gzip-compressed copy kept under ./gzip
        os.makedirs('gzip', exist_ok=True)
        subprocess.run(['cp', target_file, 'gzip/'], check=True)
        subprocess.run(['gzip', 'gzip/' + target_file], check=True)
        print(f"Processing {target_file}.gz")

        df_var = pd.read_csv('gzip/' + target_file + '.gz', sep='\\t', header=0)

        ################## GENERATE VARIANTS USING UKBB IMPUTED DATA ##############################
        ## calculate major and minor allele frequencies
        df_var['minor_AF'] = pd.to_numeric(df_var['minor_AF'])
        major_af = 1 - df_var[['minor_AF']]
        df_var['major_AF'] = major_af

        ## split variant id (chr:pos:ref:alt) into parts
        df_var[['chr', 'pos', 'ref', 'alt']] = df_var['variant'].str.split(':', expand=True)
        df_var['chr'] = df_var['chr'].astype('str')
        df_var['pos'] = df_var['pos'].astype('int')

        ## filter out low confidence variants
        df_var_hc = df_var.loc[df_var['low_confidence_variant'] == False]
        df_var_all = df_var_hc.drop(['low_confidence_variant'], axis=1)

        ## the major allele is whichever of ref/alt is not the minor allele;
        ## the masked assignment is done on a host (pandas) copy
        df_var_all_host = df_var_all.to_pandas()
        df_var_all_host.loc[df_var_all_host['minor_allele'] == df_var_all_host['alt'], 'major_allele'] = df_var_all_host['ref']
        df_var_all_host.loc[df_var_all_host['minor_allele'] == df_var_all_host['ref'], 'major_allele'] = df_var_all_host['alt']
        df_var_all = pd.from_pandas(df_var_all_host)

        ## allele choices and their probabilities, one row per variant
        arr_var = df_var_all[['minor_allele', 'major_allele']].to_numpy()
        arr_prob = df_var_all[['minor_AF', 'major_AF']].to_numpy()

        n_patients = int(${params.n_pat} * ${params.percent_male})

        # Loop-invariant pieces of the weighted draw, computed once
        n_rows = arr_var.shape[0]
        row_idx = np.arange(n_rows)
        cum_prob = arr_prob.cumsum(axis=1)
        out_prefix = target_file.split('.')[0]

        for p in range(n_patients):
            # One uniform draw per variant: column 0 (minor allele) is picked
            # when the draw falls below minor_AF, otherwise column 1.
            allele_idx = (np.random.rand(n_rows, 1) < cum_prob).argmax(axis=1)
            df_var_all['prediction'] = arr_var[row_idx, allele_idx]

            # keep only calls that differ from the reference allele
            df_var_final = df_var_all.loc[df_var_all['prediction'] != df_var_all['ref']]
            df_var_final = df_var_final[['chr', 'pos', 'ref', 'prediction']]
            # randomly select mutation if more than one mutation is passed for a position
            df_var_final = df_var_final.sample(frac=1).drop_duplicates(['chr', 'pos']).sort_index()
            df_var_final.to_csv(f"{out_prefix}_patient_{p}_variants.csv", index=False)


    # Main execution: dis_var_m may expand to several space-separated files
    for target in '${dis_var_m}'.split():
        try:
            if not os.path.exists(target):
                raise FileNotFoundError(f"Input file {target} does not exist")
            process_variants(target)
        except FileNotFoundError as e:
            print(f"Error: {e}")
            raise
        except Exception as e:
            print(f"Error processing {target}: {str(e)}")
            raise
    """
}
and frequencies + df_var['minor_AF'] = pd.to_numeric(df_var['minor_AF']) + major_af = 1 - df_var[['minor_AF']] + df_var['major_AF'] = major_af + + ## split variant into parts + df_var[['chr', 'pos', 'ref', 'alt']] = df_var['variant'].str.split(':', expand=True) + df_var['chr'] = df_var['chr'].astype('str') + df_var['pos'] = df_var['pos'].astype('int') + + ## filter out low confidence variants + df_var_hc = df_var.loc[df_var['low_confidence_variant'] == False] + df_var_all = df_var_hc.drop(['low_confidence_variant'], axis=1) + + ## determine which is the minor allele + # Convert to pandas DataFrame in host memory + df_var_all_host = df_var_all.to_pandas() + df_var_all_host.loc[df_var_all_host['minor_allele'] == df_var_all_host['alt'], 'major_allele'] = df_var_all_host['ref'] + df_var_all_host.loc[df_var_all_host['minor_allele'] == df_var_all_host['ref'], 'major_allele'] = df_var_all_host['alt'] + + # Convert back to cuDF DataFrame in device memory + df_var_all = pd.from_pandas(df_var_all_host) + + ## define variant array and probabilities, patient array + arr_var = pd.DataFrame(data=df_var_all, columns=['minor_allele','major_allele']).to_numpy() + arr_prob = df_var_all[['minor_AF', 'major_AF']].to_numpy() + n_patients = ${params.n_pat} - int(${params.n_pat} * ${params.percent_male}) + + ## loop through rows, using weighted random choice by variant probability to assign predicted allele + for p in range(n_patients): + df_var_all['prediction'] = arr_var[np.arange(arr_var.shape[0]), (np.random.rand(arr_var.shape[0], 1) < arr_prob.cumsum(axis=1)).argmax(axis=1)] + df_var_final = df_var_all.loc[df_var_all['prediction'] != df_var_all['ref']] + df_var_final = df_var_final[['chr', 'pos', 'ref', 'prediction']] + #randomly select mutation if more than one mutation is passed for a position + df_var_final = df_var_final.sample(frac=1).drop_duplicates(['chr','pos']).sort_index() + output_file = f"{target_file.split('.')[0]}_patient_{p}_variants.csv" + 
// Simulate background ("healthy") variant calls for male patients from the
// male site table found in healthy_dir.
//
// Output: healthy_male_patient_<n>_variants.csv per simulated patient, with
//         columns chr, pos, ref, prediction.
process generate_m_healthy_cudf {
    container "$params.container_synthea"
    containerOptions "$params.containerOptions_synthea"
    maxForks 1
    echo true

    input:
    tuple path(healthy_dir)

    output:
    path "*_variants.csv", emit: m_healthy

    script:
    """
    #!/opt/conda/envs/synthea/bin/python3
    import cudf as pd
    import numpy as np

    sites = pd.read_csv('${healthy_dir}/gnomad.genomes.v4.1.sites.male.txt')

    # AF_XY is used as the probability of drawing ALT; REF gets the remainder
    sites['minor_AF'] = pd.to_numeric(sites['AF_XY'])
    sites['major_AF'] = 1 - sites[['minor_AF']]

    alleles = pd.DataFrame(data=sites, columns=['ALT','REF']).to_numpy()
    freqs = sites[['minor_AF', 'major_AF']].to_numpy()

    n_sites = alleles.shape[0]
    site_idx = np.arange(n_sites)
    cum_freq = freqs.cumsum(axis=1)

    male_count = int(${params.n_pat} * ${params.percent_male})
    for patient in range(male_count):
        # uniform draw per site: below minor_AF picks ALT (column 0), else REF
        draw = np.random.rand(n_sites, 1)
        sites['prediction'] = alleles[site_idx, (draw < cum_freq).argmax(axis=1)]

        called = sites.loc[sites['prediction'] != sites['REF']]
        called = called[['#CHROM', 'POS', 'REF', 'prediction']]
        #randomly select mutation if more than one mutation is passed for a position
        called = called.sample(frac=1).drop_duplicates(['#CHROM','POS']).sort_index()
        called.rename(columns={"#CHROM": "chr", "POS": "pos", "REF": "ref"}, inplace=True)
        # keep only the text after the last 'chr' in each chromosome label
        called['chr'] = [label.split('chr')[-1] for label in called.to_pandas()['chr']]
        called.to_csv(f'healthy_male_patient_{patient}_variants.csv', index=False)
    """
}
// Convert one per-patient variant CSV (chr,pos,ref,prediction) into a VCF,
// sort it, and lift it over with the hg19ToHg38 chain file.
//
// Inputs:
//   patient_vars - variant CSV with a header row
//   support_dir  - directory providing vcf/vcf_template.vcf,
//                  ucsc-liftover/hg19ToHg38.over.chain.gz and genome/hg38.fa
// Output:
//   <patient_vars.baseName>.vcf
process make_vcfs {
    memory 15.GB

    container "$params.container_synthea"
    containerOptions "$params.containerOptions_synthea"
    //publishDir "${params.outdir}/variants_vcfs", mode: 'copy'
    // echo true
    maxForks 1

    input:
    path patient_vars
    path support_dir

    output:
    path "${patient_vars.baseName}.vcf", emit: patient_vcfs
    //path "${patient_vars.baseName}.vcf.gz", emit: patient_gz
    //path "${patient_vars.baseName}.vcf.gz.tbi", emit: patient_index
    //path "${patient_vars.baseName}_filt.csv", emit: filtered_csv

    script:
    """
    ## Drop the CSV header row. The previous awk column filter compared two
    ## fields that are both empty on comma-separated input, so it never
    ## removed a data row; removing the header was its only effect.
    tail -n +2 "${patient_vars}" > "${patient_vars.baseName}_filt.csv"

    ## copy the variant template to the workdir
    cp ${support_dir}/vcf/vcf_template.vcf .

    ## append the column header line, then one record per variant
    printf "#CHROM\\tPOS\\tID\\tREF\\tALT\\tQUAL\\tFILTER\\tINFO\\tFORMAT\\tNA00001\\n" >> vcf_template.vcf
    awk -F',' '{print "chr"\$1,\$2,".",\$3,\$4,"100","PASS","NS=1","GT","0/1"}' OFS='\\t' "${patient_vars.baseName}_filt.csv" >> vcf_template.vcf

    # sort the genome karyotypically (in chromosomal order) and index
    # (the files are sorted already; this is a safety net)
    java -jar /synthea/synthea/gatk-4.2.3.0/gatk-package-4.2.3.0-local.jar SortVcf -I vcf_template.vcf -O ${patient_vars.baseName}_sorted.vcf
    # Liftover from hg19 to hg38
    java -jar /synthea/synthea/gatk-4.2.3.0/gatk-package-4.2.3.0-local.jar LiftoverVcf -I ${patient_vars.baseName}_sorted.vcf -O ${patient_vars.baseName}.vcf -C ${support_dir}/ucsc-liftover/hg19ToHg38.over.chain.gz -REJECT rejected_variants.vcf -R ${support_dir}/genome/hg38.fa

    # compression/indexing is disabled to save on memory usage
    #bgzip < ${patient_vars.baseName}.vcf > ${patient_vars.baseName}.vcf.gz
    #gatk IndexFeatureFile -I ${patient_vars.baseName}.vcf.gz

    # delete unnecessary files; -f so a missing intermediate does not fail the task
    rm -f vcf_template.vcf
    rm -f "${patient_vars.baseName}_filt.csv"
    rm -f ${patient_vars.baseName}_sorted.vcf
    rm -f rejected_variants.vcf
    rm -f *vcf.idx
    """
}
// Stage UK Biobank imputed GWAS summary statistics for every requested disease.
// Reads the phenotype metadata from params.pheno_store and the archives from
// params.imputed_store, and emits the decompressed tables renamed to
// <code>_female.tsv / <code>_male.tsv.
process get_disease_stats_no_patients {
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    echo true

    output:
    tuple path("*_female.tsv"), path("*_male.tsv")

    script:
    """
    #!/opt/conda/envs/synthea/bin/python3
    import shutil
    import subprocess
    from pathlib import Path

    import pandas as pd
    from fuzzywuzzy import process

    # For each requested sex, the archives to try in order: the sex-specific
    # file first, then both_sexes, then the opposite sex as a last resort.
    FALLBACK_ORDER = {
        'male': ('male', 'both_sexes', 'female'),
        'female': ('female', 'both_sexes', 'male'),
    }


    def stage_variants(imputed_dir, ukbb_id, sex):
        # Copy the first available archive for ukbb_id into the work dir,
        # decompress it and rename the result to <ukbb_id>_<sex>.tsv.
        # Stays best-effort: problems are reported on stdout and the caller
        # continues with the next sex / disease.
        for source_sex in FALLBACK_ORDER[sex]:
            stem = f'{ukbb_id}.gwas.imputed_v3.{source_sex}.tsv'
            archive = imputed_dir / f'{stem}.bgz'
            if not archive.is_file():
                continue
            try:
                shutil.copy(archive, archive.name)
                # bgzip -d replaces <stem>.bgz with <stem> in place
                subprocess.run(['bgzip', '-d', archive.name], check=True)
                Path(stem).rename(f'{ukbb_id}_{sex}.tsv')
            except (OSError, subprocess.CalledProcessError) as err:
                print(f'could not stage {archive.name} as {ukbb_id}_{sex}.tsv: {err}')
            return
        print(f'imputed variants not present for {ukbb_id} ({sex})')


    df_meta = pd.read_csv(
        '${params.pheno_store}',
        sep=',',
        header=None,
        names=['phenotype_code', 'phenotype_description', 'sex', 'file'],
        engine='python',
    )
    imputed_dir = Path('${params.imputed_store}')

    # params.disease may be rendered as a bracketed, comma-separated list
    for target in '${params.disease}'.replace("[", "").replace("]", "").replace(" ", "").split(','):
        # lookup ukbb condition from metadata using fuzzy matching;
        # extractOne returns None when nothing reaches score_cutoff
        match = process.extractOne(target, df_meta['phenotype_description'], score_cutoff=80)
        if not match:
            raise ValueError(f"No match found for disease '{target}' with score >= 80")
        # match is (description, score, row label) because the choices are a Series
        ukbb_id = df_meta.at[match[2], 'phenotype_code']

        for sex in ('male', 'female'):
            stage_variants(imputed_dir, ukbb_id, sex)
    """
}
process.extractOne('${params.disease}', df_meta['phenotype_description'].to_pandas(), score_cutoff=80) + print(ukbb_cond) + ukbb_id = df_meta.at[ukbb_cond[2],'phenotype_code'] + + ## get male imputed variants, replace if not found + try: + if os.path.isfile('${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.male.tsv.bgz'): + ukbb_var_m = '${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.male.tsv.bgz' + ukbb_file_m = ukbb_id + '.gwas.imputed_v3.male.tsv.bgz' + subprocess.run(["cp", ukbb_var_m, "."]) + subprocess.run(["bgzip", "-d", ukbb_file_m, "."]) + subprocess.run(["mv", ukbb_id + '.gwas.imputed_v3.male.tsv', ukbb_id + '_male.tsv']) + elif os.path.isfile('${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.both_sexes.tsv.bgz'): + ukbb_var_m = '${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.both_sexes.tsv.bgz' + ukbb_file_m = ukbb_id + '.gwas.imputed_v3.both_sexes.tsv.bgz' + subprocess.run(["cp", ukbb_var_m, "."]) + subprocess.run(["bgzip", "-d", ukbb_file_m, "."]) + subprocess.run(["mv", ukbb_id + '.gwas.imputed_v3.both_sexes.tsv', ukbb_id + '_male.tsv']) + elif os.path.isfile('${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.female.tsv.bgz'): + ukbb_var_m = '${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.female.tsv.bgz' + ukbb_file_m = ukbb_id + '.gwas.imputed_v3.female.tsv.bgz' + subprocess.run(["cp", ukbb_var_m, "."]) + subprocess.run(["bgzip", "-d", ukbb_file_m, "."]) + subprocess.run(["mv", ukbb_id + '.gwas.imputed_v3.female.tsv', ukbb_id + '_male.tsv']) + else: + print('imputed variants not present') + except: + print('no files exist') + + ## get female imputed variants, replace if not found + try: + if os.path.isfile('${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.female.tsv.bgz'): + ukbb_var_f = '${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.female.tsv.bgz' + ukbb_file_f = ukbb_id + '.gwas.imputed_v3.female.tsv.bgz' + subprocess.run(["cp", ukbb_var_f, "."]) + subprocess.run(["bgzip", "-d", 
// Simulate per-patient variant calls for male patients from the staged
// disease summary statistics.
//
// Input:  tuple (dis_var_f, dis_var_m); only dis_var_m is read here.
// Output: one <table stem>_patient_<n>_variants.csv per simulated patient,
//         with columns chr, pos, ref, prediction.
process generate_m_variants_cudf {
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    //publishDir "${params.outdir}/variants", mode: 'copy'
    //errorStrategy 'ignore'
    echo true
    // memory '15.0 GB'
    maxForks 1

    input:
    tuple path(dis_var_f), path(dis_var_m)
    //tuple path(pat_csv), path(cond_csv), path(dis_var_f), path(dis_var_m)

    output:
    path "*_variants.csv", emit: m_var
    //path "selected_proteins.txt", emit: proteins
    //path "selected_proteins_liftover_m.bed", emit: m_bed

    script:
    """
    #!/opt/conda/envs/synthea/bin/python3
    import os
    import subprocess

    import cudf as pd
    import numpy as np

    n_patients = int(${params.n_pat} * ${params.percent_male})

    # dis_var_m may expand to several space-separated files
    for target in '${dis_var_m}'.split():
        # The table is read from a gzip-compressed copy kept under ./gzip.
        # Failures stop the task here instead of surfacing later as an
        # undefined (or stale) data frame.
        os.makedirs('gzip', exist_ok=True)
        subprocess.run(['cp', target, 'gzip/'], check=True)
        subprocess.run(['gzip', 'gzip/' + target], check=True)
        print(target,'.gz')
        df_var = pd.read_csv('gzip/' + target + '.gz', sep='\\t', header=0)

        ################## GENERATE VARIANTS USING UKBB IMPUTED DATA ##############################
        ## calculate major and minor allele frequencies
        df_var['minor_AF'] = pd.to_numeric(df_var['minor_AF'])
        major_af = 1 - df_var[['minor_AF']]
        df_var['major_AF'] = major_af

        ## split variant id (chr:pos:ref:alt) into parts
        df_var[['chr', 'pos', 'ref', 'alt']] = df_var['variant'].str.split(':', expand=True)
        df_var['chr'] = df_var['chr'].astype('str')
        df_var['pos'] = df_var['pos'].astype('int')

        ## filter out low confidence variants
        df_var_hc = df_var.loc[df_var['low_confidence_variant'] == False]
        df_var_all = df_var_hc.drop(['low_confidence_variant'], axis=1)

        ## the major allele is whichever of ref/alt is not the minor allele;
        ## the masked assignment is done on a host (pandas) copy
        df_var_all_host = df_var_all.to_pandas()
        df_var_all_host.loc[df_var_all_host['minor_allele'] == df_var_all_host['alt'], 'major_allele'] = df_var_all_host['ref']
        df_var_all_host.loc[df_var_all_host['minor_allele'] == df_var_all_host['ref'], 'major_allele'] = df_var_all_host['alt']
        df_var_all = pd.from_pandas(df_var_all_host)

        ## allele choices and their probabilities, one row per variant
        arr_var = df_var_all[['minor_allele', 'major_allele']].to_numpy()
        arr_prob = df_var_all[['minor_AF', 'major_AF']].to_numpy()

        # Loop-invariant pieces of the weighted draw, computed once per table
        n_rows = arr_var.shape[0]
        row_idx = np.arange(n_rows)
        cum_prob = arr_prob.cumsum(axis=1)
        out_prefix = target.split('.')[0]

        for p in range(n_patients):
            # One uniform draw per variant: column 0 (minor allele) is picked
            # when the draw falls below minor_AF, otherwise column 1.
            allele_idx = (np.random.rand(n_rows, 1) < cum_prob).argmax(axis=1)
            df_var_all['prediction'] = arr_var[row_idx, allele_idx]

            # keep only calls that differ from the reference allele
            df_var_final = df_var_all.loc[df_var_all['prediction'] != df_var_all['ref']]
            df_var_final = df_var_final[['chr', 'pos', 'ref', 'prediction']]
            df_var_final.to_csv(out_prefix + '_patient_' + str(p) + '_variants.csv', index=False)
    """
}
// Simulate background ("healthy") variant calls for male patients from the
// male site table found in healthy_dir.
//
// Output: healthy_male_patient_<n>_variants.csv per simulated patient, with
//         columns chr, pos, ref, prediction.
process generate_m_healthy_cudf {
    container 'synthea:cudf'
    containerOptions "--rm --privileged --gpus all -v /mnt:/mnt"
    maxForks 1
    echo true

    input:
    tuple path(healthy_dir)

    output:
    path "*_variants.csv", emit: m_healthy

    script:
    """
    #!/opt/conda/envs/synthea/bin/python3
    import cudf as pd
    import numpy as np

    sites = pd.read_csv('${healthy_dir}/gnomad.genomes.v4.1.sites.male.txt')

    # AF_XY is used as the probability of drawing ALT; REF gets the remainder
    sites['minor_AF'] = pd.to_numeric(sites['AF_XY'])
    sites['major_AF'] = 1 - sites[['minor_AF']]

    alleles = pd.DataFrame(data=sites, columns=['ALT','REF']).to_numpy()
    freqs = sites[['minor_AF', 'major_AF']].to_numpy()

    n_sites = alleles.shape[0]
    site_idx = np.arange(n_sites)
    cum_freq = freqs.cumsum(axis=1)

    male_count = int(${params.n_pat} * ${params.percent_male})
    for patient in range(male_count):
        # uniform draw per site: below minor_AF picks ALT (column 0), else REF
        draw = np.random.rand(n_sites, 1)
        sites['prediction'] = alleles[site_idx, (draw < cum_freq).argmax(axis=1)]

        called = sites.loc[sites['prediction'] != sites['REF']]
        called = called[['#CHROM', 'POS', 'REF', 'prediction']]
        #randomly select mutation if more than one mutation is passed for a position
        called = called.sample(frac=1).drop_duplicates(['#CHROM','POS']).sort_index()
        called.rename(columns={"#CHROM": "chr", "POS": "pos", "REF": "ref"}, inplace=True)
        called.to_csv(f'healthy_male_patient_{patient}_variants.csv', index=False)
    """
}
'_variants.csv', index=False) + """ +} + +process make_vcfs { + container 'synthea:cudf' + containerOptions "--rm --privileged --gpus all -v /mnt:/mnt" + //publishDir "${params.outdir}/variants_vcfs", mode: 'copy' + echo true + maxForks 1 + input: + path patient_vars + output: + path "${patient_vars.baseName}.vcf", emit: patient_vcfs + //path "${patient_vars.baseName}.vcf.gz", emit: patient_gz + //path "${patient_vars.baseName}.vcf.gz.tbi", emit: patient_index + //path "${patient_vars.baseName}_filt.csv", emit: filtered_csv + script: + """ + ## only print disease-associated variants not found in the ref/major allele position as they will already be covered by the ref genome + #I don't think this is correct \$16 == \$18 cuz we changed csv file format + #it only removes first line from csv + awk '\$16 == \$18 { print \$0 }' ${patient_vars} | tail -n +2 > ${patient_vars.baseName}_filt.csv + + ## copy the variant template to the workdir + cp /mnt/Avatar/dd/synthea/supporting-data/vcf/vcf_template.vcf . 
+ + ## print the filtered variants to vcf format + printf "#CHROM\\tPOS\\tID\\tREF\\tALT\\tQUAL\\tFILTER\\tINFO\\tFORMAT\\tNA00001" >> vcf_template.vcf + printf "\\n" >> vcf_template.vcf + awk -F',' '{print "chr"\$1,\$2,".",\$3,\$4,"100","PASS","NS=1","GT","0/1"}' OFS='\\t' ${patient_vars.baseName}_filt.csv >> vcf_template.vcf + #awk -F',' '{print "chr"\$13,\$14,".",\$15,\$16,"100","PASS","NS=1","GT","0/1"}' OFS='\\t' ${patient_vars.baseName}_filt.csv >> vcf_template.vcf + #awk -F',' '{print \$13,\$14,".",\$15,\$16,"100","PASS","NS=1","GT","0/1"}' OFS='\\t' ${patient_vars.baseName}_filt.csv >> vcf_template.vcf + + # sort the genome karyotypically (in chromosomal order) and index + #files are sorted already just for security + java -jar /synthea/synthea/gatk-4.2.3.0/gatk-package-4.2.3.0-local.jar SortVcf -I vcf_template.vcf -O ${patient_vars.baseName}_sorted.vcf + #Liftover form 19 to 38 standard + java -jar /synthea/synthea/gatk-4.2.3.0/gatk-package-4.2.3.0-local.jar LiftoverVcf -I ${patient_vars.baseName}_sorted.vcf -O ${patient_vars.baseName}.vcf -C /mnt/Avatar/dd/synthea/supporting-data/ucsc-liftover/hg19ToHg38.over.chain.gz -REJECT rejected_variants.vcf -R /mnt/Avatar/dd/synthea/supporting-data/genome/hg38.fa + + #change this part to save on memory usage + #bgzip < ${patient_vars.baseName}.vcf > ${patient_vars.baseName}.vcf.gz + #gatk IndexFeatureFile -I ${patient_vars.baseName}.vcf.gz + #delete unnecessary files + rm vcf_template.vcf + rm ${patient_vars.baseName}_filt.csv + rm ${patient_vars.baseName}_sorted.vcf + rm rejected_variants.vcf + rm *vcf.idx + """ +} diff --git a/main_synthea.nf.bk.2 b/main_synthea.nf.bk.2 new file mode 100644 index 0000000..1c710b6 --- /dev/null +++ b/main_synthea.nf.bk.2 @@ -0,0 +1,515 @@ +nextflow.enable.dsl=2 + +process generate_patients { + container 'synthea:cudf' + containerOptions "--rm --privileged --gpus all -v /mnt:/mnt" + //publishDir "${params.outdir}/patients", mode: 'copy' + // echo true + // errorStrategy 'ignore' + 
output: + tuple path('*/patients.csv'), path('*/conditions.csv'), emit: patient_tuple + path "*" + script: + """ + workdir=`pwd` + cd /synthea + ./run_synthea -p ${params.n_pat} -a ${params.age} ${params.state} ${params.city} --exporter.csv.export true + cd /synthea/output + cp -r * \$workdir + """ +} + +process get_disease_stats_no_patients { + container 'synthea:cudf' + containerOptions "--rm --privileged --gpus all -v /mnt:/mnt" + echo true + output: + tuple path("*_female.tsv"), path("*_male.tsv") + script: + """ + #!/opt/conda/envs/synthea/bin/python3 + import pandas as pd + import subprocess + import os.path + from fuzzywuzzy import fuzz + from fuzzywuzzy import process + + # For every disease listed in params.disease: fuzzy-match it to a phenotype description in the metadata table, then stage that phenotype's imputed summary statistics as <code>_male.tsv and <code>_female.tsv, falling back to the both_sexes file and then to the opposite-sex file + df_meta = pd.read_csv('${params.pheno_store}', sep=',', delim_whitespace=False, header=None, names=['phenotype_code', 'phenotype_description', 'sex', 'file'], engine='python') + + for target in '${params.disease}'.replace("[", "").replace("]", "").replace(" ", "").split(','): + # lookup ukbb condition from metadata using fuzzy matching + ukbb_cond = process.extractOne(target , df_meta['phenotype_description'], score_cutoff=80) + #print(ukbb_cond) + # extractOne returns None when no description reaches score_cutoff; fail with a clear message instead of a TypeError on ukbb_cond[2] + if not ukbb_cond: + raise ValueError(f"No match found for disease '{target}' with score >= 80") + ukbb_id = df_meta.at[ukbb_cond[2],'phenotype_code'] + + ## get male imputed variants, replace if not found + try: + if os.path.isfile('${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.male.tsv.bgz'): + ukbb_var_m = '${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.male.tsv.bgz' + ukbb_file_m = ukbb_id + '.gwas.imputed_v3.male.tsv.bgz' + subprocess.run(["cp", ukbb_var_m, "."]) + subprocess.run(["bgzip", "-d", ukbb_file_m, "."]) + subprocess.run(["mv", ukbb_id + '.gwas.imputed_v3.male.tsv', ukbb_id + '_male.tsv']) + elif os.path.isfile('${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.both_sexes.tsv.bgz'): + ukbb_var_m = '${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.both_sexes.tsv.bgz' + ukbb_file_m = ukbb_id + '.gwas.imputed_v3.both_sexes.tsv.bgz' + subprocess.run(["cp", ukbb_var_m, "."]) + subprocess.run(["bgzip", 
"-d", ukbb_file_m, "."]) + subprocess.run(["mv", ukbb_id + '.gwas.imputed_v3.both_sexes.tsv', ukbb_id + '_male.tsv']) + elif os.path.isfile('${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.female.tsv.bgz'): + ukbb_var_m = '${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.female.tsv.bgz' + ukbb_file_m = ukbb_id + '.gwas.imputed_v3.female.tsv.bgz' + subprocess.run(["cp", ukbb_var_m, "."]) + subprocess.run(["bgzip", "-d", ukbb_file_m, "."]) + subprocess.run(["mv", ukbb_id + '.gwas.imputed_v3.female.tsv', ukbb_id + '_male.tsv']) + else: + print('imputed variants not present') + except: + print('no files exist') + + ## get female imputed variants, replace if not found + try: + if os.path.isfile('${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.female.tsv.bgz'): + ukbb_var_f = '${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.female.tsv.bgz' + ukbb_file_f = ukbb_id + '.gwas.imputed_v3.female.tsv.bgz' + subprocess.run(["cp", ukbb_var_f, "."]) + subprocess.run(["bgzip", "-d", ukbb_file_f, "."]) + subprocess.run(["mv", ukbb_id + '.gwas.imputed_v3.female.tsv', ukbb_id + '_female.tsv']) + elif os.path.isfile('${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.both_sexes.tsv.bgz'): + ukbb_var_f = '${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.both_sexes.tsv.bgz' + ukbb_file_f = ukbb_id + '.gwas.imputed_v3.both_sexes.tsv.bgz' + subprocess.run(["cp", ukbb_var_f, "."]) + subprocess.run(["bgzip", "-d", ukbb_file_f, "."]) + subprocess.run(["mv", ukbb_id + '.gwas.imputed_v3.both_sexes.tsv', ukbb_id + '_female.tsv']) + elif os.path.isfile('${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.male.tsv.bgz'): + ukbb_var_f = '${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.male.tsv.bgz' + ukbb_file_f = ukbb_id + '.gwas.imputed_v3.male.tsv.bgz' + subprocess.run(["cp", ukbb_var_f, "."]) + subprocess.run(["bgzip", "-d", ukbb_file_f, "."]) + subprocess.run(["mv", ukbb_id + '.gwas.imputed_v3.male.tsv', ukbb_id + '_female.tsv']) + else: + 
print('imputed variants not present') + except: + print('no files exist') + """ +} + + +process get_disease_stats_cudf { + container 'synthea:cudf' + containerOptions "--rm --privileged --gpus all -v /mnt:/mnt" + echo true + input: + tuple path(pat_csv), path(cond_csv) + + output: + tuple path(pat_csv), path(cond_csv), path("*_female.tsv"), path("*_male.tsv") + script: + """ + #!/opt/conda/envs/synthea/bin/python3 + + #import pandas as pd + import cudf as pd + from fuzzywuzzy import fuzz + from fuzzywuzzy import process + import numpy as np + import subprocess + import os.path + + ################## IMPORT SYNTHETIC PATIENTS ############################################## + + ## conditions, used to find the right imputed variant file + df_cond = pd.read_csv('${cond_csv}', delim_whitespace=False) + + ## patients + df_pat = pd.read_csv('${pat_csv}', delim_whitespace=False) + try: + df_pat_m = df_pat.loc[df_pat['GENDER'] == 'M'] + except: + 'No men in dataset' + try: + df_pat_f = df_pat.loc[df_pat['GENDER'] == 'F'] + except: + 'No women in dataset' + try: + df_pat_bs = df_pat.loc[df_pat['GENDER'] == ''] + except: + 'No intersex/undefined sex in dataset' + + df_meta = pd.read_csv('${params.pheno_store}', sep=',', delim_whitespace=False, header=None, names=['phenotype_code', 'phenotype_description', 'sex', 'file']) + print(df_meta) + + ## find variant file associated with condition - need the patient's sex m/f/other or unknown = both_sexes - add sex to condition table based on patient id + + # lookup ukbb condition from metadata using fuzzy matching + ukbb_cond = process.extractOne('${params.disease}', df_meta['phenotype_description'].to_pandas(), score_cutoff=80) + if not ukbb_cond: + raise ValueError(f"No match found for disease '${params.disease}' with score >= 80") + print(ukbb_cond) + ukbb_id = df_meta.at[ukbb_cond[2],'phenotype_code'] + + ## get male imputed variants, replace if not found + try: + if os.path.isfile('${params.imputed_store}/' + ukbb_id + 
'.gwas.imputed_v3.male.tsv.bgz'): + ukbb_var_m = '${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.male.tsv.bgz' + ukbb_file_m = ukbb_id + '.gwas.imputed_v3.male.tsv.bgz' + subprocess.run(["cp", ukbb_var_m, "."]) + subprocess.run(["bgzip", "-d", ukbb_file_m, "."]) + subprocess.run(["mv", ukbb_id + '.gwas.imputed_v3.male.tsv', ukbb_id + '_male.tsv']) + elif os.path.isfile('${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.both_sexes.tsv.bgz'): + ukbb_var_m = '${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.both_sexes.tsv.bgz' + ukbb_file_m = ukbb_id + '.gwas.imputed_v3.both_sexes.tsv.bgz' + subprocess.run(["cp", ukbb_var_m, "."]) + subprocess.run(["bgzip", "-d", ukbb_file_m, "."]) + subprocess.run(["mv", ukbb_id + '.gwas.imputed_v3.both_sexes.tsv', ukbb_id + '_male.tsv']) + elif os.path.isfile('${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.female.tsv.bgz'): + ukbb_var_m = '${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.female.tsv.bgz' + ukbb_file_m = ukbb_id + '.gwas.imputed_v3.female.tsv.bgz' + subprocess.run(["cp", ukbb_var_m, "."]) + subprocess.run(["bgzip", "-d", ukbb_file_m, "."]) + subprocess.run(["mv", ukbb_id + '.gwas.imputed_v3.female.tsv', ukbb_id + '_male.tsv']) + else: + print('imputed variants not present') + except: + print('no files exist') + + ## get female imputed variants, replace if not found + try: + if os.path.isfile('${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.female.tsv.bgz'): + ukbb_var_f = '${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.female.tsv.bgz' + ukbb_file_f = ukbb_id + '.gwas.imputed_v3.female.tsv.bgz' + subprocess.run(["cp", ukbb_var_f, "."]) + subprocess.run(["bgzip", "-d", ukbb_file_f, "."]) + subprocess.run(["mv", ukbb_id + '.gwas.imputed_v3.female.tsv', ukbb_id + '_female.tsv']) + elif os.path.isfile('${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.both_sexes.tsv.bgz'): + ukbb_var_f = '${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.both_sexes.tsv.bgz' 
+ ukbb_file_f = ukbb_id + '.gwas.imputed_v3.both_sexes.tsv.bgz' + subprocess.run(["cp", ukbb_var_f, "."]) + subprocess.run(["bgzip", "-d", ukbb_file_f, "."]) + subprocess.run(["mv", ukbb_id + '.gwas.imputed_v3.both_sexes.tsv', ukbb_id + '_female.tsv']) + elif os.path.isfile('${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.male.tsv.bgz'): + ukbb_var_f = '${params.imputed_store}/' + ukbb_id + '.gwas.imputed_v3.male.tsv.bgz' + ukbb_file_f = ukbb_id + '.gwas.imputed_v3.male.tsv.bgz' + subprocess.run(["cp", ukbb_var_f, "."]) + subprocess.run(["bgzip", "-d", ukbb_file_f, "."]) + subprocess.run(["mv", ukbb_id + '.gwas.imputed_v3.male.tsv', ukbb_id + '_female.tsv']) + else: + print('imputed variants not present') + except: + print('no files exist') + """ +} + + +process generate_m_variants_cudf { + container 'synthea:cudf' + containerOptions "--rm --privileged --gpus all -v /mnt:/mnt" + echo true + maxForks 1 + + input: + tuple path(dis_var_f), path(dis_var_m) + output: + path "*_variants.csv", emit: m_var + + script: + """ + #!/opt/conda/envs/synthea/bin/python3 + import cudf as pd + import numpy as np + import os + import subprocess + import io + import requests + import itertools + import re + import json + + def process_variants(target_file): + # Create gzip directory if it doesn't exist + os.makedirs('gzip', exist_ok=True) + + # Copy and compress file + subprocess.run(['cp', target_file, 'gzip/'], check=True) + subprocess.run(['gzip', 'gzip/' + target_file], check=True) + print(f"Processing {target_file}.gz") + + # Read the compressed file + df_var_m = pd.read_csv('gzip/' + target_file + '.gz', sep='\\t', header=0) + + ################## GENERATE VARIANTS USING UKBB IMPUTED DATA ############################## + ##read variants + df_var = df_var_m + + ## calculate major and minor alleles and frequencies + df_var['minor_AF'] = pd.to_numeric(df_var['minor_AF']) + major_af = 1 - df_var[['minor_AF']] + df_var['major_AF'] = major_af + + ## split variant into parts + 
df_var[['chr', 'pos', 'ref', 'alt']] = df_var['variant'].str.split(':', expand=True) + df_var['chr'] = df_var['chr'].astype('str') + df_var['pos'] = df_var['pos'].astype('int') + + ## filter out low confidence variants + df_var_hc = df_var.loc[df_var['low_confidence_variant'] == False] + df_var_all = df_var_hc.drop(['low_confidence_variant'], axis=1) + + ## determine which is the minor allele + # Convert to pandas DataFrame in host memory + df_var_all_host = df_var_all.to_pandas() + + # Perform the operation on host memory + df_var_all_host.loc[df_var_all_host['minor_allele'] == df_var_all_host['alt'], 'major_allele'] = df_var_all_host['ref'] + df_var_all_host.loc[df_var_all_host['minor_allele'] == df_var_all_host['ref'], 'major_allele'] = df_var_all_host['alt'] + + # Convert back to cuDF DataFrame in device memory + df_var_all = pd.from_pandas(df_var_all_host) + + ## define variant array and probabilities, patient array + arr_var = df_var_all[['minor_allele','major_allele']].to_numpy() + arr_prob = df_var_all[['minor_AF', 'major_AF']].to_numpy() + + n_patients = int(${params.n_pat} * ${params.percent_male}) + + ## loop through rows, using weighted random choice by variant probability to assign predicted allele + for p in range(n_patients): + df_var_all['prediction'] = arr_var[np.arange(arr_var.shape[0]), (np.random.rand(arr_var.shape[0], 1) < arr_prob.cumsum(axis=1)).argmax(axis=1)] + df_var_final = df_var_all.loc[df_var_all['prediction'] != df_var_all['ref']] + df_var_final = df_var_final[['chr', 'pos', 'ref', 'prediction']] + output_file = f"{target_file.split('.')[0]}_patient_{p}_variants.csv" + #randomly select mutation if more than one mutation is passed for a position + df_var_final = df_var_final.sample(frac=1).drop_duplicates(['chr','pos']).sort_index() + df_var_final.to_csv(output_file, index=False) + + # Main execution + for target in '${dis_var_m}'.split(): + try: + if not os.path.exists(target): + raise FileNotFoundError(f"Input file {target} does not 
exist") + process_variants(target) + except FileNotFoundError as e: + print(f"Error: {e}") + raise + except Exception as e: + print(f"Error processing {target}: {str(e)}") + raise + """ +} + + +process generate_f_variants_cudf { + container 'synthea:cudf' + containerOptions "--rm --privileged --gpus all -v /mnt:/mnt" + maxForks 1 + echo true + + input: + tuple path(dis_var_f), path(dis_var_m) + output: + path "*_variants.csv", emit: f_var + + script: + """ + #!/opt/conda/envs/synthea/bin/python3 + import cudf as pd + import numpy as np + import os + import subprocess + import io + import requests + import itertools + import re + import json + + def process_variants(target_file): + # Create gzip directory if it doesn't exist + os.makedirs('gzip', exist_ok=True) + + # Copy and compress file + subprocess.run(['cp', target_file, 'gzip/'], check=True) + subprocess.run(['gzip', 'gzip/' + target_file], check=True) + print(f"Processing {target_file}.gz") + + # Read the compressed file + df_var_f = pd.read_csv('gzip/' + target_file + '.gz', sep='\\t', header=0) + + ##################### GENERATE VARIANTS USING UKBB IMPUTED DATA ############################## + ## read variant data + df_var = df_var_f + + ## calculate major and minor alleles and frequencies + df_var['minor_AF'] = pd.to_numeric(df_var['minor_AF']) + major_af = 1 - df_var[['minor_AF']] + df_var['major_AF'] = major_af + + ## split variant into parts + df_var[['chr', 'pos', 'ref', 'alt']] = df_var['variant'].str.split(':', expand=True) + df_var['chr'] = df_var['chr'].astype('str') + df_var['pos'] = df_var['pos'].astype('int') + + ## filter out low confidence variants + df_var_hc = df_var.loc[df_var['low_confidence_variant'] == False] + df_var_all = df_var_hc.drop(['low_confidence_variant'], axis=1) + + ## determine which is the minor allele + # Convert to pandas DataFrame in host memory + df_var_all_host = df_var_all.to_pandas() + df_var_all_host.loc[df_var_all_host['minor_allele'] == df_var_all_host['alt'], 
'major_allele'] = df_var_all_host['ref'] + df_var_all_host.loc[df_var_all_host['minor_allele'] == df_var_all_host['ref'], 'major_allele'] = df_var_all_host['alt'] + + # Convert back to cuDF DataFrame in device memory + df_var_all = pd.from_pandas(df_var_all_host) + + ## define variant array and probabilities, patient array + arr_var = pd.DataFrame(data=df_var_all, columns=['minor_allele','major_allele']).to_numpy() + arr_prob = df_var_all[['minor_AF', 'major_AF']].to_numpy() + n_patients = ${params.n_pat} - int(${params.n_pat} * ${params.percent_male}) + + ## loop through rows, using weighted random choice by variant probability to assign predicted allele + for p in range(n_patients): + df_var_all['prediction'] = arr_var[np.arange(arr_var.shape[0]), (np.random.rand(arr_var.shape[0], 1) < arr_prob.cumsum(axis=1)).argmax(axis=1)] + df_var_final = df_var_all.loc[df_var_all['prediction'] != df_var_all['ref']] + df_var_final = df_var_final[['chr', 'pos', 'ref', 'prediction']] + #randomly select mutation if more than one mutation is passed for a position + df_var_final = df_var_final.sample(frac=1).drop_duplicates(['chr','pos']).sort_index() + output_file = f"{target_file.split('.')[0]}_patient_{p}_variants.csv" + df_var_final.to_csv(output_file, index=False) + + # Main execution + for target in '${dis_var_f}'.split(): + try: + if not os.path.exists(target): + raise FileNotFoundError(f"Input file {target} does not exist") + process_variants(target) + except FileNotFoundError as e: + print(f"Error: {e}") + raise + except Exception as e: + print(f"Error processing {target}: {str(e)}") + raise + """ +} + +process generate_m_healthy_cudf { + container 'synthea:cudf' + containerOptions "--rm --privileged --gpus all -v /mnt:/mnt" + maxForks 1 + echo true + + input: + tuple path(healthy_dir) + + output: + path "*_variants.csv", emit: m_healthy + + script: + """ + #!/opt/conda/envs/synthea/bin/python3 + import cudf as pd + import numpy as np + + df_var = 
pd.read_csv('${healthy_dir}/gnomad.genomes.v4.1.sites.male.txt') + + df_var['minor_AF'] = pd.to_numeric(df_var['AF_XY']) + major_af = 1 - df_var[['minor_AF']] + df_var['major_AF'] = major_af + + arr_var = pd.DataFrame(data=df_var, columns=['ALT','REF']).to_numpy() + arr_prob = df_var[['minor_AF', 'major_AF']].to_numpy() + + n_patients = int(${params.n_pat} * ${params.percent_male}) + for p in range(n_patients): + df_var['prediction'] =arr_var[np.arange(arr_var.shape[0]), (np.random.rand(arr_var.shape[0], 1) < arr_prob.cumsum(axis=1)).argmax(axis=1)] + df_var_final = df_var.loc[df_var['prediction'] != df_var['REF']] + df_var_final = df_var_final[['#CHROM', 'POS', 'REF', 'prediction']] + #randomly select mutation if more than one mutation is passed for a position + df_var_final = df_var_final.sample(frac=1).drop_duplicates(['#CHROM','POS']).sort_index() + df_var_final.rename(columns={"#CHROM": "chr", "POS": "pos", "REF": "ref"}, inplace = True) + df_var_final['chr'] = [i.split('chr')[-1] for i in df_var_final.to_pandas()['chr']] + df_var_final.to_csv('healthy_male_patient_' + str(p) + '_variants.csv', index=False) + """ +} + +process generate_f_healthy_cudf { + container 'synthea:cudf' + containerOptions "--rm --privileged --gpus all -v /mnt:/mnt" + maxForks 1 + echo true + + input: + tuple path(healthy_dir) + + output: + path "*_variants.csv", emit: f_healthy + script: + """ + #!/opt/conda/envs/synthea/bin/python3 + import cudf as pd + import numpy as np + df_var = pd.read_csv('${healthy_dir}/gnomad.genomes.v4.1.sites.female.txt') + df_var['minor_AF'] = pd.to_numeric(df_var['AF_XX']) + major_af = 1 - df_var[['minor_AF']] + df_var['major_AF'] = major_af + arr_var = pd.DataFrame(data=df_var, columns=['ALT','REF']).to_numpy() + arr_prob = df_var[['minor_AF', 'major_AF']].to_numpy() + n_patients = ${params.n_pat} - int(${params.n_pat} * ${params.percent_male}) + for p in range(n_patients): + df_var['prediction'] =arr_var[np.arange(arr_var.shape[0]), 
(np.random.rand(arr_var.shape[0], 1) < arr_prob.cumsum(axis=1)).argmax(axis=1)] + df_var_final = df_var.loc[df_var['prediction'] != df_var['REF']] + df_var_final = df_var_final[['#CHROM', 'POS', 'REF', 'prediction']] + #randomly select mutation if more than one mutation is passed for a position + df_var_final = df_var_final.sample(frac=1).drop_duplicates(['#CHROM','POS']).sort_index() + df_var_final.rename(columns={"#CHROM": "chr", "POS": "pos", "REF": "ref"}, inplace = True) + df_var_final['chr'] = [i.split('chr')[-1] for i in df_var_final.to_pandas()['chr']] + df_var_final.to_csv('healthy_female_patient_' + str(p) + '_variants.csv', index=False) + """ +} + +process make_vcfs { + container 'synthea:cudf' + containerOptions "--rm --privileged --gpus all -v /mnt:/mnt" + //publishDir "${params.outdir}/variants_vcfs", mode: 'copy' + echo true + maxForks 1 + input: + path patient_vars + output: + path "${patient_vars.baseName}.vcf", emit: patient_vcfs + //path "${patient_vars.baseName}.vcf.gz", emit: patient_gz + //path "${patient_vars.baseName}.vcf.gz.tbi", emit: patient_index + //path "${patient_vars.baseName}_filt.csv", emit: filtered_csv + script: + """ + ## only print disease-associated variants not found in the ref/major allele position as they will already be covered by the ref genome + #I don't think this is correct \$16 == \$18 cuz we changed csv file format + #it only removes first line from csv + awk '\$16 == \$18 { print \$0 }' ${patient_vars} | tail -n +2 > ${patient_vars.baseName}_filt.csv + + ## copy the variant template to the workdir + cp /mnt/Avatar/dd/synthea/supporting-data/vcf/vcf_template.vcf . 
+ + ## print the filtered variants to vcf format + printf "#CHROM\\tPOS\\tID\\tREF\\tALT\\tQUAL\\tFILTER\\tINFO\\tFORMAT\\tNA00001" >> vcf_template.vcf + printf "\\n" >> vcf_template.vcf + awk -F',' '{print "chr"\$1,\$2,".",\$3,\$4,"100","PASS","NS=1","GT","0/1"}' OFS='\\t' ${patient_vars.baseName}_filt.csv >> vcf_template.vcf + #awk -F',' '{print "chr"\$13,\$14,".",\$15,\$16,"100","PASS","NS=1","GT","0/1"}' OFS='\\t' ${patient_vars.baseName}_filt.csv >> vcf_template.vcf + #awk -F',' '{print \$13,\$14,".",\$15,\$16,"100","PASS","NS=1","GT","0/1"}' OFS='\\t' ${patient_vars.baseName}_filt.csv >> vcf_template.vcf + + # sort the genome karyotypically (in chromosomal order) and index + #files are sorted already just for security + java -jar /synthea/synthea/gatk-4.2.3.0/gatk-package-4.2.3.0-local.jar SortVcf -I vcf_template.vcf -O ${patient_vars.baseName}_sorted.vcf + #Liftover form 19 to 38 standard + java -jar /synthea/synthea/gatk-4.2.3.0/gatk-package-4.2.3.0-local.jar LiftoverVcf -I ${patient_vars.baseName}_sorted.vcf -O ${patient_vars.baseName}.vcf -C /mnt/Avatar/dd/synthea/supporting-data/ucsc-liftover/hg19ToHg38.over.chain.gz -REJECT rejected_variants.vcf -R /mnt/Avatar/dd/synthea/supporting-data/genome/hg38.fa + + #change this part to save on memory usage + #bgzip < ${patient_vars.baseName}.vcf > ${patient_vars.baseName}.vcf.gz + #gatk IndexFeatureFile -I ${patient_vars.baseName}.vcf.gz + #delete unnecessary files + rm vcf_template.vcf + rm ${patient_vars.baseName}_filt.csv + rm ${patient_vars.baseName}_sorted.vcf + rm rejected_variants.vcf + rm *vcf.idx + """ +} diff --git a/main_vcf2prot.nf b/main_vcf2prot.nf new file mode 100644 index 0000000..afbe508 --- /dev/null +++ b/main_vcf2prot.nf @@ -0,0 +1,65 @@ +nextflow.enable.dsl=2 + +process VCF2PROT { + memory 2.GB + + container "${params.container_vcf2prot}" + containerOptions "${params.containerOptions}" + // echo true + publishDir "${params.outdir}/vcf2prot", mode: 'copy' + stageInMode 'copy' + maxForks 1 + + 
input: + path vcf + path vcf_filtered + + output: + path "*.fasta" + + script: + """ + #!/bin/bash + workdir=`pwd` + patient_name=\$(basename $vcf .vcf) + + grep '^#' $vcf > work1.vcf + + /opt/conda/envs/vcf2prot/bin/python3 -c " +import pickle +import numpy as np +import pandas as pd + +with open('$vcf_filtered', 'rb') as fp: + vcf_file = pickle.load(fp) + +vcf_filtered=[] +for i in vcf_file: + vcf_filtered.extend(i[1]) + +vcf_filtered = pd.DataFrame(vcf_filtered) + +#drop duplicates, it returns error, consequence of overlapping genes on genome +vcf_filtered = vcf_filtered.drop_duplicates() + +vcf_filtered = vcf_filtered.replace({'chr':''}, regex=True) + +vcf_filtered = vcf_filtered.sort_values(by=[0,1]) + +with open('work1.vcf', 'a') as f: + for line in np.array(vcf_filtered): + li = str(line).replace('[', '').replace(']', '').replace('\\'', '').replace(' ', '\\t').replace('\\n', '') + f.write(f'{li}\\n') + " + + + awk '{gsub(/^##contig= \$workdir/work_2.vcf + ###remove non canonical chromososms + grep -v '[0-9]_\\|Y_\\|X_\\|Un_' \$workdir/work_2.vcf > \$workdir/work.vcf + ### + bcftools csq -f /home/omic/Homo_sapiens.GRCh38.dna.toplevel.fa -g /home/omic/Homo_sapiens.GRCh38.112.gff3 \$workdir/work.vcf --phase a --ncsq 120 -O v -o \$workdir/annotated.vcf + /home/omic/vcf2prot/bins/Linux/vcf2prot -f \$workdir/annotated.vcf -r /home/omic/vcf2prot/MANE_transcipts_reference.fasta -v -g st -o \$workdir + + mv *.fasta \${patient_name}_transcript_id_mutations.fasta + """ +} diff --git a/ncbiRefSeq_bigger.csv b/ncbiRefSeq_bigger.csv new file mode 100644 index 0000000..aaadd93 --- /dev/null +++ b/ncbiRefSeq_bigger.csv @@ -0,0 +1,289 @@ +,Unnamed: 0,bin,name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,score,name2,cdsStartStat,cdsEndStat,exonFrames 
+2378,2378,9,XM_047416024.1,chr1,+,6785453,7769706,6785550,7766522,20,"6785453,6825091,7091303,7249490,7640399,7661725,7663352,7670910,7677598,7732447,7736343,7736930,7737254,7738297,7744834,7745844,7747709,7751198,7752458,7766458,","6785575,6825210,7091371,7249626,7640553,7661866,7665199,7671037,7677733,7732599,7736540,7737009,7737570,7738482,7745022,7746091,7747781,7751392,7752533,7769706,",0,CAMTA1,cmpl,cmpl,"0,1,0,2,0,1,1,0,1,1,0,2,0,1,0,2,0,0,2,2," +9028,9028,10,XM_005245795.6,chr1,+,13892823,15118043,13893635,15114635,17,"13892823,14180434,14598982,14960683,15034748,15043988,15056090,15060171,15063571,15065629,15094179,15094814,15101542,15103358,15104022,15112426,15114470,","13893756,14180592,14599223,14960875,15034885,15044159,15056280,15060302,15063622,15065753,15094385,15094933,15101774,15103460,15104189,15112541,15118043,",0,KAZN,cmpl,cmpl,"0,1,0,1,1,0,0,1,0,0,1,0,2,0,0,2,0," +4079,4079,1,NM_052896.5,chr1,-,33513997,34165842,33519517,34165813,70,"33513997,33519464,33519811,33521462,33523306,33524881,33527195,33533049,33533795,33537021,33537435,33540524,33541129,33542719,33546036,33550176,33567592,33569373,33571531,33572505,33577295,33580752,33583641,33586503,33587087,33600864,33602368,33605281,33605862,33611040,33614503,33616905,33617498,33622166,33623369,33624518,33625050,33626485,33633421,33635213,33636359,33646647,33652322,33657945,33662889,33692929,33698752,33700516,33709088,33714586,33716285,33724196,33724515,33725348,33726546,33739139,33743279,33772568,33788599,33792422,33810742,33819712,33820468,33825696,33846883,33918093,33935759,34032593,34088976,34165746,","33516570,33519677,33519950,33521550,33523419,33525043,33527258,33533229,33533907,33537095,33537609,33540698,33541309,33542896,33546219,33550350,33567766,33569547,33571726,33572691,33577484,33580899,33583830,33586617,33587168,33601010,33602546,33605470,33605988,33611250,33614620,33616975,33617617,33622271,33623466,33624643,33625254,33626581,33633535,33635330,33636554,33646835,33652461,33658137,
33663092,33693056,33698944,33700673,33709258,33714775,33716501,33724313,33724704,33725536,33726685,33739334,33743606,33772751,33788712,33792526,33810864,33819837,33820556,33825774,33846996,33918301,33935954,34032706,34089193,34165842,",0,CSMD2,cmpl,cmpl,"-1,2,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,0,1,1,0,1,1,1,1,1,1,2,1,1,2,1,1,0,1,1,1,1,1,2,1,1,1,1,2,0,1,2,1,1,2,1,1,2,1,0," +469,469,14,XM_017002595.3,chr1,-,48522510,50023954,48522580,50023796,13,"48522510,48539641,48587003,48590832,48634492,48653336,48663151,48867190,49045583,49245769,49697312,49851395,50023762,","48522692,48539738,48587166,48590985,48634604,48653451,48663241,48867230,49045800,49245864,49697437,49851518,50023954,",0,AGBL4,cmpl,cmpl,"2,1,0,0,2,1,1,0,2,0,1,1,0," +5886,5886,121,XM_047442743.1,chr1,-,50437027,50960267,50441439,50788105,20,"50437027,50475463,50490587,50491720,50535368,50539591,50567076,50582617,50583651,50584684,50596120,50655441,50705785,50738862,50744683,50787999,50801630,50857928,50959014,50959766,","50441523,50475679,50490665,50491801,50535457,50539728,50567231,50582699,50583715,50584811,50596216,50655528,50705891,50738954,50744775,50788205,50801677,50857997,50959111,50960267,",0,FAF1,cmpl,cmpl,"0,0,0,0,1,2,0,2,1,0,0,0,2,0,1,0,-1,-1,-1,-1," +4417,4417,15,NM_001379461.1,chr1,-,56994777,58546726,57010694,57291030,21,"56994777,57010679,57011144,57014882,57023530,57025980,57062889,57069359,57071022,57071521,57072282,57136542,57145289,57290963,57649591,57883998,58150510,58343351,58506059,58527260,58546702,","56998128,57010790,57011272,57015431,57023639,57026043,57062943,57069425,57071061,57071641,57072414,57136641,57145429,57291166,57649665,57884162,58150588,58343403,58506209,58527335,58546726,",0,DAB1,cmpl,cmpl,"-1,0,1,1,0,0,0,0,0,0,0,0,1,0,-1,-1,-1,-1,-1,-1,-1," 
+14956,14956,16,NM_001297440.2,chr1,+,65792509,66374579,65913351,66372678,16,"65792509,65913244,66247459,66257646,66257792,66266037,66332507,66355526,66361614,66363167,66363406,66365666,66367695,66367942,66368786,66372312,","65792630,65913356,66247654,66257683,66257863,66266087,66332620,66355620,66361793,66363266,66363571,66365766,66367850,66368065,66368969,66374574,",0,PDE4B,cmpl,cmpl,"-1,0,2,2,0,2,1,0,1,0,0,0,1,0,0,0," +11282,11282,139,NM_001366838.3,chr1,+,69567921,70144364,69568639,70121887,26,"69567921,69678380,69760190,69792042,69825747,69834779,69838226,69931506,69980378,69986241,69994560,70011796,70012973,70016464,70018718,70021004,70023125,70028170,70036120,70036443,70038112,70053025,70076076,70089726,70107751,70121779,","69568641,69678478,69760393,69792160,69825826,69834869,69838283,69931570,69980453,69986386,69994633,70011926,70013089,70016534,70018818,70021129,70023374,70028371,70036232,70036624,70039793,70053145,70076298,70089819,70107826,70144364,",0,LRRC7,cmpl,cmpl,"0,2,1,0,1,2,2,2,0,0,1,2,0,2,0,1,0,0,0,1,2,0,0,0,0,0," +13105,13105,141,NM_173808.3,chr1,-,71395942,72282539,71407445,72282494,7,"71395942,71592816,71611025,71698007,71776171,71935078,72282318,","71407570,71592968,71611146,71698139,71776297,71935311,72282539,",0,NEGR1,cmpl,cmpl,"1,2,1,1,1,2,0," +19473,19473,18,NM_001349109.3,chr1,+,76074745,76637339,76074866,76634191,4,"76074745,76313804,76412007,76634127,","76074884,76313999,76412417,76637339,",0,ST6GALNAC3,cmpl,cmpl,"0,0,0,2," 
+387,387,18,NM_001393349.1,chr1,+,81306131,81993932,81836984,81991145,26,"81306131,81445020,81580875,81761810,81836884,81907016,81936727,81942033,81942968,81950188,81951017,81951956,81952986,81955876,81966057,81966403,81968025,81969177,81970313,81971851,81979868,81981807,81984582,81985258,81986900,81990390,","81306509,81445089,81580980,81761852,81837057,81907230,81936837,81942045,81943769,81950482,81951121,81952142,81953025,81956060,81966183,81966609,81968199,81969387,81970534,81971918,81979960,81981976,81984711,81985355,81987029,81993932,",0,ADGRL2,cmpl,cmpl,"-1,-1,-1,-1,0,1,2,1,1,1,1,0,0,0,1,1,0,0,0,2,0,2,0,0,1,1," +5084,5084,20,XM_005270562.3,chr1,-,97077742,97921059,97078975,97920922,22,"97077742,97082329,97098488,97193068,97234851,97305258,97306176,97373560,97382392,97450058,97549559,97573759,97593217,97595058,97679094,97691716,97699350,97721509,97740391,97828113,97883263,97920883,","97079146,97082470,97098632,97193248,97234994,97305378,97306297,97373644,97382461,97450223,97549744,97573970,97593387,97595166,97679182,97691798,97699547,97721671,97740479,97828196,97883374,97921059,",0,DPYD,cmpl,cmpl,"0,0,0,0,1,1,0,0,0,0,1,0,1,1,0,2,0,0,2,0,0,0," +5008,5008,29,XM_017000984.2,chr1,+,171841497,172418466,171841656,172418317,20,"171841497,171921747,171987655,171988944,172032401,172033104,172038318,172042008,172044384,172048611,172068818,172081831,172092823,172131174,172253572,172308727,172379017,172387132,172388572,172418280,","171841817,171921821,171987805,171989148,172032500,172033265,172038461,172042144,172044452,172048750,172068905,172081902,172092875,172131288,172253682,172308839,172379182,172387359,172388809,172418466,",0,DNM3,cmpl,cmpl,"0,2,1,1,1,1,0,2,0,2,0,0,2,0,0,2,0,0,2,2," 
+16657,16657,239,XM_047436039.1,chr1,+,174159519,174995308,174219157,174990001,25,"174159519,174219124,174220971,174231144,174241482,174250474,174252479,174272413,174275832,174278612,174304985,174370978,174393994,174637374,174683521,174699524,174702112,174752312,174811831,174957456,174969276,174976077,174978806,174982833,174989848,","174159657,174219184,174221164,174231355,174241657,174250632,174252590,174272480,174275935,174278779,174305127,174371072,174394145,174637488,174683596,174699650,174702256,174752354,174811960,174957549,174969387,174976182,174978890,174982905,174995308,",0,RABGAP1L,cmpl,cmpl,"-1,0,0,1,2,0,2,2,0,1,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0," +21862,21862,34,NM_206933.4,chr1,-,215622890,216423448,215625780,216422336,72,"215622890,215628813,215634458,215639154,215640557,215647521,215648527,215650591,215670971,215674099,215680148,215728029,215741374,215743176,215758594,215759659,215766680,215779842,215782041,215782737,215786669,215790058,215798906,215813735,215816996,215837990,215844293,215845823,215867006,215877757,215878763,215888425,215900074,215900754,215934615,215965316,215970624,215993019,215998886,216000402,216046430,216048533,216070100,216072888,216073096,216078088,216083455,216084697,216086718,216089012,216097082,216175251,216190222,216196552,216198314,216199626,216207272,216217386,216231952,216246584,216250902,216289279,216292174,216321882,216323473,216324167,216325304,216327590,216364952,216418513,216421851,216423213,","215625870,215629035,215634703,215639238,215640734,215647730,215648766,215650801,215671293,215675616,215680376,215728384,215741537,215743335,215758752,215759843,215766788,215780041,215782196,215782935,215786874,215790282,215799125,215813904,215817195,215838103,215844496,215846033,215867170,215877880,215879098,215889054,215900217,215900905,215934795,215965479,215970776,215993167,215999058,216000562,216046592,216048647,216070292,216072969,216073300,216078362,216083586,216084877,216086820,216089139,216097213,216175482,216190367,21619
6722,216198584,216200121,216207431,216217550,216232136,216247226,216251098,216289410,216292370,216321976,216323695,216324352,216325599,216327654,216365085,216418679,216422540,216423448,",0,USH2A,cmpl,cmpl,"0,0,1,1,1,2,0,0,2,0,0,2,1,1,2,1,1,0,1,1,0,1,1,0,2,0,1,1,2,2,0,1,2,1,1,0,1,0,2,1,1,1,1,1,1,0,1,1,1,0,1,1,0,1,1,1,1,2,1,1,0,1,0,2,2,0,2,1,0,2,0,-1," +5754,5754,34,XM_017000625.2,chr1,-,216503245,217137702,216506938,216905339,9,"216503245,216519151,216564218,216567987,216650972,216677075,216905247,216939581,217137666,","216507183,216519421,216564401,216568098,216651089,216677491,216905683,216939673,217137702,",0,ESRRG,cmpl,cmpl,"1,1,1,1,1,2,0,-1,-1," +17643,17643,299,XM_017002028.2,chr1,+,237042183,237833988,237042521,237832647,106,"237042183,237270496,237330877,237364357,237369533,237374716,237377322,237387280,237388086,237417048,237423091,237441318,237445400,237454390,237456599,237469091,237491805,237492953,237496510,237500710,237503288,237506709,237511687,237530426,237548430,237550543,237566566,237569144,237589792,237590639,237591738,237593475,237594099,237595497,237602024,237610761,237614038,237617285,237623764,237625660,237627806,237631426,237633577,237634888,237638356,237639014,237640896,237643326,237648443,237649876,237651410,237654273,237655820,237657943,237659984,237660809,237666511,237667882,237674095,237674730,237678047,237680455,237687454,237698964,237700228,237701977,237705212,237706948,237708857,237709479,237711744,237717197,237718461,237723127,237726272,237727086,237730259,237732045,237733704,237742295,237743599,237756287,237757696,237759775,237760954,237770806,237772011,237773519,237778665,237781564,237783674,237785968,237787987,237791428,237792104,237793866,237795288,237798036,237801855,237806136,237808900,237819035,237828380,237830529,237831513,237832551,","237042569,237270616,237330982,237364372,237369608,237374795,237377435,237387380,237388183,237417123,237423248,237441483,237445522,237454574,237456735,237469187,237491924,237493087,237496752,23750
0903,237503505,237506814,237511791,237530510,237548590,237550691,237566775,237569319,237590001,237590992,237591853,237593636,237594129,237595657,237602111,237610988,237614843,237617486,237623870,237625804,237628080,237631541,237633710,237634992,237638492,237639201,237641002,237643447,237648613,237650097,237651501,237654414,237655984,237658022,237660074,237660947,237666589,237667958,237674219,237674846,237678112,237680577,237687504,237699025,237700467,237702059,237705343,237707269,237709098,237709567,237711837,237717368,237718521,237723262,237726308,237727199,237730356,237732149,237733756,237742349,237743623,237756387,237757776,237759852,237761028,237770887,237772100,237773648,237778770,237781646,237784972,237786036,237788135,237791515,237792323,237793997,237795331,237798170,237801916,237806283,237809035,237819192,237828445,237830630,237831565,237833988,",0,RYR2,cmpl,cmpl,"0,0,0,0,0,0,1,0,1,2,2,0,0,2,0,1,1,0,2,1,2,0,0,2,2,0,1,0,1,0,2,0,2,2,0,0,2,0,0,1,1,2,0,1,0,1,2,0,1,0,2,0,0,2,0,0,0,0,1,2,1,0,2,1,2,1,2,1,1,2,0,0,0,0,0,0,2,0,2,0,0,0,1,0,2,1,1,0,0,0,1,0,2,0,0,0,2,0,2,0,0,0,1,0,2,0," +3358,3358,301,XM_047443244.1,chr1,+,239386567,239915450,239907451,239909224,4,"239386567,239545640,239678185,239907432,","239387227,239545749,239678288,239915450,",0,CHRM3,cmpl,cmpl,"-1,-1,-1,0," +17061,17061,37,NM_001374808.1,chr1,-,240774741,241357230,240800646,241098762,19,"240774741,240800640,240802903,240806139,240811917,240813617,240814715,240816316,240827097,240868586,240868775,240870054,240930716,240936599,240983078,241098665,241101832,241355698,241356898,","240776213,240800721,240802993,240806326,240812043,240813728,240814777,240816415,240827172,240868668,240868852,240870119,240930768,240936706,240983129,241098762,241101978,241355826,241357205,",0,RGS7,cmpl,cmpl,"-1,0,0,2,2,2,0,0,0,2,0,1,0,1,1,0,-1,-1,-1," 
+9259,9259,38,NM_018012.4,chr1,+,245154984,245709432,245155424,245702606,15,"245154984,245156281,245366833,245419578,245540766,245602576,245607650,245609265,245611792,245646120,245684232,245685404,245698105,245698886,245702457,","245155487,245156683,245367367,245419745,245540950,245602783,245607744,245609528,245611976,245646280,245684395,245688807,245698308,245699037,245709432,",0,KIF26B,cmpl,cmpl,"0,0,0,0,2,0,0,1,0,1,2,0,1,0,1," +18940,18940,38,NM_001167740.2,chr1,-,245749346,246507279,245749562,246507217,12,"245749346,245764040,245858495,245863798,245915529,245927930,245929869,246327200,246330479,246335366,246355030,246507053,","245749664,245764149,245858670,245863886,245915640,245928033,245929937,246327337,246330537,246335474,246355094,246507279,",0,SMYD3,cmpl,cmpl,"0,2,1,0,0,2,0,1,0,0,2,0," +337,337,74,NM_018702.4,chr10,-,1177312,1737525,1183192,1737150,10,"1177312,1184860,1199965,1216950,1233693,1242130,1270954,1363027,1379073,1737050,","1183369,1185039,1200147,1217119,1233845,1242299,1271069,1363917,1379160,1737525,",0,ADARB2,cmpl,cmpl,"0,1,2,1,2,1,0,1,1,0," +3080,3080,10,NM_001326317.2,chr10,+,10462549,11336675,11165504,11329053,15,"10462549,10682455,10919963,11165485,11217424,11249152,11257737,11266597,11270665,11275056,11288417,11314138,11321206,11325835,11328925,","10462586,10682494,10919999,11165682,11217507,11249201,11257872,11266677,11270824,11275120,11288552,11314258,11321386,11325979,11336675,",0,CELF2,cmpl,cmpl,"-1,-1,-1,0,1,0,1,1,0,0,1,1,1,1,1," 
+6571,6571,86,NM_018027.5,chr10,-,13643705,14330924,13651904,14330102,25,"13643705,13651902,13654415,13656635,13659322,13660315,13663452,13666096,13670405,13674910,13693897,13701339,13707036,13737843,13740193,13740511,13747735,13761646,13762623,13782921,13796495,13810813,13858846,14330057,14330596,","13647035,13651974,13654512,13657522,13659490,13660553,13663509,13666325,13670528,13675044,13694039,13701478,13707113,13737930,13740251,13740577,13747819,13761669,13762680,13783006,13796588,13810908,13858912,14330183,14330924,",0,FRMD4A,cmpl,cmpl,"-1,2,1,2,2,1,1,0,0,1,0,2,0,0,2,2,2,0,0,2,2,0,0,0,-1," +11589,11589,91,XM_011519454.2,chr10,+,19046926,19734478,19048959,19734237,41,"19046926,19048958,19066718,19087839,19088023,19103978,19123491,19124523,19128220,19133855,19136573,19146197,19155074,19165636,19175207,19203727,19204307,19204897,19209267,19257683,19280046,19283018,19323948,19327562,19331368,19347770,19352005,19387527,19389451,19450306,19491516,19498484,19531193,19567501,19595193,19607776,19615856,19692281,19692457,19730705,19734156,","19046982,19049137,19066859,19087934,19088185,19104075,19123593,19124670,19128387,19133948,19136781,19146344,19155172,19165810,19175328,19203880,19204413,19205265,19209680,19257771,19280223,19283181,19324105,19327673,19331582,19348018,19352297,19387773,19389609,19450490,19491645,19498646,19531351,19567703,19595457,19607902,19615923,19692361,19692554,19730781,19734478,",0,MALRD1,cmpl,cmpl,"-1,0,1,1,0,0,1,1,1,0,0,1,1,0,0,1,1,2,1,0,1,1,2,0,0,1,0,1,1,0,1,1,1,0,1,1,1,2,1,2,0," 
+9222,9222,11,NM_001098500.3,chr10,+,23694726,24547843,24219795,24546324,19,"23694726,24007224,24219625,24380868,24432994,24438385,24473227,24494499,24495146,24501378,24513258,24520122,24521781,24524322,24527935,24531829,24533069,24544980,24545826,","23695234,24007374,24219909,24381067,24433193,24438479,24474060,24494604,24495196,24501545,24513434,24520253,24521929,24524764,24528119,24531993,24533237,24545103,24547843,",0,KIAA1217,cmpl,cmpl,"-1,-1,0,0,1,2,0,2,2,1,0,2,1,2,0,1,0,0,0," +14702,14702,13,NM_019619.4,chr10,-,34109560,34815296,34111168,34814995,25,"34109560,34119612,34131462,34269656,34284134,34317106,34331116,34336198,34337274,34341626,34347955,34359146,34360070,34372497,34374873,34377966,34382539,34384128,34399329,34401825,34450316,34470084,34516978,34696317,34814875,","34111562,34119740,34131583,34269899,34284245,34317338,34331344,34336243,34337426,34341816,34348115,34359317,34360259,34372536,34375002,34378106,34382922,34384254,34399413,34401917,34450448,34470263,34517159,34696419,34815296,",0,PARD3,cmpl,cmpl,"2,0,2,2,2,1,1,1,2,1,0,0,0,0,0,1,2,2,2,0,0,1,0,0,0," +860,860,118,XM_006717951.4,chr10,-,47467992,47991796,47468846,47991772,12,"47467992,47471261,47472132,47473969,47474304,47474944,47475492,47476231,47477080,47478532,47479797,47991679,","47468906,47471384,47472191,47474065,47474398,47475004,47475572,47476322,47477194,47478627,47479888,47991796,",0,ANXA8,cmpl,cmpl,"0,0,1,1,0,0,1,0,0,1,0,0," +16070,16070,15,NM_001098512.3,chr10,+,50990887,52298350,50991378,52293900,18,"50990887,51153163,51467722,51804584,51907506,52054483,52062536,52133839,52161888,52251569,52271349,52272391,52280788,52282152,52288725,52288930,52290223,52293801,","50991644,51153330,51467836,51804690,51907570,52054561,52062631,52133905,52161963,52251666,52271489,52272481,52280930,52282316,52288848,52288993,52290290,52298350,",0,PRKG1,cmpl,cmpl,"0,2,1,1,2,0,0,2,2,2,0,2,2,0,2,2,2,0," 
+14788,14788,15,NM_001354404.2,chr10,-,53802770,55627942,53821857,54664262,35,"53821098,53827392,53828564,53831314,53840319,53857174,53866641,53903242,53938814,53940865,53959731,53961751,53995648,54020191,54022891,54066756,54079330,54089983,54132874,54153099,54183443,54185133,54195682,54213935,54236822,54317270,54329595,54346364,54369119,54378781,54664171,54897449,55166575,55627624,55627864,","53823358,53827548,53828573,53831533,53840496,53857263,53866857,53903370,53938955,53940975,53959844,53961892,53995765,54020416,54023197,54066885,54079424,54090063,54133007,54153293,54183593,54185268,54195889,54214048,54236931,54317441,54329706,54346484,54369275,54378942,54664290,54897500,55166651,55627753,55627942,",0,PCDH15,cmpl,cmpl,"2,2,2,2,2,0,0,1,1,2,0,0,0,0,0,0,2,0,2,0,0,0,0,1,0,0,0,0,0,1,0,-1,-1,-1,-1," +743,743,130,NM_001204403.2,chr10,-,60026297,60733528,60042690,60733319,44,"60026297,60042671,60055657,60059339,60059697,60063110,60064156,60067934,60080536,60082614,60083491,60084601,60085156,60086676,60088146,60105904,60108829,60114224,60134270,60138963,60166590,60166823,60172307,60172899,60173087,60181328,60186714,60196144,60196526,60198339,60200128,60203001,60205791,60208035,60213411,60234687,60261858,60263834,60270130,60278773,60279049,60279537,60615185,60733262,","60029826,60042759,60056036,60059430,60060006,60063254,60064288,60068009,60080618,60082737,60083617,60084830,60085253,60086884,60088358,60106059,60109054,60114331,60134373,60139087,60166653,60166896,60172403,60172998,60173186,60181427,60186912,60196243,60196625,60198537,60200227,60203100,60205890,60208233,60213510,60234786,60261957,60264020,60270229,60278872,60279148,60279639,60615224,60733528,",0,ANK3,cmpl,cmpl,"-1,0,2,1,1,1,1,1,0,0,0,2,1,0,1,2,2,0,2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0," 
+4199,4199,0,XM_047425124.1,chr10,-,65912522,67763594,65920329,67648770,19,"65912522,65966611,65988691,66069307,66103156,66280469,66379151,66520616,66621691,66766263,66775443,67180316,67219606,67521841,67539502,67606856,67647414,67648739,67763433,","65920617,65966746,65988797,66069489,66103249,66280621,66379352,66520773,66621784,66766416,66775524,67180520,67219870,67521961,67539669,67607049,67647518,67648816,67763594,",0,CTNNA3,cmpl,cmpl,"0,0,2,0,0,1,1,0,0,0,0,0,0,0,1,0,1,0,-1," +408,408,17,XM_017015705.2,chr10,+,74151220,74709290,74151278,74708356,9,"74151220,74200763,74224537,74314666,74394140,74398470,74589281,74670182,74708320,","74151343,74200838,74224591,74314745,74394313,74398579,74589317,74670269,74709290,",0,ADK,cmpl,cmpl,"0,2,2,2,0,2,0,0,0," +11205,11205,2,NM_001305581.2,chr10,+,75431623,76560168,75431724,76557288,7,"75431623,75438393,76036007,76047163,76058665,76324400,76557208,","75431754,75438494,76036134,76047303,76058783,76324485,76560168,",0,LRMDA,cmpl,cmpl,"0,0,2,0,2,0,1," +9117,9117,18,XM_017016207.3,chr10,-,76869601,77637808,76877868,77637642,32,"76869601,76887290,76889450,76891524,76909965,76914056,76914935,76944772,76949141,76953800,76969973,77001406,77011966,77019012,77025445,77027822,77039527,77073096,77079480,77084636,77086487,77090399,77108480,77110172,77112366,77120972,77183420,77184822,77251194,77403861,77493797,77637264,","76877893,76887515,76889569,76891719,76910096,76914137,76915049,76944965,76949366,76953924,76970067,77001580,77012043,77019099,77025457,77027891,77039637,77073252,77079550,77084719,77086593,77090510,77108572,77110343,77112442,77121048,77183532,77184916,77251256,77404023,77493929,77637808,",0,KCNMA1,cmpl,cmpl,"2,2,0,0,1,1,1,0,0,2,1,1,2,2,2,2,0,0,2,0,2,2,0,0,2,1,0,2,0,0,0,0," 
+13578,13578,18,XM_047424512.1,chr10,+,81875193,82987179,81875340,82985605,9,"81875193,82358738,82738576,82951468,82958948,82973787,82978949,82984782,82985097,","81876163,82358868,82738650,82951571,82959075,82973915,82979120,82984854,82987179,",0,NRG3,cmpl,cmpl,"0,1,2,1,2,0,2,2,2," +7470,7470,19,NM_017551.3,chr10,-,85599551,86366795,85602272,86366392,16,"85599551,85613406,85619866,85647201,85723002,85724351,85727854,85729512,85854495,85856028,85869009,85916185,86138818,86206363,86363940,86366313,","85602701,85613647,85620033,85647397,85723141,85724676,85728052,85729614,85854615,85856190,85869180,85916239,86139024,86206648,86364096,86366795,",0,GRID1,cmpl,cmpl,"0,2,0,2,1,0,0,0,0,0,0,0,1,1,1,0," +8254,8254,20,XM_047425614.1,chr10,-,98457076,99315951,98459573,99144331,12,"98457076,98482635,98490050,98614903,98620601,98641846,98693899,98721656,98743882,99144237,99232347,99315928,","98459739,98482782,98490196,98615018,98620708,98641940,98693947,98721828,98744056,99144399,99232505,99315951,",0,HPSE2,cmpl,cmpl,"2,2,0,2,0,2,2,1,1,0,-1,-1," +19053,19053,21,NM_014978.3,chr10,+,104641289,105265242,104641327,105263374,27,"104641289,104842791,104915832,104977334,105043054,105089774,105105396,105139396,105147616,105157137,105158891,105164302,105167257,105178065,105199998,105201119,105211136,105214441,105216935,105223115,105245541,105247218,105252774,105255701,105256818,105262330,105263309,","104641954,104842859,104915932,104977493,105043128,105089839,105105515,105139486,105147796,105157284,105158994,105164379,105167349,105178173,105200116,105201253,105211250,105214613,105217122,105223249,105245665,105247331,105252906,105255801,105256924,105262491,105265242,",0,SORCS3,cmpl,cmpl,"0,0,2,0,0,2,1,0,0,0,0,1,0,2,2,0,2,2,0,1,0,1,0,0,1,2,1," 
+19051,19051,21,XM_017015617.1,chr10,-,106573662,107181138,106574101,107181066,27,"106573662,106579090,106579368,106597350,106607165,106611910,106618148,106620427,106629201,106652381,106667688,106671236,106672867,106675048,106677312,106679255,106679631,106688191,106699213,106706544,106709222,106730049,106761587,106776533,106829573,106956512,107181063,","106574181,106579251,106579474,106597450,106607297,106612023,106618272,106620561,106629388,106652553,106667802,106671367,106672985,106675156,106677404,106679332,106679734,106688338,106699393,106706634,106709341,106730114,106761661,106776692,106829673,106956580,107181138,",0,SORCS1,cmpl,cmpl,"1,2,1,0,0,1,0,1,0,2,2,0,2,2,0,1,0,0,0,0,1,2,0,0,2,0,0," +1465,1465,22,XM_011539588.3,chr10,+,115093364,115948999,115093544,115944779,29,"115093364,115120184,115121698,115127592,115129326,115160039,115165557,115171036,115215696,115241570,115265192,115266796,115268325,115281354,115286215,115300033,115301854,115315517,115334281,115394658,115426249,115461940,115467173,115469171,115519262,115549457,115727247,115847876,115944657,","115093636,115120268,115121812,115127721,115129535,115160214,115165645,115171292,115215880,115241725,115265277,115267005,115268444,115281487,115286397,115300247,115302043,115315736,115334419,115394752,115426302,115462035,115467252,115469329,115519324,115549536,115727355,115847991,115948999,",0,ATRNL1,cmpl,cmpl,"0,2,2,2,2,1,2,0,1,2,1,2,1,0,1,0,1,1,1,1,2,1,0,1,0,2,0,0,1," 
+5022,5022,194,NM_001377548.1,chr10,+,126905427,127452516,126905517,127451427,51,"126905427,126977947,126981917,126987520,126990454,126996747,126998091,126999353,127000171,127008731,127012294,127018709,127023199,127024684,127026351,127031649,127032136,127037718,127042624,127043063,127052680,127061667,127106230,127110247,127125473,127127668,127248007,127257334,127339005,127343645,127354668,127362063,127373780,127374057,127380081,127381277,127384789,127403054,127404324,127409036,127409312,127410839,127415151,127418364,127419665,127425873,127433282,127439026,127444125,127447393,127451331,","126905563,126977988,126981973,126987617,126990603,126996883,126998249,126999435,127000307,127008804,127012374,127018835,127023324,127024783,127026424,127031753,127032320,127037816,127042714,127043164,127052815,127061776,127106301,127110354,127125601,127127764,127248109,127257429,127339084,127343746,127354727,127362212,127373866,127374214,127380122,127381368,127384909,127403144,127404429,127409178,127409391,127410924,127415238,127418541,127419749,127426011,127433428,127439225,127444279,127447545,127452516,",0,DOCK1,cmpl,cmpl,"0,1,0,2,0,2,0,2,0,1,2,1,1,0,0,1,0,1,0,0,2,2,0,2,1,0,0,0,2,0,2,1,0,2,0,2,0,0,0,0,1,2,0,0,0,0,0,2,0,1,0," 
+17755,17755,82,NM_001424318.1,chr11,-,9778667,10303837,9780417,10300496,41,"9778667,9781506,9784350,9785124,9787633,9789108,9790555,9795830,9807999,9808900,9812531,9816839,9829355,9832223,9839497,9842624,9845564,9846955,9850022,9852675,9853539,9856457,9858225,9895942,9961956,9963772,9968340,9989496,9992414,9992989,9993920,9998265,10000913,10002556,10028451,10029764,10031047,10042843,10193901,10300405,10303295,","9780516,9781638,9784438,9785318,9787738,9789342,9790683,9795957,9808185,9809002,9812708,9817024,9829496,9832420,9839696,9842770,9845740,9847083,9850218,9852749,9853712,9856720,9858396,9896011,9962106,9963882,9968545,9989595,9992543,9993103,9993998,9998379,10001022,10002689,10028557,10029875,10031170,10042981,10193987,10300498,10303837,",0,SBF2,cmpl,cmpl,"0,0,2,0,0,0,1,0,0,0,0,1,1,2,1,2,0,1,0,1,2,0,0,0,0,1,0,0,0,0,0,0,2,1,0,0,0,0,1,0,-1," +19080,19080,88,NM_001367872.1,chr11,-,15966448,16738477,15972808,16341248,17,"15966448,15986203,15988996,16014941,16046513,16049754,16055751,16097608,16111802,16183885,16186782,16234581,16318445,16341011,16612080,16714829,16738424,","15973112,15986420,15989230,16015050,16046701,16049938,16055901,16097688,16111923,16183954,16186955,16234671,16318653,16341252,16612260,16714905,16738477,",0,SOX6,cmpl,cmpl,"2,1,1,0,1,0,0,1,0,0,1,1,0,0,-1,-1,-1," 
+12885,12885,11,XM_047427831.1,chr11,+,19345235,20121601,19350952,20118258,37,"19345235,19350372,19832483,19842870,19868924,19879868,19892433,19933175,19939660,19946400,19948690,19984124,20035958,20043980,20044967,20048727,20054079,20055768,20062306,20068185,20068323,20077551,20077992,20080063,20083006,20090864,20092205,20093098,20095671,20097576,20100936,20103254,20103652,20105530,20107663,20114591,20118132,","19345311,19351027,19832601,19842923,19868997,19880127,19892594,19934277,19939773,19946509,19949080,19984247,20036097,20044272,20045670,20049195,20054240,20055957,20062359,20068209,20068398,20077635,20078104,20080209,20083179,20091018,20092368,20093199,20095767,20097745,20101172,20103409,20103724,20105727,20107782,20114795,20121601,",0,NAV2,cmpl,cmpl,"-1,0,0,1,0,1,2,1,2,1,2,2,2,0,1,2,2,1,1,0,0,0,0,1,0,2,0,1,0,0,1,0,2,2,1,0,0," +13130,13130,11,NM_001288714.1,chr11,+,20669550,21575686,20669723,21575022,19,"20669550,20677931,20783679,20885443,20918181,20919251,20927307,20928376,20937785,20947335,20960431,21113588,21229331,21370852,21534373,21560188,21570763,21573184,21574971,","20669778,20678060,20783830,20885540,20918254,20919334,20927442,20928479,20937859,20947435,20960560,21113714,21229454,21370948,21534514,21560382,21570940,21573409,21575686,",0,NELL1,cmpl,cmpl,"0,1,1,2,0,1,0,0,1,0,1,1,1,1,1,1,0,0,0," +11397,11397,96,NM_001252008.2,chr11,+,24497052,25082638,24738227,25078658,12,"24497052,24729172,24732117,24738220,24763245,24905990,24914475,24976590,24983125,25050037,25077328,25078553,","24497305,24729286,24732188,24738302,24763308,24906053,24914538,24976665,24983293,25050130,25077406,25082638,",0,LUZP2,cmpl,cmpl,"-1,-1,-1,0,0,0,0,0,0,0,0,0," +11268,11268,13,XM_047427350.1,chr11,-,40114198,41459652,40114369,40116292,10,"40114198,40140800,40241518,40319627,40648141,40933634,41222796,41330938,41346263,41459430,","40116334,40140853,40241598,40319721,40648278,40933723,41223008,41345575,41346346,41459652,",0,LRRC4C,cmpl,cmpl,"0,-1,-1,-1,-1,-1,-1,-1,-1,-1," 
+18207,18207,140,XM_047426616.1,chr11,-,70467853,71253228,70472868,71147326,23,"70467853,70485313,70490275,70492334,70502205,70502795,70659827,70661595,70698687,70798442,70807001,70820363,70896500,71056480,71075158,71092421,71094536,71109940,71113292,71118828,71147119,71224696,71253034,","70473439,70487720,70490387,70492465,70502286,70502931,70659952,70661678,70698763,70798556,70807171,70820682,70896567,71056558,71075275,71092589,71094688,71110049,71113364,71119032,71147338,71224796,71253228,",0,SHANK2,cmpl,cmpl,"2,1,0,1,1,0,1,2,1,1,2,1,0,0,0,0,1,0,0,0,0,-1,-1," +20214,20214,148,NM_001098816.3,chr11,-,78652828,79441030,78658057,79069944,34,"78652828,78661448,78668936,78672032,78676151,78688053,78701525,78708360,78712481,78720369,78722667,78726078,78729375,78732315,78738450,78756804,78770991,78778601,78786897,78805291,78812121,78814293,78854103,78855963,78862961,78889784,78891237,78903267,79064737,79069721,79148709,79215807,79297487,79440508,","78658816,78661591,78670551,78672329,78676387,78688226,78702403,78708515,78712714,78720390,78722917,78726222,78729643,78732577,78738570,78757021,78771138,78778628,78787083,78805492,78812316,78814395,78854314,78856178,78863132,78890020,78891336,78903523,79065007,79070009,79148806,79215909,79297543,79441030,",0,TENM4,cmpl,cmpl,"0,1,0,0,1,2,0,1,2,2,1,1,0,2,2,1,1,1,1,1,1,1,0,1,1,2,2,1,1,0,-1,-1,-1,-1," 
+4861,4861,2,XM_047426495.1,chr11,-,83455011,85628373,83459817,85626617,28,"83455011,83462001,83466707,83469200,83471625,83472726,83484128,83532707,83541681,83633210,83786689,83833613,83874419,83930327,83962884,83965323,83980505,84059314,84098922,84163460,84251237,84534569,85111660,85154555,85285219,85308984,85598656,85626598,","83459924,83462093,83466817,83469373,83471727,83472777,83484228,83532783,83541858,83633325,83786792,83833770,83874488,83930483,83963023,83965468,83980642,84059484,84099047,84163511,84251291,84534731,85111735,85154651,85285365,85309023,85598788,85628373,",0,DLG2,cmpl,cmpl,"1,2,0,1,1,1,0,2,2,1,0,2,2,2,1,0,1,2,0,0,0,0,0,0,1,1,1,0," +7504,7504,157,NM_001143831.3,chr11,-,88504641,89065982,88508591,89047872,10,"88504641,88525308,88567052,88590600,88597183,88604717,88653167,88849905,89047211,89065775,","88509504,88525404,88567992,88590727,88597352,88604964,88653403,88850155,89048072,89065982,",0,GRM5,cmpl,cmpl,"2,2,1,0,2,1,2,1,0,-1," +6146,6146,2,XM_017017186.2,chr11,+,92224817,92896473,92524671,92891113,28,"92224817,92524633,92697383,92761855,92764878,92765506,92774040,92789942,92792766,92797835,92805152,92806361,92809842,92831621,92834869,92836565,92837662,92840561,92843933,92857213,92859164,92866740,92880730,92882737,92886999,92889188,92889855,92890490,","92225174,92524948,92697445,92762170,92765089,92765575,92774180,92790218,92792977,92801909,92805349,92806515,92810076,92832011,92835084,92836703,92837806,92840759,92844732,92857348,92859322,92867209,92880884,92883393,92887113,92889248,92889891,92896473,",0,FAT3,cmpl,cmpl,"-1,0,1,0,0,1,1,0,0,1,1,0,1,1,1,0,0,0,0,1,1,0,1,2,1,1,1,1," 
+3704,3704,20,XM_047427130.1,chr11,+,99020948,100358885,99556214,100356220,23,"99020948,99556144,99844851,99845086,99916053,99956805,100002033,100061211,100070423,100071704,100074143,100191125,100193487,100224691,100255759,100271091,100297624,100299161,100308358,100340462,100341092,100350701,100356116,","99021270,99556269,99844975,99845262,99916149,99957009,100002136,100061393,100070560,100071834,100074294,100191253,100193663,100224812,100255918,100271241,100297695,100299396,100308468,100340649,100341205,100350870,100358885,",0,CNTN5,cmpl,cmpl,"-1,0,1,2,1,1,1,2,1,0,1,2,1,0,1,1,1,0,1,0,1,0,1," +9625,9625,24,XM_011543027.3,chr11,-,126423357,127000870,126424579,127000509,17,"126423357,126425637,126429178,126431343,126435267,126436810,126440448,126444978,126446758,126449008,126456348,126463156,126473308,126521314,126526537,126562834,127000454,","126425023,126425724,126429288,126431526,126435303,126437009,126440549,126445105,126446886,126449157,126456454,126463307,126473466,126521464,126526687,126562912,127000870,",0,KIRREL3,cmpl,cmpl,"0,0,1,1,1,0,1,0,1,2,1,0,1,1,1,1,0," +13653,13653,24,NM_001048209.2,chr11,+,131370614,132336822,131370806,132335146,8,"131370614,131911563,132146281,132212021,132307688,132310111,132314551,132335045,","131370888,131911648,132146514,132212147,132307823,132310232,132314703,132336822,",0,NTM,cmpl,cmpl,"0,1,2,1,1,1,2,1," +13870,13870,24,NM_001012393.5,chr11,-,132414980,133532501,132420192,133532324,8,"132414980,132436085,132436658,132437221,132529060,132657086,132942925,133532263,","132420293,132436237,132436779,132437359,132529186,132657319,132943010,133532501,",0,OPCML,cmpl,cmpl,"1,2,1,1,1,2,1,0," 
+2293,2293,9,XM_017019953.2,chr12,+,1970779,2697950,1971062,2691199,47,"1970779,2115223,2120324,2448975,2457566,2486103,2493189,2504435,2512811,2549942,2556950,2566421,2567568,2581589,2582821,2584502,2585375,2585834,2593212,2595873,2597436,2601853,2605080,2605678,2606610,2606983,2608510,2610540,2611902,2633628,2648474,2651639,2653834,2655146,2664824,2665580,2666685,2668932,2674540,2677093,2677732,2679443,2682549,2685735,2686165,2688446,2690899,","1971201,2115545,2120430,2449115,2457706,2486262,2493386,2504539,2512984,2550033,2556977,2566582,2567794,2581797,2582942,2584617,2585496,2585904,2593345,2596003,2597496,2601960,2605168,2605786,2606663,2607130,2608712,2610699,2612013,2633712,2648507,2651768,2653900,2655238,2664990,2665708,2666782,2669035,2674642,2677221,2677867,2679796,2682678,2685842,2686269,2688779,2697950,",0,CACNA1C,cmpl,cmpl,"0,1,2,0,2,1,1,0,2,1,2,2,1,2,0,1,2,0,1,2,0,0,2,0,0,2,2,0,0,0,0,0,0,0,2,0,2,0,1,1,0,0,2,2,1,0,0," +19079,19079,11,XM_047429457.1,chr12,-,23529503,24562650,23534218,23896023,21,"23529503,23536452,23543210,23546315,23563257,23575660,23604386,23640811,23665443,23734683,23740866,23755637,23845982,23895792,24213342,24277215,24331276,24368562,24407484,24460595,24562337,","23534522,23536669,23543384,23546424,23563403,23575838,23604533,23640897,23665567,23734752,23741039,23755724,23846193,23896024,24213417,24277312,24331396,24368639,24407566,24460776,24562650,",0,SOX5,cmpl,cmpl,"2,1,1,0,1,0,0,1,0,0,1,1,0,0,-1,-1,-1,-1,-1,-1,-1," +20573,20573,14,XM_047429662.1,chr12,+,43795801,44399108,43844651,44388672,9,"43795801,43795974,43844623,43944209,44143524,44211289,44299579,44376594,44388025,","43795820,43796150,43844928,43944342,44143624,44211387,44299739,44376724,44389758,",0,TMEM117,cmpl,cmpl,"-1,-1,0,1,2,0,2,0,1," +19861,19861,16,XM_024448962.2,chr12,-,61708272,62260034,61710405,62259544,6,"61708272,61753621,61754871,61867319,62258762,62259478,","61710417,61753746,61755024,61867426,62258842,62260034,",0,TAFA2,cmpl,cmpl,"0,1,1,2,0,0," 
+7487,7487,136,XM_005268754.5,chr12,-,66347430,67069338,66349018,67069107,25,"66347430,66353416,66371693,66377016,66377173,66379279,66392307,66392676,66394207,66406282,66420719,66432547,66444583,66445321,66455408,66456186,66462923,66465274,66515618,66517900,66529830,66539077,66541814,66596846,67069049,","66349246,66353563,66371927,66377061,66377285,66379436,66392502,66392816,66394352,66406428,66420789,66432628,66444729,66445508,66455564,66456342,66463093,66465422,66515764,66517976,66529914,66539223,66541950,66596927,67069338,",0,GRIP1,cmpl,cmpl,"0,0,0,0,2,1,1,2,1,2,1,1,2,1,1,1,2,1,2,1,1,2,1,1,0," +21121,21121,17,XM_017019244.2,chr12,+,72087265,72670758,72286810,72663195,20,"72087265,72105647,72286680,72377994,72469757,72473066,72499497,72542290,72562164,72562852,72568567,72575254,72575486,72618890,72621107,72621643,72652321,72653015,72656926,72663051,","72087439,72105752,72286954,72378121,72469912,72473180,72499635,72542356,72562230,72563040,72568656,72575388,72575542,72619038,72621205,72621751,72652489,72653156,72657008,72670758,",0,TRHDE,cmpl,cmpl,"-1,-1,0,0,1,0,0,0,0,0,2,1,0,2,0,2,2,2,2,0," +12886,12886,18,XM_047429826.1,chr12,+,77571861,78213010,77572194,78210517,35,"77571861,77940318,77941080,77966228,77968518,77998336,78006418,78021746,78049992,78050763,78058995,78116771,78118026,78119236,78128705,78137176,78140281,78146368,78148841,78159202,78168754,78175305,78177140,78179528,78180870,78185600,78188247,78188608,78189983,78197246,78198604,78199334,78200472,78204934,78210397,","77572266,77940436,77941133,77966301,77968702,77998476,78007445,78021862,78050101,78051147,78059115,78116904,78118297,78119945,78128866,78137365,78140334,78146392,78148919,78159286,78168866,78175427,78177313,78179682,78181045,78185698,78188343,78188777,78190219,78197401,78198676,78199531,78200591,78205138,78213010,",0,NAV3,cmpl,cmpl,"0,0,1,0,1,2,1,2,1,2,2,2,0,1,2,1,1,0,0,0,0,1,0,2,0,1,0,0,1,0,2,2,1,0,0," 
+19765,19765,148,NM_001135805.2,chr12,+,78863981,79452008,79217519,79449124,12,"78863981,78864997,78977798,79047296,79217502,79285786,79292007,79296068,79299383,79353501,79444072,79448917,","78864335,78865109,78977931,79047362,79217685,79285971,79292130,79296236,79299551,79353619,79444206,79452008,",0,SYT1,cmpl,cmpl,"-1,-1,-1,-1,0,1,0,0,0,0,1,0," +12048,12048,19,NM_001351285.2,chr12,-,85955666,86839000,85979288,85989546,9,"85955666,85983522,85989399,86049673,86334064,86435156,86495250,86727208,86838665,","85980430,85983670,85989552,86049723,86334127,86435265,86495315,86727241,86839000,",0,MGAT4C,cmpl,cmpl,"1,0,0,-1,-1,-1,-1,-1,-1," +820,820,20,XM_006719507.5,chr12,-,98734785,99984936,98735559,99984237,27,"98734785,98751354,98773041,98781116,98782125,98798933,98800996,98807843,98829173,98832028,99053156,99084924,99154288,99244341,99246274,99399630,99443672,99504475,99655066,99772921,99775547,99779870,99782021,99806403,99812154,99825308,99984103,","98735634,98751522,98773179,98781203,98782137,98799005,98801125,98807918,98829353,98832136,99053309,99085023,99154395,99244414,99246864,99399811,99443809,99504641,99655210,99773088,99775661,99779972,99782097,99806700,99812311,99825389,99984936,",0,ANKS1B,cmpl,cmpl,"0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,1,1,1,0,0,2,2,0," +20593,20593,196,NM_133448.3,chr12,-,129071725,129904025,129073874,129903339,9,"129071725,129078533,129081758,129084496,129209519,129337633,129531058,129699809,129903260,","129075059,129078725,129082032,129084702,129209663,129337817,129531205,129700698,129904025,",0,TMEM132D,cmpl,cmpl,"0,0,2,0,0,2,2,1,0," 
+2294,2294,73,XM_054332314.1,chr12_KZ208916v1_fix,+,311695,1046066,313206,1039315,47,"311695,457367,462468,797091,805682,834219,841305,852551,860927,898058,905066,914537,915684,929705,930937,932618,933491,933950,941328,943989,945552,949969,953196,953794,954726,955099,956626,958656,960018,981744,996590,999755,1001950,1003262,1012940,1013696,1014801,1017048,1022656,1025209,1025848,1027559,1030665,1033851,1034281,1036562,1039015,","313345,457689,462574,797231,805822,834378,841502,852655,861100,898149,905093,914698,915910,929913,931058,932733,933612,934020,941461,944119,945612,950076,953284,953902,954779,955246,956828,958815,960129,981828,996623,999884,1002016,1003354,1013106,1013824,1014898,1017151,1022758,1025337,1025983,1027912,1030794,1033958,1034385,1036895,1046066,",0,CACNA1C,cmpl,cmpl,"0,1,2,0,2,1,1,0,2,1,2,2,1,2,0,1,2,0,1,2,0,0,2,0,0,2,2,0,0,0,0,0,0,0,2,0,2,0,1,1,0,0,2,2,1,0,0," +1450,1450,97,NM_001411005.1,chr13,+,25371973,26025851,25372212,26019985,36,"25371973,25468976,25529998,25530561,25532271,25533272,25537987,25540318,25541918,25543290,25551337,25553792,25554990,25558972,25559720,25563955,25570766,25571609,25574807,25577068,25578814,25579807,25581818,25589634,25699172,25769045,25774848,25837162,25839545,25860194,25860803,25862300,25961574,25968574,26012530,26019887,","25372288,25469121,25530098,25530660,25532317,25533313,25538061,25540388,25542046,25543402,25551503,25553920,25555068,25559061,25559765,25564031,25570872,25571692,25574857,25577138,25578899,25579947,25581957,25589699,25699345,25769229,25774959,25837285,25839624,25860256,25860860,25862408,25961663,25968679,26012622,26025851,",0,ATP8A2,cmpl,cmpl,"0,1,2,0,0,1,0,2,0,2,0,1,0,0,2,2,0,1,0,2,0,1,0,1,0,2,0,0,0,1,0,0,0,2,2,1," 
+12591,12591,12,NM_001384605.1,chr13,+,28819962,29505947,29024698,29503206,16,"28819962,28839777,29024456,29033884,29100772,29281703,29324612,29359261,29439982,29480149,29487899,29492645,29497237,29498417,29501096,29502992,","28820285,28839850,29026903,29034125,29100970,29281865,29324711,29359473,29440049,29480364,29488005,29492719,29497336,29498537,29501194,29505947,",0,MTUS2,cmpl,cmpl,"-1,-1,0,0,1,1,1,1,0,1,0,1,0,0,0,2," +19501,19501,1,NM_001243476.3,chr13,-,33103136,33676794,33105592,33524267,18,"33103136,33106757,33109872,33110685,33111777,33112720,33118064,33126080,33127372,33128928,33142309,33165336,33167550,33439690,33524237,33661276,33663889,33676677,","33105710,33106934,33110090,33110907,33111892,33112931,33118263,33126240,33127546,33130289,33142373,33165418,33167622,33439724,33524372,33661343,33664137,33676794,",0,STARD13,cmpl,cmpl,"2,2,0,0,2,1,0,2,2,0,2,1,1,0,0,-1,-1,-1," +12892,12892,13,XM_047430269.1,chr13,+,34942269,35672736,34942820,35670991,58,"34942269,35040932,35044946,35045305,35048562,35050268,35056009,35058716,35069907,35070718,35098296,35109289,35110809,35117413,35118227,35118376,35123481,35142268,35155773,35156082,35157077,35159015,35161749,35164355,35171271,35173463,35176995,35182359,35183975,35195863,35208699,35211052,35232491,35290388,35309527,35349107,35352156,35432268,35452091,35472399,35550476,35550929,35554986,35566904,35583897,35593327,35606425,35628080,35645868,35646258,35649654,35651804,35654854,35655578,35665084,35667373,35668367,35670900,","34943114,35041164,35045047,35045401,35048684,35050395,35056129,35058863,35070105,35070852,35098405,35109442,35110978,35117493,35118290,35118474,35123574,35142377,35155855,35156206,35157270,35160032,35161967,35164509,35171452,35173594,35177103,35182528,35184071,35196302,35208836,35211179,35232619,35290450,35309592,35349216,35352323,35432393,35452235,35472536,35550594,35551032,35555102,35567017,35584038,35593447,35606578,35628248,35645931,35646348,35649847,35651876,35655010,35655749,35665186,35667
570,35668519,35672736,",0,NBEA,cmpl,cmpl,"0,0,1,0,0,2,0,0,0,0,2,0,0,1,0,0,2,2,0,1,2,0,0,2,0,1,0,0,1,1,2,1,2,1,0,2,0,2,1,1,0,1,2,1,0,0,0,0,0,0,0,1,1,1,1,1,0,2," +5570,5570,114,NM_001347966.2,chr13,-,43213129,43786972,43213989,43412914,18,"43213129,43224052,43236635,43265397,43269469,43298345,43322383,43326418,43344537,43355918,43359850,43361278,43411915,43412844,43484008,43667478,43761728,43786651,","43214121,43224138,43236738,43265454,43269577,43298530,43322501,43326525,43344750,43356152,43360057,43361452,43412053,43412988,43484152,43667544,43761837,43786972,",0,ENOX1,cmpl,cmpl,"0,1,0,0,0,1,0,1,1,1,1,1,1,0,-1,-1,-1,-1," +14795,14795,0,NM_203487.3,chr13,-,66302833,67230336,66304654,67228440,5,"66302833,66631209,66903503,67225404,67229779,","66305028,66631411,66903605,67228575,67230336,",0,PCDH9,cmpl,cmpl,"1,0,0,0,-1," +9640,9640,143,NM_001400139.1,chr13,-,73686088,74306045,73695489,73995022,8,"73686088,73715367,73764937,73813151,73845826,73943980,73994989,74305995,","73695671,73715525,73765000,73813287,73846373,73944070,73995053,74306045,",0,KLF12,cmpl,cmpl,"1,2,2,1,0,0,0,-1," +7298,7298,2,NM_004466.6,chr13,+,91398620,92867237,91399046,92866439,8,"91398620,91448760,91693186,91728531,91756294,91907936,92144829,92866281,","91399209,91448922,91693881,91728665,91756420,91908057,92144989,92867237,",0,GPC5,cmpl,cmpl,"0,1,1,0,2,2,0,1," +7299,7299,20,XM_047429990.1,chr13,+,93216528,94408020,93545312,94403217,9,"93216528,93545262,93830153,94027728,94286348,94305979,94382413,94398465,94403014,","93226682,93545421,93830545,94027894,94286479,94306123,94382550,94398641,94408020,",0,GPC6,cmpl,cmpl,"-1,0,1,0,1,0,0,2,1," +8282,8282,20,NM_153456.4,chr13,+,96090106,96839562,96090862,96833198,2,"96090106,96832489,","96091569,96839562,",0,HS6ST3,cmpl,cmpl,"0,2," 
+6324,6324,21,NM_001321938.2,chr13,-,101710803,102402443,101722830,102220709,7,"101710803,101726611,101868724,101875185,102220696,102401470,102401953,","101722967,101726810,101868828,101875296,102220933,102401598,102402443,",0,FGF14,cmpl,cmpl,"1,0,1,1,0,-1,-1," +12844,12844,175,NM_001080396.3,chr13,-,107163509,107867496,107170496,107866596,3,"107163509,107210583,107865681,","107170786,107210755,107867496,",0,NALF1,cmpl,cmpl,"1,0,0," +12739,12739,2,XM_047430183.1,chr13,+,108495715,109208005,108665923,109206836,35,"108495715,108665885,108712660,108727439,108785634,108793515,108806678,108820336,108823124,108844345,108855442,108866176,108883058,108888371,108898015,108910002,108957687,108961538,108962423,108964760,108992375,109008896,109019710,109046915,109052299,109055045,109055389,109100784,109120369,109125111,109127281,109140263,109164900,109179541,109206608,","108495924,108666149,108712731,108727583,108785743,108793640,108806804,108820412,108823278,108844493,108855553,108866242,108883186,108888477,108898133,108910150,108957799,108961656,108962495,108964902,108992448,109009049,109019911,109046991,109052475,109055126,109055595,109100887,109120466,109125358,109127550,109141376,109165059,109179633,109208005,",0,MYO16,cmpl,cmpl,"-1,0,1,0,0,1,0,0,1,2,0,0,0,2,0,1,2,0,1,1,2,0,0,0,1,0,0,2,0,1,2,1,1,1,0," +13458,13458,1,XM_017021585.2,chr14,+,32934784,33804173,32934845,33801109,11,"32934784,33215181,33367185,33560120,33676210,33735213,33774336,33778465,33793896,33797456,33799733,","32935003,33215426,33367268,33560210,33676385,33735332,33774530,33778572,33794044,33797581,33804173,",0,NPAS3,cmpl,cmpl,"0,2,1,0,0,1,0,2,1,2,1," 
+11874,11874,14,NM_001113498.3,chr14,-,46839622,47675605,46841930,47674796,17,"46840091,46845765,46855023,46873432,46874044,46877488,46882043,46920011,46957373,47035010,47061248,47096853,47131713,47144077,47218020,47301410,47674516,","46842019,46845871,46855154,46873591,46874200,46877509,46882221,46920160,46957643,47035304,47061578,47097123,47131846,47144274,47218195,47301550,47675605,",0,MDGA2,cmpl,cmpl,"1,0,1,1,1,1,0,1,1,1,1,1,0,1,0,1,0," +7307,7307,0,XM_047430879.1,chr14,+,66508146,67735355,66508527,67735340,15,"66508146,66681106,66776463,66824473,66842677,66879938,66916002,66922665,66924193,66965190,66985681,66987935,67023632,67058648,67735194,","66508591,66681185,66776521,66824566,66842716,66880033,66916069,66922938,66924292,66965325,66985738,66988007,67023675,67058786,67735355,",0,GPHN,cmpl,cmpl,"0,1,2,0,0,0,2,0,0,0,0,0,0,1,1," +16685,16685,17,NM_001321818.2,chr14,+,67819778,68683096,67823543,68682944,11,"67819778,67823541,67825463,67835079,67865002,67885868,67887020,68291883,68411423,68468171,68682936,","67819853,67823627,67825577,67835196,67865139,67885988,67887204,68291980,68411527,68468250,68683096,",0,RAD51B,cmpl,cmpl,"-1,0,0,0,0,2,2,0,1,0,1," +17060,17060,17,XM_024449761.2,chr14,+,71867334,72630029,71964791,72563027,22,"71867334,71869925,71908164,71964771,72352094,72454527,72458270,72459631,72465757,72470006,72472871,72474624,72476741,72478267,72495151,72510153,72518350,72536185,72540040,72541469,72562416,72562714,","71867623,71869984,71908434,71964875,72352194,72454578,72458377,72459683,72465822,72470083,72472953,72474699,72476840,72478329,72495262,72510279,72518537,72536275,72540094,72541620,72562585,72566530,",0,RGS6,cmpl,cmpl,"-1,-1,-1,0,0,1,1,0,1,0,2,0,0,0,2,2,2,0,0,0,1,2," 
+13605,13605,18,XM_024449753.2,chr14,+,78170372,79868291,78243093,79861964,19,"78170372,78242390,78278644,78297830,78645119,78651164,78709216,78714755,78803619,78957241,78966024,78967207,78968172,78988021,79467220,79663777,79697629,79805111,79861662,","78170674,78243802,78278662,78297860,78645421,78651326,78709655,78715139,78803823,78957361,78966406,78967398,78968346,78988141,79467402,79663949,79697937,79805190,79868291,",0,NRXN3,cmpl,cmpl,"-1,0,1,1,1,0,0,1,1,1,1,2,1,1,1,0,1,0,1," +6709,6709,12,NM_033223.5,chr15,+,26971180,27541984,26971535,27532881,10,"26971180,26977001,27026753,27326808,27328805,27480649,27519971,27527432,27527932,27532599,","26971588,26977150,27026821,27327029,27328888,27480787,27520124,27527629,27527992,27541984,",0,GABRG3,cmpl,cmpl,"0,2,1,0,2,1,1,1,0,0," +5592,5592,12,NM_001387215.1,chr15,-,29117711,29675409,29118994,29252466,12,"29117711,29123344,29124686,29126266,29128788,29136365,29137033,29151733,29196429,29252367,29381774,29675098,","29119018,29123655,29124761,29126491,29128864,29136501,29137194,29151841,29196564,29252478,29381831,29675409,",0,ENTREP2,cmpl,cmpl,"0,1,1,1,0,2,0,0,0,0,-1,-1," 
+17644,17644,1,XM_017022476.2,chr15,+,33310966,33866102,33311045,33865226,102,"33310966,33473418,33530591,33533310,33539349,33540790,33543621,33548129,33550159,33562836,33566677,33579975,33581507,33584394,33585997,33601418,33603122,33613182,33623806,33628470,33629939,33631209,33632948,33634585,33635613,33636375,33644310,33646350,33647423,33649071,33652717,33659719,33660196,33662152,33663536,33669353,33670418,33696217,33697881,33699703,33700976,33706918,33722714,33724064,33726385,33728856,33731473,33736234,33738449,33739831,33742365,33746067,33748113,33748467,33749978,33750162,33755064,33756305,33757474,33768657,33769111,33771919,33773533,33780210,33785661,33788217,33800769,33801868,33810478,33810977,33812862,33813466,33815833,33818577,33819755,33820755,33821269,33821522,33822995,33825602,33826251,33826671,33827198,33830962,33834967,33836905,33837630,33840824,33841863,33843487,33844861,33848290,33853044,33853554,33854388,33854765,33857779,33859574,33860594,33861077,33864137,33865130,","33311096,33473538,33530666,33533389,33539462,33540890,33543715,33548204,33550316,33563010,33566799,33580144,33581643,33584490,33586116,33601552,33603364,33613375,33624023,33628575,33630043,33631293,33633108,33634733,33635819,33636550,33644519,33646526,33647460,33649235,33652883,33659806,33660423,33662948,33663737,33669456,33670556,33696491,33697996,33699833,33701080,33707054,33722895,33724176,33726506,33729026,33731694,33736325,33738590,33739995,33742444,33746157,33748260,33748530,33750054,33750286,33755180,33756373,33757596,33768707,33769172,33772158,33773615,33780341,33785982,33788458,33800857,33801961,33810649,33811037,33812994,33813579,33815933,33818684,33819807,33820812,33821369,33821602,33823072,33825676,33826269,33826752,33827287,33831091,33835072,33836987,33838958,33840883,33842035,33843574,33845062,33848421,33853087,33853682,33854449,33854912,33857914,33859731,33860659,33861178,33864189,33866102,",0,RYR3,cmpl,cmpl,"0,0,0,0,1,0,1,2,2,0,0,2,0,1,1,0,2,1,2,0,0,2,2,0,1,0,1,0,2,0,2,
0,0,2,0,0,1,1,2,0,1,0,1,2,0,1,0,2,0,0,2,0,0,0,0,1,2,1,0,2,1,2,1,2,1,1,2,0,0,0,0,0,2,0,2,0,0,1,0,2,1,1,1,0,0,0,1,0,2,0,0,0,2,0,2,0,0,0,1,0,2,0," +17982,17982,14,NM_001198999.2,chr15,+,47184088,47774228,47759798,47771785,20,"47184088,47412392,47470473,47600864,47759744,47760303,47760977,47761157,47761329,47761660,47762199,47763015,47763849,47764173,47764637,47764882,47765868,47766104,47766576,47770496,","47184418,47412472,47470545,47600896,47759907,47760415,47761038,47761220,47761431,47761751,47762319,47763104,47764067,47764305,47764793,47765056,47766009,47766182,47766677,47774221,",0,SEMA6D,cmpl,cmpl,"-1,-1,-1,-1,0,1,2,0,0,0,1,1,0,2,2,2,2,2,2,1," +21794,21794,15,XM_017022222.2,chr15,+,53837601,54633440,54012903,54627113,35,"53837601,53876877,53878173,54012647,54143017,54143619,54235029,54236429,54237618,54250224,54264167,54265354,54293894,54297810,54300209,54321938,54332042,54333766,54338360,54393047,54414981,54494607,54500078,54500834,54507116,54511752,54525548,54532916,54546721,54549634,54555431,54567799,54622326,54623794,54626827,","53837659,53877058,53878298,54015886,54143040,54143684,54235108,54236435,54237690,54250444,54264395,54265496,54294064,54297926,54300373,54322095,54332111,54333856,54338489,54393181,54415067,54494734,54500175,54500978,54507194,54511830,54525637,54533066,54546845,54549691,54555512,54567947,54622419,54623954,54633440,",0,UNC13C,cmpl,cmpl,"-1,-1,-1,0,1,0,2,0,0,0,1,1,2,1,0,2,0,0,0,0,2,1,2,0,0,0,0,2,2,0,0,0,1,1,2," +17333,17333,16,XM_047432928.1,chr15,-,60488283,61229302,60497454,60514628,11,"60488283,60499891,60500958,60502759,60503534,60505507,60511225,60514615,60531765,60677150,61229052,","60497619,60500004,60501069,60502867,60503667,60505629,60511621,60514757,60531851,60678686,61229302,",0,RORA,cmpl,cmpl,"0,1,1,1,0,1,1,0,-1,-1,-1," 
+20382,20382,17,NM_001394532.1,chr15,+,71096893,71783383,71141527,71777374,18,"71096893,71141448,71154862,71215034,71242648,71256612,71411686,71660529,71728548,71731120,71737731,71745105,71746837,71748420,71757901,71765019,71771063,71777231,","71097006,71141556,71154932,71215399,71243096,71256715,71411823,71660734,71728724,71731217,71738007,71745235,71747042,71748594,71758075,71765199,71771208,71783383,",0,THSD4,cmpl,cmpl,"-1,0,2,0,2,0,1,0,1,0,1,1,2,0,0,0,0,1," +17780,17780,18,NM_001353010.2,chr15,-,76347903,76905340,76348632,76804624,33,"76347903,76351236,76353948,76376161,76381377,76404523,76434077,76471211,76504858,76574157,76621763,76665652,76701757,76702849,76705902,76728594,76733228,76753807,76764960,76765336,76765562,76766917,76771741,76774854,76795279,76800247,76804532,76841733,76851167,76857808,76862415,76883811,76905192,","76348736,76351288,76354140,76376311,76381615,76404679,76434310,76471335,76504974,76574284,76621829,76665789,76701865,76703002,76705984,76728737,76733384,76753948,76765072,76765454,76765638,76767088,76771954,76775117,76795440,76800364,76804633,76841931,76851222,76857879,76862533,76883876,76905340,",0,SCAPER,cmpl,cmpl,"1,0,0,0,2,2,0,2,0,2,2,0,0,0,2,0,0,0,2,1,0,0,0,1,2,2,0,-1,-1,-1,-1,-1,-1," +465,465,155,XM_011521227.4,chr15,+,86079619,87031476,86079972,87028840,23,"86079619,86142003,86143698,86154429,86158932,86224913,86247670,86256852,86257963,86262777,86264257,86266373,86266989,86269918,86271618,86279638,86295254,86397365,86522809,86546001,86554360,86674272,87028824,","86080023,86142067,86143845,86154561,86159026,86224951,86247879,86257018,86258031,86262894,86264838,86266457,86267076,86270067,86271706,86279783,86295408,86397546,86522939,86546133,86554537,86674436,87031476,",0,AGBL1,cmpl,cmpl,"0,0,1,1,1,2,1,0,1,0,0,2,2,2,1,2,0,1,2,0,0,0,2," 
+5593,5593,74,NM_001387215.1,chr15_KI270905v1_alt,-,1392758,1959533,1394045,1528000,12,"1392758,1397973,1399315,1400895,1403417,1411936,1412604,1427301,1471992,1527901,1657276,1959222,","1394069,1398284,1399390,1401120,1403493,1412072,1412765,1427409,1472127,1528012,1657333,1959533,",0,ENTREP2,cmpl,cmpl,"0,1,1,1,0,2,0,0,0,0,-1,-1," +5594,5594,74,NM_001387215.1,chr15_KN538374v1_fix,-,1280274,1847049,1281561,1415516,12,"1280274,1285489,1286831,1288411,1290933,1299452,1300120,1314817,1359508,1415417,1544792,1846738,","1281585,1285800,1286906,1288636,1291009,1299588,1300281,1314925,1359643,1415528,1544849,1847049,",0,ENTREP2,cmpl,cmpl,"0,1,1,1,0,2,0,0,0,0,-1,-1," +16812,16812,9,NM_001415888.1,chr16,+,5239720,7713340,5239766,7710745,18,"5239720,5467215,5598901,5867302,6316994,6654602,7052056,7518146,7579776,7587246,7595548,7597370,7607284,7653814,7664928,7676773,7709055,7710622,","5240105,5467254,5598961,5867335,6317057,6654650,7052098,7518389,7579920,7587300,7595641,7597431,7607338,7653947,7664968,7676838,7709131,7713340,",0,RBFOX1,cmpl,cmpl,"0,0,0,0,0,0,0,0,0,0,0,0,1,1,2,0,2,0," +19019,19019,84,NM_032167.5,chr16,+,11976733,12574287,11976806,12568629,21,"11976733,11999296,12002990,12027319,12042896,12046383,12048371,12051846,12061527,12069056,12078832,12126632,12129629,12199600,12277932,12356162,12398445,12403447,12477718,12524701,12568505,","11976813,11999358,12003043,12027444,12043077,12046454,12048620,12052222,12061646,12069132,12078915,12126696,12129758,12199683,12278036,12356279,12398501,12403529,12477859,12524841,12574287,",0,SNX29,cmpl,cmpl,"0,1,0,2,1,2,1,1,2,1,2,1,2,2,1,0,0,2,0,0,2," +18231,18231,85,XM_011522642.3,chr16,+,12901597,13474651,12901941,13474248,5,"12901597,12916687,13203393,13350179,13474209,","12902627,12916815,13203549,13350421,13474651,",0,SHISA9,cmpl,cmpl,"0,2,1,1,0," 
+22592,22592,17,NM_001386735.1,chr16,-,72782884,73891930,72787163,72960145,17,"72782884,72793254,72800026,72811576,72811904,72829778,72889730,72950468,72957426,73093234,73130967,73143751,73257046,73318239,73456002,73680179,73891650,","72788848,72798714,72800129,72811777,72812038,72829859,72889962,72950965,72960194,73093598,73131094,73143831,73257136,73318336,73456258,73680240,73891930,",0,ZFHX3,cmpl,cmpl,"1,1,0,0,1,1,0,1,0,-1,-1,-1,-1,-1,-1,-1,-1," +22333,22333,18,NM_001291997.2,chr16,+,78099653,79212667,78115084,79211796,8,"78099653,78109777,78114975,78164182,78386859,78424869,78432487,79211607,","78099885,78109835,78115154,78164289,78386948,78425055,78432752,79212667,",0,WWOX,cmpl,cmpl,"-1,-1,0,1,0,2,2,0," +2945,2945,18,NM_001220490.2,chr16,+,82626968,83800640,83344987,83795030,13,"82626968,82858361,83125384,83217344,83344861,83486476,83602453,83670789,83678207,83748107,83779967,83783253,83795022,","82627137,82858473,83125501,83217497,83345006,83486655,83602594,83670972,83678461,83748250,83780201,83783472,83800640,",0,CDH13,cmpl,cmpl,"-1,-1,-1,-1,0,1,0,0,0,2,1,1,1," +1260,1260,1,NM_001094.5,chr17,-,33013086,34156768,33013964,34156532,10,"33013086,33015970,33017604,33021218,33023860,33025925,33028241,33088862,33111916,34155977,","33014066,33016039,33017684,33021310,33024014,33025982,33028392,33088990,33112067,34156768,",0,ASIC2,cmpl,cmpl,"0,0,1,2,1,1,0,1,0,0," +11247,11247,117,XM_024450773.2,chr17,+,46372791,47049128,46512712,47049061,11,"46512467,46517361,46520211,46523083,46523803,46540175,46540806,46546254,46548311,46550414,47049055,","46515321,46517433,46520283,46523155,46523884,46540247,46540881,46546373,46549843,46550519,47049128,",0,LRRC37A2,cmpl,cmpl,"0,2,2,2,2,2,2,2,1,0,0," 
+2260,2260,122,NM_001082534.2,chr17,-,51630312,52160023,51631583,52157786,10,"51630319,51633475,51635854,51649181,51653640,51747632,51930989,52072318,52157725,52159923,","51631606,51633650,51636009,51649254,51653736,51747818,51931132,52072393,52158369,52160023,",0,CA10,cmpl,cmpl,"1,0,1,0,0,0,1,1,0,-1," +1627,1627,16,NM_017679.5,chr17,+,60677850,61392831,60679457,61392125,24,"60677850,60679452,60683981,60689685,60709218,60747197,60808003,60868575,60874661,60889694,60902619,60910531,60924406,60947218,60989970,61015750,61034665,61037888,61040791,61074919,61078332,61084466,61368326,61391976,","60677914,60679540,60684036,60689761,60709325,60747279,60808076,60868683,60874738,60889771,60902703,60910702,60924500,60947352,60990235,61015901,61034790,61038054,61040892,61075020,61078529,61084564,61368494,61392831,",0,BCAS3,cmpl,cmpl,"-1,0,2,0,1,0,1,2,2,1,0,0,0,1,0,1,2,1,2,1,0,2,1,1," +3122,3122,16,NM_001353129.2,chr17,-,65635536,66192133,65635970,66183299,27,"65635536,65637123,65640963,65689128,65743067,65750661,65851803,65902151,65927581,65961462,66005689,66027500,66028312,66029122,66029868,66053735,66062962,66066777,66069914,66096250,66096584,66129745,66132669,66175043,66176829,66183193,66191996,","65635974,65637188,65641065,65689218,65743217,65750724,65852034,65902334,65927689,65961598,66005769,66027560,66028405,66029252,66030023,66053879,66063081,66066877,66070001,66096331,66096632,66129823,66132763,66175216,66177020,66183307,66192133,",0,CEP112,cmpl,cmpl,"2,0,0,0,0,0,0,0,0,2,0,0,0,2,0,0,1,0,0,0,0,0,2,0,1,0,-1," +16815,16815,148,XM_024450593.2,chr17,-,79089344,79665571,79092596,79115715,17,"79091946,79094450,79095512,79096652,79097291,79097691,79101583,79103161,79104072,79106650,79115493,79235765,79307723,79482453,79644136,79646195,79665370,","79092641,79094529,79095574,79096833,79097424,79097745,79101644,79103254,79104126,79106788,79115748,79235805,79307824,79482598,79644242,79646334,79665571,",0,RBFOX3,cmpl,cmpl,"0,2,0,2,1,1,0,0,0,0,0,-1,-1,-1,-1,-1,-1," 
+4865,4865,9,XM_024451288.2,chr18,-,3496031,4455307,3499184,3880068,15,"3496031,3502492,3508569,3534193,3567489,3581874,3656083,3729134,3742334,3814058,3879111,4005115,4106580,4151179,4455005,","3499394,3502645,3508661,3534615,3567581,3582248,3656113,3729375,3742512,3814273,3880140,4005201,4107046,4151287,4455307,",0,DLGAP1,cmpl,cmpl,"0,0,1,2,0,1,1,0,2,0,0,-1,-1,-1,-1," +16475,16475,1,NM_002845.4,chr18,+,7567315,8406856,7567818,8406162,31,"7567315,7774148,7888105,7906504,7926567,7949180,7955120,8069685,8076454,8085670,8088748,8113485,8114790,8143646,8244057,8247844,8253226,8296367,8314780,8319177,8343422,8370889,8376045,8376461,8378264,8379166,8380295,8384560,8387071,8394475,8406108,","7567891,7774271,7888377,7906583,7926683,7949355,7955414,8069994,8076564,8085872,8088851,8113759,8114827,8143779,8244209,8247919,8253414,8296455,8314857,8319214,8343520,8371006,8376200,8376597,8378414,8379340,8380427,8384686,8387235,8394611,8406856,",0,PTPRM,cmpl,cmpl,"0,1,1,0,1,0,1,1,1,0,1,2,0,1,2,1,1,0,1,0,1,0,0,2,0,0,0,0,0,2,0," +4493,4493,15,XM_047437311.1,chr18,+,52340196,53535899,52340787,53530653,29,"52340196,52752053,52906043,52923706,52925233,53063304,53066045,53157355,53178961,53205215,53207678,53215547,53305577,53322046,53339712,53386042,53391714,53397307,53402785,53410451,53416123,53435143,53450499,53459231,53467893,53486796,53499297,53526616,53530563,","52340878,52752374,52906328,52923857,52925370,53063459,53066166,53157512,53179116,53205364,53207817,53215597,53305719,53322157,53339907,53386138,53391887,53397446,53402893,53410646,53416156,53435209,53450662,53459458,53468010,53486952,53499510,53526759,53535899,",0,DCC,cmpl,cmpl,"0,1,1,1,2,1,0,1,2,1,0,1,0,1,1,1,1,0,1,1,1,1,1,2,1,1,1,1,0," 
+12786,12786,9,XM_017003604.2,chr2,-,1789112,2331275,1791866,1979777,27,"1789112,1792320,1801689,1809075,1839148,1840759,1851640,1886538,1887058,1887487,1889240,1892036,1903079,1910239,1912019,1917204,1922285,1942981,1979164,1979520,1979722,1997190,2053977,2172871,2284403,2326035,2330966,","1792007,1792464,1801799,1809167,1839370,1840843,1851703,1886607,1887121,1887609,1889477,1892287,1903294,1910347,1912110,1917339,1923263,1943334,1979227,1979554,1979777,1997347,2054123,2172988,2284503,2326111,2331275,",0,MYT1L,cmpl,cmpl,"0,0,1,2,2,2,2,2,2,0,0,1,2,2,1,1,1,2,2,1,0,-1,-1,-1,-1,-1,-1," +12890,12890,87,XM_017004317.2,chr2,-,14998066,15561334,15137403,15561304,53,"14998066,15137355,15178987,15186741,15190263,15218772,15232421,15234544,15238467,15275483,15276850,15287072,15292536,15308215,15309170,15327749,15328198,15330597,15351991,15353552,15356302,15366579,15374607,15379601,15383214,15394226,15396412,15402167,15415545,15417526,15424314,15427710,15461200,15461686,15467328,15467663,15468381,15473221,15474066,15475686,15478225,15488893,15504144,15511211,15534542,15536417,15539222,15551492,15553425,15554060,15556782,15558579,15561187,","14998224,15137445,15179116,15186880,15190403,15218968,15232511,15234747,15238686,15275818,15277101,15287183,15292766,15308353,15309247,15327870,15328312,15330765,15352081,15353710,15356416,15366693,15374720,15379831,15383317,15394349,15396475,15402301,15415719,15417712,15424468,15427794,15461337,15461791,15467407,15467804,15468533,15473347,15474324,15475880,15478289,15489022,15504213,15511350,15534641,15536551,15539356,15551536,15553473,15554138,15556819,15558634,15561334,",0,NBAS,cmpl,cmpl,"-1,0,0,2,0,2,2,0,0,1,2,2,0,0,1,0,0,0,0,1,1,1,2,0,2,2,2,0,0,0,2,2,0,0,2,2,0,0,0,1,0,0,0,2,2,0,1,2,2,2,1,0,0," 
+643,643,12,NM_004304.5,chr2,-,29192773,29921586,29193223,29920659,29,"29192773,29196769,29197541,29207170,29209785,29213983,29220705,29222343,29222516,29223341,29225460,29226921,29227573,29228883,29232303,29233564,29239679,29251104,29275098,29275401,29296887,29318303,29320750,29328349,29383731,29531914,29694849,29717577,29919992,","29193922,29196860,29197676,29207272,29209878,29214081,29220835,29222408,29222607,29223528,29225565,29227074,29227672,29229066,29232448,29233696,29239830,29251267,29275227,29275496,29297057,29318404,29320882,29328481,29383859,29532116,29695014,29717697,29921586,",0,ALK,cmpl,cmpl,"0,2,2,2,2,0,2,0,2,1,1,1,1,1,0,0,2,1,1,2,0,1,1,1,2,1,1,1,0," +16059,16059,14,XM_005264428.2,chr2,+,45651278,46187990,45652100,46184881,16,"45651278,45651903,45842999,45976428,45978975,45980295,45984550,46001403,46004541,46007461,46010343,46086207,46145092,46151040,46159605,46184734,","45651703,45652448,45843063,45976588,45979010,45980381,45984680,46001546,46004638,46007661,46010517,46086362,46145231,46151229,46159752,46187990,",0,PRKCE,cmpl,cmpl,"-1,0,0,1,2,1,0,1,0,1,0,0,2,0,0,0," +13603,13603,1,NM_001135659.3,chr2,-,49918502,51032132,49921943,51028273,24,"49918502,49943703,50053270,50054954,50091322,50236788,50465441,50472297,50495904,50497332,50506494,50528624,50531226,50538252,50552586,50620021,50621204,50623313,50921868,50922657,50925937,51026370,51027501,51031980,","49922251,49943791,50053590,50055044,50091494,50236970,50465561,50472471,50496095,50497714,50506617,50528651,50531430,50538636,50553025,50620183,50621249,50623615,50921880,50922687,50925955,51026469,51029194,51032132,",0,NRXN1,cmpl,cmpl,"1,0,1,1,0,1,1,1,2,1,1,1,1,1,0,0,0,1,1,1,1,1,0,-1," 
+22196,22196,133,XM_047444626.1,chr2,-,63119558,63840826,63121820,63571377,21,"63119558,63152913,63174669,63259306,63313247,63378385,63381905,63404047,63433744,63437420,63439756,63484603,63484916,63486541,63487446,63492855,63571374,63650658,63673866,63813621,63840042,","63122056,63152945,63174832,63259409,63313311,63378509,63382094,63404657,63433936,63437554,63439871,63484663,63484987,63486586,63487494,63492940,63571626,63650838,63678253,63813707,63840826,",0,WDPCP,cmpl,cmpl,"1,2,1,0,2,1,1,0,0,1,0,0,1,1,1,0,0,-1,-1,-1,-1," +5817,5817,17,NM_001321731.2,chr2,-,72175983,72826033,72182889,72825910,23,"72175983,72182852,72184074,72334946,72379728,72465159,72480615,72492317,72495429,72496453,72498453,72499900,72513131,72514633,72515042,72559452,72575491,72718102,72731006,72731154,72733070,72741303,72825797,","72179461,72182905,72184187,72335020,72379870,72465339,72480750,72492429,72495539,72496559,72498551,72499972,72513252,72514680,72515126,72559521,72575668,72718307,72731052,72731245,72733118,72741469,72826033,",0,EXOC6B,cmpl,cmpl,"-1,2,0,1,0,0,0,2,0,2,0,0,2,0,0,0,0,2,1,0,0,2,0," +11314,11314,146,NM_001134745.3,chr2,-,76747684,77522376,76748694,77521671,4,"76747684,77518317,77521667,77522108,","76748916,77519864,77521818,77522376,",0,LRRTM4,cmpl,cmpl,"0,1,0,-1," +4198,4198,18,NM_001399737.1,chr2,+,79185376,80648780,79651556,80647872,22,"79185376,79197958,79312708,79373830,79505053,79651551,79744386,79858012,79869815,79874075,79909593,80393210,80419448,80544981,80545906,80555692,80574162,80581705,80589303,80604073,80608183,80647584,","79185431,79198076,79312796,79374013,79505182,79651658,79744582,79858179,79869935,79874342,79909797,80393291,80419601,80545074,80546063,80555893,80574314,80581819,80589485,80604179,80608318,80648780,",0,CTNNA2,cmpl,cmpl,"-1,-1,-1,-1,-1,0,0,1,0,0,0,0,0,0,0,1,1,0,0,2,0,0," 
+447,447,20,XM_047444279.1,chr2,-,99545418,100142590,99551473,100110104,28,"99545418,99554310,99554682,99558874,99560364,99565486,99568851,99578326,99582797,99587153,99593194,99601434,99649625,99672537,99727076,99744103,99752220,99837476,100006631,100007147,100008811,100011496,100104401,100105503,100110053,100112385,100129223,100142483,","99551595,99554534,99554732,99558968,99560436,99565623,99568915,99578451,99582999,99587278,99594289,99601621,99649666,99672589,99727128,99744140,99752301,99837524,100007017,100007460,100008932,100011571,100104518,100105583,100110333,100112524,100129306,100142590,",0,AFF3,cmpl,cmpl,"1,2,0,2,2,0,2,0,2,0,0,2,0,2,1,0,0,0,1,0,2,2,2,0,0,-1,-1,-1," +16726,16726,2,XM_047445367.1,chr2,+,108719481,109842301,108719606,109841457,25,"108719481,108729131,108730773,108731321,108735531,108736103,108740488,108746710,108748919,108751263,108751527,108751870,108752997,108753425,108753824,108754904,108755175,108758412,108762100,108763236,108769259,108771700,108772488,108772867,109841403,","108719678,108729199,108730885,108731474,108735762,108736249,108740681,108746798,108749129,108751445,108751703,108751994,108753159,108753563,108753971,108755084,108755259,108758548,108762195,108768388,108769337,108771871,108772581,108773046,109842301,",0,RANBP2,cmpl,cmpl,"0,0,2,0,0,0,2,0,1,1,0,2,0,0,0,0,0,0,1,0,1,1,1,1,0," 
+5065,5065,22,NM_001321913.3,chr2,+,114442640,115845780,115739803,115842345,25,"114442640,115343816,115499509,115525897,115689686,115689839,115727815,115739738,115746085,115753173,115762571,115768296,115777207,115777786,115780873,115782351,115791080,115791286,115814792,115815674,115836156,115836506,115836673,115840749,115842210,","114442838,115343912,115499604,115525972,115689739,115689921,115727936,115739893,115746183,115753297,115762610,115768404,115777299,115777834,115780995,115782399,115791179,115791356,115814987,115815729,115836256,115836565,115836746,115840823,115845780,",0,DPP10,cmpl,cmpl,"-1,-1,-1,-1,-1,-1,-1,0,0,2,0,0,0,2,2,1,1,1,2,2,0,1,0,1,0," +3714,3714,23,NM_130773.4,chr2,+,124025286,124921219,124025650,124914288,24,"124025286,124221704,124242199,124417442,124434483,124446752,124474738,124504291,124524302,124527284,124563216,124609800,124647757,124747228,124763671,124763976,124772798,124789901,124798095,124865305,124869674,124902881,124911466,124914091,","124025732,124221809,124242393,124417590,124434687,124446937,124474879,124504556,124524452,124527456,124563323,124609920,124647958,124747385,124763799,124764147,124773017,124790141,124798320,124865436,124869762,124903100,124911538,124921219,",0,CNTNAP5,cmpl,cmpl,"0,1,1,0,1,1,0,0,1,1,2,1,1,1,2,1,1,1,1,1,0,1,1,1," +12943,12943,24,XM_011511100.4,chr2,-,132671787,133674836,132673023,133517526,19,"132671787,132728815,132731736,132773815,132781051,132781939,132790022,132796627,132860491,132868935,132878847,132963719,132994151,133129977,133213715,133303036,133517457,133559049,133674673,","132673305,132728952,132732051,132773894,132781229,132785718,132790205,132796729,132860611,132868974,132878916,132963869,132994239,133130111,133213779,133303110,133517587,133559117,133674836,",0,NCKAP5,cmpl,cmpl,"0,1,1,0,2,0,0,0,0,0,0,0,2,0,2,0,0,-1,-1," 
+20384,20384,25,NM_001316349.2,chr2,+,136765544,137677718,136882178,137676605,28,"136765544,136882143,137056419,137094872,137115123,137160212,137170740,137231043,137232898,137242456,137272532,137275922,137405612,137411608,137450844,137563220,137572405,137616174,137618391,137620608,137642487,137655500,137656795,137657064,137659663,137663382,137667773,137676523,","136765687,136882317,137057230,137095121,137115293,137160368,137170938,137231235,137233133,137242572,137272662,137276026,137405807,137411872,137451023,137563354,137572556,137616316,137618507,137620726,137642633,137655660,137656969,137657160,137659746,137663575,137667861,137677718,",0,THSD7B,cmpl,cmpl,"-1,0,1,2,2,1,1,1,1,2,1,2,1,1,1,0,2,0,1,0,1,0,1,1,1,0,1,2," +11210,11210,25,NM_018557.3,chr2,-,140231422,142131016,140233185,142130729,91,"140231422,140234785,140238151,140239441,140247085,140270241,140274423,140297807,140314934,140321962,140323892,140325761,140334452,140335614,140350796,140352952,140356341,140357978,140358820,140364660,140370709,140371178,140373007,140378179,140385892,140442503,140444329,140444562,140450567,140456454,140457462,140475137,140485342,140487616,140492607,140495564,140501686,140502962,140506795,140509927,140514652,140516888,140525843,140526236,140534020,140536580,140540972,140541778,140598630,140601449,140700249,140701720,140702140,140702426,140715972,140716681,140769212,140770880,140776097,140813656,140839990,140840917,140850101,140851651,140867589,140868098,140883816,140886137,140902919,140907876,140922964,140950234,140951859,140982159,140989531,140993994,141005334,141013555,141015695,141019921,141048985,141055115,141058882,141062050,141188420,141229182,141247225,141254521,141480395,141810278,142130647,","140233326,140234884,140238296,140239532,140247162,140270346,140274598,140297969,140315099,140322088,140324066,140325878,140334559,140335838,140351038,140353072,140356476,140358116,140358946,140364783,140370842,140371285,140373137,140378286,140386009,140442623,140444449,140444679,14
0450661,140456603,140457651,140475337,140485524,140487739,140492693,140495748,140501874,140503103,140506918,140510056,140514772,140517011,140525993,140526350,140534140,140536709,140541098,140541971,140598835,140601639,140700621,140701845,140702292,140702553,140716102,140716816,140769344,140771006,140776238,140813806,140840085,140841092,140850329,140851783,140867834,140868263,140884021,140886335,140903165,140908077,140923147,140950402,140951940,140982276,140989657,140994135,141005457,141013745,141015915,141020102,141049222,141055259,141059054,141062273,141188583,141229440,141247354,141254641,141480533,141810401,142131016,",0,LRP1B,cmpl,cmpl,"0,0,2,1,2,2,1,1,1,1,1,1,2,0,1,1,1,1,1,1,0,1,0,1,1,1,1,1,0,1,1,2,0,0,1,0,1,1,1,1,1,1,1,1,1,1,1,0,2,1,1,2,0,2,1,1,1,1,1,1,2,1,1,1,2,2,1,1,1,1,1,1,1,1,1,1,1,0,2,1,1,1,0,2,1,1,1,1,1,1,0," +1039,1039,26,XM_017004500.3,chr2,+,143129418,143768352,143129430,143768172,13,"143129418,143202133,143216383,143228580,143250510,143435600,143436912,143487372,143519265,143556407,143624132,143703418,143767988,","143129466,143202202,143216445,143228668,143250600,143435699,143437042,143487495,143519364,143556485,143624267,143703524,143768352,",0,ARHGAP15,cmpl,cmpl,"0,0,0,2,0,0,0,1,1,1,1,1,2," +6752,6752,27,NM_001422883.1,chr2,+,153068292,154456574,154242762,154450551,17,"153068292,153078339,153311474,153332573,153333236,153732768,153900935,153944393,154242029,154242697,154245811,154259020,154301408,154395990,154408983,154438591,154450410,","153068361,153078454,153311933,153332640,153333327,153732830,153901007,153944639,154242196,154242905,154245982,154259138,154301589,154396130,154409082,154438726,154456574,",0,GALNT13,cmpl,cmpl,"-1,-1,-1,-1,-1,-1,-1,-1,-1,0,2,2,0,1,0,0,0," +1512,1512,3,NM_020981.4,chr2,+,167293000,167874045,167869039,167870020,5,"167293000,167490176,167646908,167818671,167868810,","167293334,167490277,167646966,167818793,167874045,",0,B3GALT1,cmpl,cmpl,"-1,-1,-1,-1,0," 
+14949,14949,30,XM_047444688.1,chr2,-,182140040,182716797,182147078,182716764,16,"182140040,182147035,182185891,182186467,182188978,182201438,182201687,182205939,182223863,182230005,182231014,182234431,182240109,182264300,182522275,182716690,","182143038,182147152,182186079,182186588,182189060,182201559,182201789,182206065,182223964,182230146,182231131,182234498,182240292,182264414,182522386,182716797,",0,PDE1A,cmpl,cmpl,"-1,1,2,1,0,2,2,2,0,0,0,2,2,2,2,0," +14703,14703,33,NM_152526.6,chr2,+,204545474,205620162,204545999,205615813,22,"204545474,204686180,204965151,205047580,205104425,205113490,205118920,205121590,205124326,205125608,205172210,205176444,205185763,205193204,205245777,205300529,205301463,205401012,205440369,205499895,205553323,205615455,","204546119,204686282,204965323,205047690,205104514,205113577,205119046,205121949,205124466,205125737,205172381,205176577,205185863,205193320,205245822,205300736,205301701,205401123,205440672,205500031,205553403,205620162,",0,PARD3B,cmpl,cmpl,"0,0,0,1,0,2,2,2,1,0,0,0,1,2,1,1,1,2,2,2,0,2," +5677,5677,34,XM_017003577.3,chr2,-,211375716,212538802,211383614,212538530,30,"211375716,211386852,211387944,211420440,211422006,211424154,211428407,211430944,211561902,211619176,211623921,211630461,211657753,211657949,211665322,211673163,211679051,211701966,211704103,211705317,211712049,211713534,211722392,211725075,211750638,211781457,211788024,211947429,212124751,212538448,","211384060,211387150,211387992,211420611,211422104,211424301,211428483,211431100,211562088,211619275,211624044,211630594,211657828,211657994,211665477,211673257,211679184,211702166,211704194,211705391,211712176,211713648,211722534,211725194,211750704,211781535,211788159,211947616,212124903,212538802,",0,ERBB4,cmpl,cmpl,"1,0,0,0,1,1,0,0,0,0,0,2,2,2,0,2,1,2,1,2,1,1,0,1,1,1,1,0,1,0," 
+19118,19118,34,XM_011511820.3,chr2,+,213284463,214410501,213284483,214410315,15,"213284463,213296063,213297261,213310058,213317218,213340162,213350527,213364075,213375009,213489962,213862484,214013950,214108195,214149139,214410139,","213284619,213296110,213297357,213310177,213317356,213340270,213350645,213364145,213375119,213490090,213862628,214014077,214108261,214149266,214410501,",0,SPAG16,cmpl,cmpl,"0,1,0,0,2,2,2,0,1,0,2,2,0,0,1," +458,458,37,NM_014914.5,chr2,+,235494042,236131793,235494686,236124122,17,"235494042,235709178,235717556,235740962,235744697,235750353,235797758,235799366,235807238,235883344,235908737,235968461,236036560,236040750,236049058,236120191,236123918,","235494849,235709237,235717644,235741048,235744839,235750488,235797886,235799522,235807331,235883449,235908906,235968623,236036715,236040841,236049281,236120447,236131793,",0,AGAP1,cmpl,cmpl,"0,1,0,1,0,1,1,0,0,0,0,1,1,0,1,2,0," +15393,15393,1,NM_015192.4,chr20,+,8132265,8884900,8132651,8881849,32,"8132265,8150293,8371381,8628293,8646101,8647899,8649373,8657183,8658537,8684931,8697625,8708669,8716263,8717670,8722353,8724655,8727308,8729049,8733237,8737027,8739260,8740343,8741463,8757045,8760406,8765138,8774538,8788448,8788632,8789517,8790174,8881621,","8132750,8150371,8371450,8628431,8646181,8647953,8649449,8657284,8658704,8685078,8697783,8708752,8716348,8717848,8722421,8724752,8727393,8729174,8733392,8737192,8739360,8740448,8741573,8757178,8760460,8765358,8774719,8788525,8788722,8789575,8790261,8884900,",0,PLCB1,cmpl,cmpl,"0,0,0,0,0,2,2,0,2,1,1,0,2,0,1,0,1,2,1,0,0,1,1,0,1,1,2,0,2,2,0,0," 
+11512,11512,10,NM_080676.6,chr20,+,13995515,16053197,13995763,16049876,17,"13995515,14002287,14085620,14493478,14684842,15229939,15431404,15499773,15862744,15885763,15933275,15937475,15967552,15986726,15987065,16041200,16049829,","13995809,14002404,14085728,14493508,14684959,15230061,15431435,15499847,15862826,15885811,15933338,15937544,15967630,15986801,15987158,16041278,16053197,",0,MACROD2,cmpl,cmpl,"0,1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1," +16482,16482,14,XM_047439848.1,chr20,-,42031889,43189906,42043543,43189733,32,"42031889,42043521,42081881,42084681,42085727,42098420,42102123,42104568,42106785,42110332,42115198,42118402,42119934,42128753,42141914,42161351,42199239,42236228,42248686,42282488,42315722,42350627,42352083,42448219,42472265,42677865,42756461,42771434,42780217,42791194,42885806,43189645,","42032336,42043639,42082017,42084845,42085853,42098552,42102297,42104718,42106921,42110487,42115315,42118500,42119971,42128830,42142002,42161539,42199388,42236258,42248822,42282525,42315996,42350730,42352285,42448329,42472562,42678159,42756636,42771550,42780299,42791466,42885932,43189906,",0,PTPRT,cmpl,cmpl,"-1,0,2,0,0,0,0,0,2,0,0,1,0,1,0,1,2,2,1,0,2,1,0,1,1,1,0,1,0,1,1,0," +21370,21370,15,XM_047439879.1,chr20,+,52972357,53495330,52973293,53487145,5,"52972357,53253498,53487143,53490865,53494873,","52973333,53256444,53487332,53490947,53495330,",0,TSHZ2,cmpl,cmpl,"0,1,1,-1,-1," +2958,2958,16,NM_001794.5,chr20,+,61252260,61940617,61252513,61936943,16,"61252260,61254825,61743562,61773002,61844667,61852753,61873727,61894909,61910421,61923450,61924333,61928189,61929608,61932984,61934055,61936736,","61252570,61254937,61743789,61773182,61844823,61852898,61873900,61895047,61910607,61923704,61924476,61928423,61929842,61933124,61934220,61940617,",0,CDH4,cmpl,cmpl,"0,0,1,0,0,0,1,0,0,0,2,1,1,1,0,0," 
+12919,12919,93,NM_001352595.2,chr21,+,20998408,21543329,20998563,21537957,15,"20998408,21292103,21324382,21335504,21338388,21373862,21410273,21418472,21432107,21466605,21468661,21477290,21508850,21534536,21537845,","20998618,21292241,21324500,21335665,21338534,21374013,21410461,21418569,21432281,21466725,21468783,21477471,21509055,21534656,21543329,",0,NCAM2,cmpl,cmpl,"0,1,1,2,1,0,1,0,1,1,1,0,1,2,2," +5114,5114,111,NM_001389.5,chr21,-,40010998,40847158,40013033,40846661,33,"40010998,40042370,40044077,40051957,40055724,40062868,40075036,40078686,40080151,40083907,40085601,40087169,40093720,40124194,40133853,40142557,40144490,40167217,40178926,40187130,40187890,40189041,40276096,40296054,40312080,40338100,40339118,40347669,40353464,40369098,40692809,40708453,40846618,","40013386,40042673,40044275,40052107,40055840,40062899,40075213,40078977,40080340,40084006,40085765,40087287,40093874,40124328,40134009,40142704,40144731,40167288,40179094,40187259,40187987,40189238,40276270,40296174,40312359,40338376,40339415,40347945,40353743,40369245,40692956,40708771,40847158,",0,DSCAM,cmpl,cmpl,"1,1,1,1,2,1,1,1,1,1,2,1,0,1,1,1,0,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,0," +5115,5115,73,NM_001389.5,chr21_MU273391v1_fix,-,150306,986812,152341,986315,33,"150306,181704,183411,191296,195063,202213,214361,218011,219810,223565,225261,226829,233380,263864,273523,282211,284144,306871,318580,326784,327544,328695,415750,435708,451734,477754,478772,487323,493118,508752,832463,848107,986272,","152694,182007,183609,191446,195179,202244,214538,218302,219999,223664,225425,226947,233534,263998,273679,282358,284385,306942,318748,326913,327641,328892,415924,435828,452013,478030,479069,487599,493397,508899,832610,848425,986812,",0,DSCAM,cmpl,cmpl,"1,1,1,1,2,1,1,1,1,1,2,1,0,1,1,1,0,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,0," 
+21455,21455,12,NM_001145418.2,chr22,-,27978013,28679840,27982220,28679723,23,"27978013,27985248,27989877,27990788,27992586,27993286,27996134,27998539,28001373,28014247,28030225,28094079,28096189,28098914,28101170,28105278,28107061,28163091,28296197,28297579,28306495,28629551,28679621,","27983851,27985356,27990007,27990812,27992663,27993518,27996259,27999260,28001553,28014392,28030366,28094245,28096408,28099044,28101280,28105802,28108403,28163599,28296328,28297852,28306643,28629830,28679840,",0,TTC28,cmpl,cmpl,"1,1,0,0,1,0,1,0,0,2,2,1,1,0,1,2,1,0,1,1,0,0,0," +19734,19734,104,XM_017028962.3,chr22,-,32507819,33058381,32513691,33006662,14,"32507819,32518042,32527917,32528873,32533792,32538035,32541570,32596673,32864914,32868965,32931389,32980644,33006351,33057160,","32513824,32518334,32528005,32529008,32533895,32538110,32541713,32596736,32865004,32869125,32931481,32980702,33006824,33058381,",0,SYN3,cmpl,cmpl,"2,1,0,0,2,2,0,0,0,2,0,2,0,-1," +10078,10078,1,XM_024452302.2,chr22,-,33162227,33922824,33166762,33761476,15,"33162227,33283201,33304228,33316084,33337645,33381918,33384191,33432160,33564847,33604434,33626243,33650366,33761370,33856739,33919994,","33166832,33283348,33304507,33316248,33337801,33382044,33384304,33432265,33565019,33604558,33626326,33650668,33761558,33856802,33920476,",0,LARGE1,cmpl,cmpl,"2,2,2,0,0,0,1,1,0,2,0,1,0,-1,-1," +3703,3703,75,XM_006713004.5,chr3,+,2098865,3057959,2571503,3056220,24,"2098865,2339177,2571415,2736214,2745521,2819485,2866751,2883144,2887039,2900684,2902875,2925628,2988344,3026101,3030857,3034631,3037178,3038932,3040036,3042309,3042976,3043591,3053806,3056119,","2098978,2339233,2571558,2736341,2745697,2819581,2866949,2883247,2887224,2900821,2903005,2925779,2988472,3026277,3030975,3034790,3037328,3039003,3040271,3042422,3043163,3043704,3053975,3057959,",0,CNTN4,cmpl,cmpl,"-1,-1,0,1,2,1,1,1,2,1,0,1,2,1,0,1,1,1,0,1,0,1,0,1," 
+7506,7506,9,NM_000844.4,chr3,+,6861114,7741533,6861388,7740406,10,"6861114,7146451,7298683,7306497,7415022,7452606,7461582,7578421,7680048,7740356,","6861907,7146668,7298825,7306652,7415163,7452807,7461722,7579357,7680295,7741533,",0,GRM7,cmpl,cmpl,"0,0,1,2,1,1,1,0,0,1," +20041,20041,89,XM_047449300.1,chr3,-,17157161,17742631,17160962,17508570,26,"17157161,17166766,17167748,17185108,17214206,17238162,17258505,17291894,17307991,17372074,17374470,17374628,17376524,17383912,17403180,17404693,17404871,17406417,17428449,17508473,17562018,17586352,17622647,17623848,17672544,17742480,","17161256,17166928,17167828,17185208,17214370,17238419,17258591,17292001,17308134,17372247,17374540,17374679,17376613,17384015,17403248,17404768,17404961,17406526,17428519,17508605,17562142,17586557,17622765,17623913,17672705,17742631,",0,TBC1D5,cmpl,cmpl,"0,0,1,0,1,2,0,1,2,0,2,2,0,2,0,0,0,2,1,0,-1,-1,-1,-1,-1,-1," +22860,22860,11,XM_017007191.2,chr3,-,21412217,22372763,21421213,22372555,10,"21416334,21423962,21425491,21436969,21441685,21510860,21564573,21664885,22168816,22372449,","21421447,21424064,21425670,21437203,21441742,21511023,21564684,21665028,22169035,22372763,",0,ZNF385D,cmpl,cmpl,"0,0,1,1,1,0,0,1,1,0," +16751,16751,1,NM_001290216.3,chr3,+,24829320,25597932,25174397,25596616,11,"24829320,24858673,25060124,25174118,25461192,25501181,25569757,25580545,25593502,25594519,25596419,","24829403,24858752,25060176,25174575,25461341,25501323,25569918,25580722,25593707,25594678,25597932,",0,RARB,cmpl,cmpl,"-1,-1,-1,0,1,0,1,0,0,1,1," +16860,16860,12,XM_017006181.2,chr3,+,29281070,30010395,29281681,30003862,13,"29281070,29434745,29488440,29587113,29739719,29762909,29868857,29884161,29897378,29936085,29988142,29991081,30003855,","29281756,29434915,29488499,29587205,29739877,29762989,29868964,29884208,29897475,29936196,29988223,29991209,30010395,",0,RBMS3,cmpl,cmpl,"0,0,2,1,0,2,1,0,2,0,0,0,2," 
+21785,21785,1,NM_017886.4,chr3,-,41246598,41962103,41246928,41954759,37,"41246598,41249488,41398078,41455496,41463086,41566024,41615668,41663606,41681507,41681752,41705056,41705253,41715236,41715446,41717727,41754360,41789660,41800131,41819422,41835863,41883873,41895517,41896821,41898431,41900724,41907844,41911316,41911540,41912806,41915976,41918456,41919716,41931843,41935800,41938097,41954621,41962015,","41246992,41249574,41398264,41455595,41463253,41566130,41615717,41663699,41681652,41681804,41705151,41705305,41715293,41715568,41717861,41754488,41789843,41800293,41819506,41835971,41883952,41895564,41897003,41898492,41900829,41907941,41911386,41911659,41912899,41916052,41918540,41919818,41932006,41935940,41938197,41954807,41962103,",0,ULK4,cmpl,cmpl,"2,0,0,0,1,0,2,2,1,0,1,0,0,1,2,0,0,0,0,0,2,0,1,0,0,2,1,2,2,1,1,1,0,1,0,0,-1," +5026,5026,15,XM_047447599.1,chr3,+,50674926,51384198,50675263,51381559,52,"50674926,50778674,50841674,50890025,50933980,51064447,51075355,51089242,51090229,51146548,51159243,51160554,51208773,51214121,51225648,51227282,51227981,51228660,51229511,51236344,51237489,51246725,51260155,51270814,51275078,51280105,51310231,51312003,51312475,51312843,51314979,51330137,51333157,51338358,51338934,51341236,51348851,51350287,51354881,51356088,51356406,51356961,51357757,51357960,51360510,51361858,51362526,51374468,51375747,51379448,51380124,51381049,","50675300,50778758,50841715,50890081,50934077,51064596,51075440,51089284,51090384,51146630,51159304,51160702,51208862,51214247,51225773,51227445,51228088,51228832,51229609,51236428,51237590,51246807,51260326,51271007,51275206,51280204,51310326,51312079,51312576,51312902,51315128,51330223,51333253,51338419,51339028,51341385,51348938,51350392,51355023,51356255,51356493,51357141,51357841,51358077,51360632,51361997,51362674,51374587,51375835,51379571,51380207,51384198,",0,DOCK3,cmpl,cmpl,"0,1,1,0,2,0,2,0,0,2,0,1,2,1,1,0,1,0,1,0,0,2,0,0,1,0,0,2,0,2,1,0,2,2,0,1,0,0,0,1,0,0,0,0,0,2,0,1,0,1,1,0," 
+2304,2304,15,NM_018398.3,chr3,+,54122551,55074557,54122713,55074206,38,"54122551,54123512,54320441,54386714,54503491,54562799,54569794,54569953,54581802,54627786,54642127,54752598,54764217,54816852,54837158,54838567,54846392,54871538,54879017,54879349,54880795,54885280,54885488,54887958,54891354,54896748,54899787,54968449,54969799,54984607,54987682,55004762,55007789,55009387,55018205,55073444,55073776,55074113,","54122835,54123594,54320558,54386774,54503654,54562931,54569855,54570104,54581877,54627876,54642241,54752677,54764351,54816870,54837230,54838648,54846467,54871622,54879089,54879411,54880863,54885326,54885586,54888052,54891450,54896870,54899868,54968511,54969844,54984670,54987753,55004838,55007842,55009443,55018317,55073557,55073859,55074557,",0,CACNA2D3,cmpl,cmpl,"0,2,0,0,0,1,1,2,0,0,0,0,1,0,0,0,0,0,0,0,2,1,2,1,2,2,1,1,0,0,0,2,0,2,1,2,1,0," +5681,5681,15,XM_047447942.1,chr3,-,55508310,56468467,55606909,56435007,18,"55508310,55606779,55699377,55734749,55888388,55950424,55992056,56007180,56010448,56018893,56080816,56139508,56148976,56170128,56173445,56296018,56434350,56468247,","55511276,55606966,55699512,55734918,55888549,55950560,55992250,56007321,56010589,56019031,56080984,56139676,56149132,56170170,56173520,56296435,56435147,56468467,",0,ERC2,cmpl,cmpl,"-1,0,0,2,0,2,0,0,0,0,0,0,0,0,0,0,0,-1," +6362,6362,16,NM_001320900.2,chr3,-,59747276,61251452,59752225,60536962,10,"59747276,59922345,60011370,60014006,60536859,60821918,61042046,61196749,61200616,61251300,","59752321,59922414,60011400,60014152,60536979,60822011,61042099,61196843,61200665,61251452,",0,FHIT,cmpl,cmpl,"0,0,0,1,0,-1,-1,-1,-1,-1," 
+16470,16470,16,NM_001375471.1,chr3,+,61561570,62297609,61562287,62293307,29,"61561570,61748877,61989624,62003348,62078162,62132601,62157066,62167970,62191468,62195061,62201504,62203172,62218850,62243806,62255123,62262797,62267409,62267684,62269034,62271382,62272945,62273697,62275872,62276971,62277550,62281562,62282726,62292420,62293160,","61562372,61748982,61989804,62003497,62078258,62132668,62157224,62168163,62191653,62195170,62201554,62203950,62218983,62243898,62255215,62262894,62267492,62267819,62269169,62271555,62273081,62273844,62275966,62277048,62277679,62281709,62282869,62292556,62297609,",0,PTPRG,cmpl,cmpl,"0,1,1,1,0,0,1,0,1,0,1,0,1,2,1,0,1,0,0,0,2,0,0,1,0,0,0,2,0," +11573,11573,135,NM_001365903.2,chr3,-,65353525,66038918,65357075,66038308,25,"65353525,65359063,65361198,65363464,65364664,65364852,65375744,65379260,65381876,65383531,65387142,65391141,65401438,65429519,65430698,65437154,65439878,65442791,65448021,65453257,65470282,65478591,65493511,65621971,66037995,","65357132,65359152,65361337,65363608,65364725,65364946,65375948,65379554,65382069,65383623,65387226,65391358,65401470,65430140,65430881,65437247,65440012,65442849,65448057,65453340,65470487,65478798,65493631,65622088,66038918,",0,MAGI1,cmpl,cmpl,"0,1,0,0,2,1,1,1,0,1,1,0,1,1,1,1,2,1,1,2,1,1,1,1,0," +19860,19860,17,NM_001252216.2,chr3,+,67991543,68545621,68006626,68544503,5,"67991543,68006623,68417279,68538755,68544485,","67991804,68006744,68417420,68538880,68545621,",0,TAFA1,cmpl,cmpl,"-1,0,1,1,0," 
+6533,6533,17,NM_032682.6,chr3,-,70954707,71583978,70959246,71198381,21,"70954707,70965889,70970735,70972554,70976940,70977642,70977827,70987993,71000971,71015548,71041327,71046941,71052536,71053635,71112535,71198201,71299819,71359149,71493425,71581548,71583909,","70959391,70966056,70970805,70972676,70977042,70977722,70978029,70988077,71001059,71015653,71041532,71047095,71052626,71053773,71112637,71198392,71299880,71359244,71493555,71581697,71583978,",0,FOXP1,cmpl,cmpl,"2,0,2,0,0,1,0,0,2,2,1,0,0,0,0,0,-1,-1,-1,-1,-1," +17320,17320,18,NM_001128929.3,chr3,+,75906674,77649964,75937493,77646055,27,"75906674,75937480,77098013,77477413,77481098,77493243,77522774,77546337,77550817,77557943,77562650,77563166,77564953,77568312,77574498,77577489,77579946,77588750,77595141,77596622,77602209,77607797,77617512,77622226,77634869,77644703,77646053,","75906960,75937602,77098340,77477571,77481219,77493382,77522902,77546462,77550989,77558149,77562732,77563329,77565120,77568434,77574730,77577614,77580118,77588933,77595184,77596750,77602491,77607954,77617773,77622432,77635043,77644904,77649964,",0,ROBO2,cmpl,cmpl,"-1,0,1,1,0,1,2,1,0,1,0,1,2,1,0,1,0,1,1,2,1,1,2,2,1,1,1," +17319,17319,18,XM_011533977.3,chr3,-,78597238,79767998,78598912,79589911,32,"78597238,78600112,78606732,78614647,78617634,78627320,78631160,78633934,78635772,78639743,78646147,78647628,78651731,78657097,78659685,78661029,78661992,78667882,78668133,78668483,78670095,78685745,78688647,78693347,78714396,78717274,78717762,78746742,78938600,79125455,79589823,79767460,","78598927,78600309,78607041,78614800,78618041,78627569,78631305,78634042,78636108,78639898,78646190,78647655,78651929,78657269,78659807,78661261,78662114,78668049,78668302,78668565,78670301,78685917,78688772,78693356,78714524,78717413,78717883,78746900,78938927,79125539,79589961,79767998,",0,ROBO1,cmpl,cmpl,"0,1,1,1,2,2,1,1,1,2,1,1,1,0,1,0,1,2,1,0,1,0,1,1,2,1,0,1,1,1,0,-1," 
+2324,2324,19,NM_001381964.1,chr3,+,84958988,86074429,84959607,86066783,11,"84958988,84959589,85802046,85883290,85886189,85912372,85935766,85961468,85979163,86065604,86066664,","84959492,84959668,85802196,85883443,85886327,85912543,85935857,85961647,85979283,86065730,86074429,",0,CADM2,cmpl,cmpl,"-1,0,1,1,1,1,1,2,1,1,1," +5626,5626,20,XM_017006214.2,chr3,+,96814593,97761532,96866864,97748701,18,"96814593,96866824,97226263,97243951,97405149,97448124,97448567,97475351,97479293,97483933,97532357,97592611,97610792,97637872,97720260,97735924,97747422,97748586,","96815008,96866889,97226419,97244287,97405274,97448172,97448730,97475460,97479364,97484059,97532543,97592737,97610854,97638082,97720410,97736118,97747572,97761532,",0,EPHA6,cmpl,cmpl,"-1,0,1,1,1,0,0,1,2,1,1,1,1,0,0,0,2,2," +22471,22471,182,NM_001348802.3,chr3,-,114314499,115147288,114339004,114351858,11,"114314499,114350273,114380216,114389004,114500351,114693527,114801100,114900303,114974365,115144812,115147218,","114339426,114351878,114380404,114389105,114500391,114693575,114801174,114900342,114974416,115144907,115147288,",0,ZBTB20,cmpl,cmpl,"1,0,-1,-1,-1,-1,-1,-1,-1,-1,-1," +11321,11321,22,NM_001318915.2,chr3,-,115802373,116445487,115810316,116445031,9,"115802373,115816613,115834561,115841844,115842457,115852482,116019514,116086323,116444876,","115810414,115816646,115834597,115841993,115842578,115852617,116019640,116086556,116445487,",0,LSAMP,cmpl,cmpl,"1,1,1,2,1,1,1,2,0," 
+8998,8998,191,XM_006713813.4,chr3,+,124033368,124726325,124033740,124719470,60,"124033368,124227989,124234828,124264497,124268742,124298790,124325979,124329860,124334264,124347142,124384844,124395134,124398696,124413469,124422811,124430655,124434333,124438887,124441944,124446160,124446762,124455176,124456609,124461889,124462523,124474662,124477244,124482807,124488203,124490693,124491322,124492739,124496310,124562842,124632419,124633848,124637207,124650807,124655600,124657447,124657733,124658430,124659364,124660922,124661850,124666448,124667011,124671659,124674360,124678189,124679457,124693803,124694331,124696133,124697592,124699868,124702037,124712934,124717246,124718924,","124033813,124228064,124234943,124264690,124269255,124298913,124326171,124329992,124334495,124347265,124385036,124395304,124398871,124413665,124422978,124430775,124434525,124439037,124442059,124446276,124446885,124455359,124456728,124461956,124462633,124474732,124477334,124482900,124488315,124490884,124491424,124492882,124496413,124563089,124632703,124633953,124637303,124650938,124655667,124657551,124657803,124658517,124659457,124660973,124661928,124666634,124667183,124671898,124674614,124678313,124679517,124693831,124694503,124696255,124697724,124700033,124702116,124713135,124717385,124726325,",0,KALRN,cmpl,cmpl,"0,1,1,2,0,0,0,0,0,0,0,0,2,0,1,0,0,0,0,1,0,0,0,2,0,2,0,0,0,1,0,0,2,0,1,0,0,0,2,0,2,0,0,0,0,0,0,1,0,2,0,0,1,2,1,1,1,2,2,0," +3594,3594,25,NM_022131.3,chr3,+,139935184,140577397,139935374,140566253,17,"139935184,140175950,140403628,140404557,140421124,140448518,140459520,140466609,140532323,140546514,140556512,140558639,140562137,140562810,140563079,140563960,140566052,","139935483,140176073,140403824,140404766,140421274,140448704,140459769,140466731,140532486,140546681,140556661,140558857,140562308,140562956,140563203,140564145,140577397,",0,CLSTN2,cmpl,cmpl,"0,1,1,2,1,1,1,1,0,1,0,2,1,1,0,1,0," 
+18750,18750,26,NM_173653.4,chr3,-,143265221,143848468,143266701,143848322,16,"143265221,143268874,143363483,143382059,143467036,143493652,143495334,143552361,143574087,143578584,143652254,143693191,143795000,143796825,143832018,143848147,","143266929,143268980,143363563,143382114,143467190,143493764,143495448,143552450,143574193,143578723,143652360,143693307,143795077,143796903,143832221,143848468,",0,SLC9A9,cmpl,cmpl,"0,2,0,2,1,0,0,1,0,2,1,2,0,0,1,0," +8814,8814,3,NM_001197114.2,chr3,+,159069318,159897359,159069432,159896774,10,"159069318,159245842,159262547,159764442,159866162,159886206,159887707,159888819,159892096,159896722,","159069441,159245907,159262683,159765138,159866281,159886324,159887905,159888943,159892190,159897359,",0,IQCJ-SCHIP1,cmpl,cmpl,"0,0,2,0,0,2,0,0,1,2," +17814,17814,3,NM_001197108.2,chr3,+,159273243,159897359,159273820,159896774,7,"159273243,159866162,159886206,159887707,159888819,159892096,159896722,","159273883,159866281,159886324,159887905,159888943,159892190,159897359,",0,SCHIP1,cmpl,cmpl,"0,0,2,0,0,1,2," +11892,11892,234,XM_005247213.4,chr3,-,169083506,169663712,169084908,169663372,17,"169083506,169088999,169089999,169092957,169095075,169100884,169102059,169107925,169112786,169115382,169121055,169122579,169127843,169131428,169143697,169381186,169663335,","169085043,169089183,169090236,169093102,169095245,169100962,169102226,169107952,169112874,169116739,169121212,169122727,169128060,169131531,169143832,169381524,169663712,",0,MECOM,cmpl,cmpl,"0,2,2,1,2,2,0,0,2,1,0,2,1,0,0,1,0," +13299,13299,29,XM_047447712.1,chr3,+,173395951,174294372,173604598,174281303,6,"173395951,173604278,173807679,174275314,174278860,174280480,","173396227,173605091,173807832,174275500,174279650,174294372,",0,NLGN1,cmpl,cmpl,"-1,0,1,1,1,2," 
+12817,12817,29,XM_017006080.3,chr3,+,174440981,175810548,175096797,175803203,15,"174440981,174737640,175096789,175233930,175256410,175324174,175447228,175463400,175466978,175471638,175576040,175627290,175737305,175755219,175803004,","174441032,174737746,175097291,175234204,175256530,175324325,175447372,175463493,175467184,175471758,175576187,175627386,175737399,175755418,175810548,",0,NAALADL2,cmpl,cmpl,"-1,-1,0,2,0,0,1,1,1,0,0,0,0,1,2," +11178,11178,31,NM_001387672.1,chr3,+,188153020,188890671,188406120,188874479,10,"188153020,188406111,188484591,188524664,188609160,188708266,188760112,188866199,188872642,188874350,","188153363,188406313,188484704,188524787,188609844,188708393,188760282,188866378,188872763,188890671,",0,LPP,cmpl,cmpl,"-1,0,1,0,0,0,1,0,2,0," +6322,6322,256,NM_004113.6,chr3,-,192139389,192727541,192144008,192727193,6,"192139389,192170457,192335360,192360427,192727180,192727483,","192144127,192170656,192335464,192360538,192727323,192727541,",0,FGF12,cmpl,cmpl,"1,0,1,1,0,-1," +19052,19052,9,NM_020777.3,chr4,+,7192537,7742827,7192646,7740264,27,"7192537,7396287,7531529,7638327,7654133,7661499,7664352,7667123,7676049,7682742,7689485,7697197,7703279,7704176,7712732,7714239,7715182,7718011,7723696,7725153,7726779,7728349,7729586,7733321,7734271,7737068,7740199,","7193126,7396355,7531629,7638492,7654207,7661564,7664471,7667213,7676229,7682889,7689588,7697274,7703371,7704284,7712853,7714373,7715311,7718183,7723883,7725287,7726903,7728462,7729712,7733421,7734374,7737172,7742827,",0,SORCS2,cmpl,cmpl,"0,0,2,0,0,2,1,0,0,0,0,1,0,2,2,0,2,2,0,1,0,1,0,0,1,2,1," +9084,9084,11,NM_147182.4,chr4,-,20728605,21948772,20730081,20850644,9,"20728605,20732005,20732680,20734627,20749661,20758820,20850542,21762920,21948570,","20730129,20732068,20732785,20734735,20749732,20758890,20850667,21763032,21948772,",0,KCNIP4,cmpl,cmpl,"0,0,0,0,1,0,0,-1,-1," 
+15734,15734,11,NM_001330751.2,chr4,-,23792020,24472905,23795821,24091536,15,"23792020,23801729,23802223,23812746,23813020,23813689,23824279,23824462,23828399,23829462,23831556,23884751,24091467,24258202,24472374,","23795925,23801881,23802345,23812867,23813125,23814605,23824353,23824508,23828604,23829585,23831751,23884931,24091635,24258267,24472905,",0,PPARGC1A,cmpl,cmpl,"1,2,0,2,2,1,2,1,0,0,0,0,0,-1,-1," +388,388,16,NM_001387542.1,chr4,+,61200325,62078335,61497293,62070908,22,"61200325,61497120,61587226,61676825,61732753,61813808,61892655,61895730,61909559,61912718,61934839,61935922,61946913,61948099,61979679,61983382,61996290,61998173,62031441,62037730,62044452,62070108,","61201765,61497348,61587440,61676935,61733554,61813889,61892958,61895834,61909745,61912757,61935023,61936045,61947122,61948276,61979772,61983603,61996357,61998265,62031610,62037856,62044549,62078335,",0,ADGRL3,cmpl,cmpl,"-1,0,1,2,1,1,1,1,0,0,0,1,1,0,0,0,2,0,2,0,0,1," +3204,3204,18,XM_047449975.1,chr4,+,80321264,80963750,80321434,80963612,8,"80321272,80335743,80362753,80583092,80608332,80869992,80944809,80963516,","80321463,80335879,80362884,80583183,80608383,80870135,80944939,80963750,",0,CFAP299,cmpl,cmpl,"0,2,0,2,0,0,2,0," +1049,1049,19,NM_001025616.3,chr4,+,85475149,86002666,85570541,86000722,10,"85475149,85570521,85721884,85923647,85942065,85972035,85974887,85977569,85994582,86000478,","85475559,85570721,85721972,85923770,85942273,85972168,85974961,85977691,85995657,86002666,",0,ARHGAP24,cmpl,cmpl,"-1,0,0,1,1,2,0,2,1,2," +11673,11673,155,XM_047415964.1,chr4,-,86010404,86594074,86017227,86593945,13,"86010404,86029196,86031367,86064265,86067772,86098523,86101051,86101893,86103185,86107222,86159297,86194335,86593909,","86017370,86029274,86031431,86064390,86067955,86098595,86101217,86102032,86103244,86107352,86159467,86194407,86594074,",0,MAPK10,cmpl,cmpl,"1,1,0,1,1,1,0,2,0,2,0,0,0," 
+2782,2782,19,XM_047415679.1,chr4,+,90127393,91605295,90308284,91599057,10,"90127393,90308243,90312862,90400035,90468233,90628024,90723913,90815761,91085949,91598571,","90127831,90309608,90313047,90400129,90468354,90628232,90723991,90815845,91085994,91605295,",0,CCSER1,cmpl,cmpl,"-1,0,1,0,1,2,0,0,0,0," +7471,7471,20,XM_047450133.1,chr4,+,92303965,93810456,92304656,93806873,16,"92303965,92590130,93084994,93110747,93207403,93216737,93224613,93238370,93395606,93422770,93455661,93490638,93515215,93626268,93769209,93806714,","92304744,92590286,93085279,93110953,93207457,93216911,93224775,93238490,93395708,93422968,93455974,93490777,93515411,93626435,93769450,93810456,",0,GRID2,cmpl,cmpl,"0,1,1,1,0,0,0,0,0,0,0,1,2,0,2,0," +19589,19589,20,XM_011531886.4,chr4,-,97441248,98143476,97448750,98143150,14,"97441248,97448739,97712698,97840772,97943896,97972279,97981158,97986885,98105952,98109192,98128427,98132444,98134346,98143041,","97441534,97448888,97712814,97840932,97944007,97972440,97981318,97986993,98106064,98109305,98128592,98132603,98134459,98143476,",0,STPG2,cmpl,cmpl,"-1,0,1,0,0,1,0,0,2,0,0,0,1,0," 
+742,742,22,NM_001386187.2,chr4,+,112705621,113383736,112706145,113381471,43,"112705621,113174415,113196367,113199010,113232160,113236986,113240484,113242110,113249763,113255734,113258049,113258312,113264896,113274451,113277836,113278459,113282674,113287604,113288387,113292415,113293439,113302766,113311254,113317706,113318516,113330245,113333053,113335845,113336576,113339225,113341687,113343016,113345899,113348275,113350227,113360822,113363337,113365038,113367565,113369513,113373089,113373284,113381456,","112706217,113174517,113196466,113199109,113232259,113237172,113240583,113242209,113249862,113255932,113258148,113258411,113264995,113274649,113277935,113278558,113282872,113287703,113288486,113292514,113293538,113302839,113311399,113317809,113318620,113330470,113333208,113336057,113336781,113339322,113341916,113343142,113346022,113348308,113350249,113360897,113363469,113365182,113367851,113369805,113373173,113373449,113383736,",0,ANK2,cmpl,cmpl,"0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0,2,2,1,0,1,2,0,0,0,0,1,1,1,1,2,0,0,0," +8737,8737,3,XM_047416356.1,chr4,-,142023159,142846535,142028781,142431259,27,"142023159,142082030,142086143,142108092,142112541,142122127,142123291,142124587,142145839,142160357,142173631,142193086,142208424,142208895,142237863,142260491,142270662,142305457,142314711,142402937,142405205,142429172,142431168,142462662,142725838,142754013,142846208,","142028914,142082185,142086256,142108190,142112682,142122245,142123415,142124760,142145996,142160561,142173809,142193195,142208529,142209026,142238011,142260564,142270774,142305537,142314762,142403054,142405324,142429217,142431385,142462726,142725901,142754132,142846535,",0,INPP4B,cmpl,cmpl,"2,0,1,2,2,1,0,1,0,0,2,1,1,2,1,0,2,0,0,0,1,1,0,-1,-1,-1,-1," 
+11183,11183,3,XM_005263373.4,chr4,-,150264434,151015727,150265721,151014642,58,"150264434,150277852,150282449,150285932,150302624,150310228,150315560,150321190,150325808,150349991,150415437,150435588,150436723,150467672,150471623,150487731,150490917,150588047,150590712,150597082,150599006,150683550,150735257,150761782,150798080,150806270,150808319,150817123,150828179,150831816,150844099,150844657,150848817,150849421,150850723,150851884,150867670,150868181,150870524,150871344,150872662,150893051,150896393,150897738,150900048,150905837,150906296,150908333,150908659,150914194,150915607,150916400,150916616,150921197,150928515,150928833,151014426,151015472,","150265812,150278004,150282646,150286034,150302792,150310384,150315623,150321383,150325898,150350159,150415590,150435708,150436864,150467785,150471739,150487834,150491035,150588184,150590859,150597115,150599131,150683717,150735366,150761847,150798142,150806404,150808398,150817257,150828621,150831976,150844207,150844779,150848998,150849575,150850902,150852943,150867863,150868305,150870606,150871453,150872755,150893149,150896456,150897818,150900217,150905990,150906405,150908467,150908857,150914341,150915727,150916527,150916738,150921293,150928616,150929065,151014861,151015727,",0,LRBA,cmpl,cmpl,"2,0,1,1,1,1,1,0,0,0,0,0,0,1,2,1,0,1,1,1,2,0,2,0,1,2,1,2,1,0,0,1,0,2,0,0,2,1,0,2,2,0,0,1,0,0,2,0,0,0,0,2,0,0,1,0,0,-1," +6605,6605,28,NM_020116.5,chr4,-,161383896,162164000,161385746,162111396,16,"161383896,161455003,161459211,161481019,161500015,161510397,161538165,161542531,161587454,161656327,161759410,161775877,161920403,162033624,162111270,162163614,","161386449,161455128,161459319,161481169,161500134,161510424,161538300,161542693,161587575,161656494,161759531,161776074,161920652,162033658,162111412,162164000,",0,FSTL5,cmpl,cmpl,"2,0,0,0,1,1,1,1,0,1,0,1,1,0,0,-1," 
+11701,11701,28,XM_017008335.2,chr4,-,163524297,164384019,163528747,163988599,8,"163524297,163545595,163585748,163613313,163700812,163988500,164111587,164383186,","163529046,163545743,163585929,163613393,163700863,163988709,164111662,164384019,",0,MARCHF1,cmpl,cmpl,"1,0,2,0,0,0,-1,-1," +6768,6768,29,NM_001034845.3,chr4,+,171813403,173041559,171814580,173040100,13,"171813403,171814411,172229655,172311613,172348522,172809360,172813539,172882789,172931160,172952036,173009177,173021475,173039932,","171813990,171814718,172229764,172311752,172348689,172809546,172813723,172882907,172931268,172952258,173009294,173021625,173041559,",0,GALNTL6,cmpl,cmpl,"-1,0,0,1,2,1,1,2,0,0,0,0,0," +20213,20213,30,XM_047415944.1,chr4,+,181447612,182803024,182324020,182800351,30,"181447612,181739486,181867751,182041663,182323945,182600923,182628650,182653770,182673004,182679665,182680247,182680542,182681813,182688165,182714086,182728964,182730199,182730877,182736807,182738400,182743169,182751799,182753449,182754384,182773471,182774917,182789092,182792273,182796636,182799595,","181447772,181739649,181867832,182041743,182324252,182601161,182628889,182653893,182673219,182679876,182680349,182680737,182682014,182688351,182714233,182729181,182730319,182731139,182737075,182738544,182743419,182752032,182753604,182755259,182773647,182775153,182789389,182793885,182796767,182803024,",0,TENM3,cmpl,cmpl,"-1,-1,-1,-1,0,1,2,1,1,0,1,1,1,1,1,1,2,2,0,1,1,2,1,0,2,1,0,0,1,0," 
+4205,4205,10,XM_005248251.4,chr5,-,10971835,11904446,10973452,11903853,23,"10971835,10981772,10988110,10992550,11017973,11022768,11082695,11083880,11098574,11110857,11117449,11159575,11199447,11236690,11346371,11364695,11384664,11397030,11411535,11412034,11564943,11732135,11903816,","10973713,10981846,10988242,10992677,11018058,11022979,11082846,11083955,11098748,11111043,11117567,11159759,11199661,11236823,11346627,11364890,11385229,11397203,11411652,11412069,11565056,11732272,11904446,",0,CTNND2,cmpl,cmpl,"0,1,1,0,2,1,0,0,0,0,2,1,0,2,1,1,0,1,1,2,0,1,0," +2949,2949,11,NM_001349558.2,chr5,-,19471295,20575713,19473225,19838986,16,"19471295,19483300,19502991,19520656,19543868,19571578,19591056,19612433,19721346,19746941,19838758,19981059,19991872,20255443,20338091,20575461,","19473716,19483552,19503109,19520778,19544005,19571832,19591244,19612601,19721466,19747236,19839242,19981178,19992014,20255505,20338239,20575713,",0,CDH18,cmpl,cmpl,"1,1,0,1,2,0,1,1,1,0,0,-1,-1,-1,-1,-1," +2944,2944,11,XM_047416602.1,chr5,-,21750672,22853344,21751736,22078676,17,"21750672,21755590,21760557,21764977,21783357,21802166,21816944,21842160,21854670,21975090,22078445,22172340,22212497,22405256,22505269,22810616,22853057,","21752236,21755842,21760675,21765099,21783494,21802420,21817132,21842328,21854790,21975385,22078862,22172409,22212643,22405351,22505364,22810709,22853344,",0,CDH12,cmpl,cmpl,"1,1,0,1,2,0,1,1,1,0,0,-1,-1,-1,-1,-1,-1," +14958,14958,16,XM_024446110.2,chr5,-,58969037,60522128,58974663,60185598,18,"58969037,58975656,58976349,58977190,58988492,58989754,58990803,58991831,58993371,59038858,59180594,59185188,59193499,59215776,59988487,60147747,60185556,60522050,","58975080,58975839,58976472,58977345,58988592,58989919,58990902,58992004,58993465,59038971,59180644,59185262,59193536,59215968,59988717,60147894,60185687,60522128,",0,PDE4D,cmpl,cmpl,"0,0,0,1,0,0,0,1,0,1,2,0,2,2,0,0,0,-1," 
+11745,11745,0,NM_001393525.1,chr5,+,66596392,67169593,66596655,67167051,28,"66596392,66759708,66788669,66899950,67054403,67090161,67095596,67100434,67102535,67104365,67110097,67114086,67118681,67121016,67130209,67131812,67133513,67134522,67136562,67142114,67142420,67144668,67145143,67152636,67153457,67160455,67162606,67163146,","66597018,66759862,66788794,66899982,67054492,67090231,67095675,67100592,67102611,67104575,67110199,67114219,67118749,67121102,67130418,67131951,67133646,67134688,67136664,67142237,67142533,67144796,67145379,67152866,67153580,67160592,67162788,67169593,",0,MAST4,cmpl,cmpl,"0,0,1,0,2,1,2,0,2,0,0,0,1,0,2,1,2,0,1,1,1,0,2,1,0,0,2,1," +390,390,159,XM_017009967.2,chr5,+,90558796,91164437,90558895,91163900,89,"90558796,90614834,90617803,90622596,90625129,90627210,90628561,90629209,90635113,90637724,90642635,90642855,90643802,90644705,90645967,90647497,90651603,90652345,90653208,90657904,90672545,90674053,90675242,90676079,90679548,90681314,90683585,90685779,90689860,90690796,90692604,90693889,90696936,90703664,90704388,90705378,90706230,90708815,90710980,90711183,90712286,90716466,90720047,90720934,90724831,90725085,90725548,90728668,90729641,90745045,90745590,90750550,90753573,90754982,90756453,90756978,90759408,90763304,90774185,90776452,90777904,90778426,90778864,90781429,90783123,90783837,90788070,90789701,90790872,90802738,90805283,90807601,90810232,90815618,90823424,90828943,90840577,90848636,90853283,90854061,90855740,90863756,90965414,90985343,91072446,91102218,91150029,91153220,91163781,","90558917,90615019,90617953,90622701,90625243,90627776,90628832,90629539,90635290,90637948,90642762,90643041,90643983,90644869,90646091,90647764,90651730,90652563,90653952,90658278,90672722,90674234,90675445,90676209,90679629,90681454,90684195,90685995,90690076,90691041,90692786,90694701,90697146,90703795,90704488,90705579,90706394,90708909,90711059,90711322,90712428,90716729,90720223,90721059,90724989,90725232,90725656,90728933,90729764,90745265,90745795
,90750697,90753829,90755185,90756630,90757161,90759588,90763469,90774303,90776576,90778043,90778609,90779097,90781578,90783325,90784057,90788310,90789851,90791346,90802882,90805458,90807737,90811338,90815736,90823596,90829186,90840985,90848821,90853533,90854201,90855901,90863857,90965531,90985522,91072604,91102340,91150221,91153398,91164437,",0,ADGRV1,cmpl,cmpl,"0,1,0,0,0,0,2,0,0,0,2,0,0,1,0,1,1,2,1,1,0,0,1,0,1,1,0,1,1,1,0,2,1,1,0,1,1,0,1,2,0,1,0,2,1,0,0,0,1,1,2,0,0,1,0,0,0,0,0,1,2,0,0,2,1,2,0,0,0,0,0,1,2,1,2,0,0,0,2,0,2,1,0,0,2,1,0,0,1," +11854,11854,163,NM_024717.7,chr5,-,94703689,95285094,94707495,95284575,23,"94703689,94708511,94710817,94714776,94779109,94799012,94868332,94870416,94870871,94871314,94873138,94888878,94894648,94909250,94912805,94917895,94923961,94931952,94940083,94942347,94953218,95017366,95283855,","94707567,94708609,94710927,94714886,94779163,94799132,94868452,94870491,94870973,94871417,94873241,94888972,94894835,94909381,94912976,94917973,94924021,94931991,94940195,94942427,94953361,95017484,95285094,",0,MCTP1,cmpl,cmpl,"0,1,2,0,0,0,0,0,0,2,1,0,2,0,0,0,0,0,2,0,1,0,0," +2462,2462,20,NM_001423255.1,chr5,+,95961428,96774683,96722677,96771679,31,"95961428,96379565,96675538,96695835,96722638,96727488,96729152,96729611,96730779,96736171,96737848,96740037,96741265,96741493,96742654,96746341,96747344,96748517,96750586,96754059,96754657,96757443,96757582,96762273,96765220,96766052,96767437,96767906,96770530,96771643,96772639,","95961586,96379652,96675601,96695907,96722698,96727530,96729209,96729725,96730860,96736240,96737947,96740118,96741358,96741580,96742756,96746425,96747392,96748613,96750682,96754161,96754741,96757494,96757654,96762372,96765325,96766145,96767482,96767999,96770602,96771702,96774683,",0,CAST,cmpl,cmpl,"-1,-1,-1,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1," 
+6175,6175,21,NM_001163315.3,chr5,-,107859034,108382098,107861719,108381691,9,"107859034,107881036,108020924,108186116,108224120,108348398,108364737,108367830,108380698,","107861860,107881179,108021001,108186247,108224228,108348530,108364995,108367953,108382098,",0,FBXL17,cmpl,cmpl,"0,1,2,0,0,0,0,0,0," +6604,6604,200,XM_011543283.2,chr5,-,133196454,133842067,133199094,133603983,16,"133196454,133201932,133210190,133217228,133220747,133224189,133225149,133225657,133233416,133249409,133312653,133316458,133400737,133567185,133603857,133841951,","133199797,133202042,133210298,133217378,133220866,133224216,133225284,133225819,133233537,133249576,133312777,133316652,133400986,133567219,133603993,133842067,",0,FSTL4,cmpl,cmpl,"2,0,0,0,1,1,1,1,0,1,0,1,1,0,0,-1," +19287,19287,25,NM_004598.4,chr5,-,136975297,137499326,136978653,137498558,11,"136975297,136979331,136985139,136988421,136992483,137067714,137112434,137140579,137267009,137498372,137499178,","136978844,136979469,136985202,136988643,136992600,137067829,137112561,137140694,137267055,137498558,137499326,",0,SPOCK1,cmpl,cmpl,"1,1,1,1,1,0,2,1,0,0,-1," +18143,18143,27,XM_047417519.1,chr5,+,155727831,156767788,156329576,156759390,11,"155727831,155870343,156123855,156329533,156344488,156508600,156589230,156594931,156647463,156757580,156759216,","155728990,155870424,156124019,156329579,156344677,156508702,156589318,156595051,156647536,156757704,156767788,",0,SGCD,cmpl,cmpl,"-1,-1,-1,0,0,0,0,1,1,2,0," 
+20212,20212,3,XM_047417419.1,chr5,+,166979028,168264157,167284837,168262810,29,"166979028,167073241,167284763,167375197,167952587,167992943,168047426,168062059,168090573,168098025,168118291,168124849,168126753,168162610,168190336,168195175,168198852,168199863,168203688,168204371,168215039,168216767,168218124,168226087,168227894,168244419,168246756,168260282,168262048,","166979523,167073356,167285063,167375473,167952822,167993182,168047549,168062265,168090769,168098127,168118486,168125050,168126966,168162757,168190547,168195295,168199114,168200131,168203832,168204621,168215272,168216922,168218999,168226263,168228130,168244716,168248371,168260413,168264157,",0,TENM2,cmpl,cmpl,"-1,-1,0,1,1,2,1,1,0,1,1,1,1,1,1,2,2,0,1,1,2,1,0,2,1,0,0,1,0," +18780,18780,29,NM_001271946.2,chr5,-,168661739,169301139,168666453,169300709,36,"168661739,168669782,168671197,168673176,168683965,168685686,168686978,168692606,168696291,168700581,168707975,168710894,168712282,168722255,168722932,168724415,168748301,168749471,168752954,168753863,168760861,168762538,168772780,168774234,168785906,168789559,168795506,168806445,168817299,168823259,168844583,168883264,169193478,169244704,169251387,169300512,","168666689,168669991,168671483,168673331,168684096,168685927,168687116,168692700,168696431,168700679,168708121,168711058,168712354,168722327,168723004,168724484,168748434,168749635,168753098,168754007,168760936,168762689,168772944,168774378,168785978,168789631,168795578,168806587,168817463,168823331,168844655,168883336,169193550,169244776,169251459,169301139,",0,SLIT3,cmpl,cmpl,"1,2,1,2,0,2,2,1,2,0,1,2,2,2,2,2,1,2,2,2,2,1,2,2,2,2,2,1,2,2,2,2,2,2,2,0," +7075,7075,9,NM_001500.4,chr6,-,1623805,2245605,1624168,2245422,11,"1623805,1624471,1726415,1742467,1930102,1959866,1960773,2115770,2117468,2124686,2245320,","1624231,1624540,1726512,1742586,1930230,1959971,1960966,2115880,2117556,2124731,2245605,",0,GMDS,cmpl,cmpl,"0,0,2,0,1,1,0,1,0,0,0," 
+15141,15141,85,NM_001322308.3,chr6,+,12716766,13287837,12718744,13287078,14,"12716766,12718698,12749643,13053364,13160203,13182518,13205814,13227815,13230036,13272859,13278267,13283421,13286145,13287062,","12716896,12718847,12749790,13053529,13160284,13182686,13206136,13228063,13230193,13272915,13278329,13283562,13286222,13287837,",0,PHACTR1,cmpl,cmpl,"-1,0,1,1,1,1,1,2,1,2,1,0,0,2," +3004,3004,11,XM_011514718.1,chr6,+,20534456,21232404,20546350,21231039,15,"20534456,20535350,20546345,20548592,20649292,20739518,20758594,20781144,20846074,20955418,21000226,21065047,21198020,21201109,21230847,","20534574,20535394,20546523,20548705,20649377,20739615,20758643,20781265,20846178,20955585,21000372,21065228,21198104,21201274,21232404,",0,CDKAL1,cmpl,cmpl,"-1,-1,0,2,1,2,0,1,2,1,0,2,0,0,0," +15616,15616,14,NM_001318876.2,chr6,+,43517088,44461400,43517109,44461379,11,"43517088,43517305,43519332,43519705,43520065,43520274,43520624,43520931,43529248,44441889,44461319,","43517178,43517377,43519440,43519838,43520185,43520427,43520774,43521048,43529271,44441913,44461400,",0,POLR1C,cmpl,cmpl,"0,0,0,0,1,1,1,1,1,0,0," +19680,19680,14,NM_181356.3,chr6,-,44826725,45377934,44829815,45322916,13,"44826725,44932652,44953309,44954494,44961752,45003652,45014800,45020545,45105921,45321795,45322878,45365200,45377767,","44829857,44932763,44953417,44954607,44961828,45003792,45014891,45020632,45106006,45321891,45322967,45365301,45377934,",0,SUPT3H,cmpl,cmpl,"0,0,0,1,0,1,0,0,2,2,0,-1,-1," +9203,9203,16,NM_001350622.2,chr6,-,61679960,62286225,61680962,62285948,10,"61679960,61697194,61732681,61816954,61894634,61901243,61978065,62047877,62177184,62285857,","61681060,61697253,61732764,61817005,61894833,61901371,61978212,62047994,62177312,62286225,",0,KHDRBS2,cmpl,cmpl,"1,2,0,0,2,0,0,0,1,0," 
+5841,5841,16,NM_001292009.2,chr6,-,63719979,65707226,63720595,65495410,44,"63719979,63726518,63741912,63762460,63778005,63788104,63789057,63806189,63864185,63984382,63999074,64066337,64081855,64230591,64306969,64388689,64436173,64439161,64590222,64593116,64617417,64626120,64813377,64821644,64822650,64886696,64902112,64902403,64912483,64945792,64997581,65057613,65295862,65334979,65344037,65353457,65384385,65402477,65405173,65490593,65494662,65495858,65639777,65707134,","63721797,63726680,63741975,63762633,63778180,63788249,63789224,63806372,63864358,63984603,63999183,64066491,64082002,64230824,64307082,64388840,64436265,64439352,64591989,64593309,64617533,64626245,64813577,64821723,64822822,64886842,64902220,64902500,64912743,64945914,64997703,65057727,65296119,65335146,65344177,65353617,65384500,65402605,65405367,65490707,65495607,65495993,65639892,65707226,",0,EYS,cmpl,cmpl,"1,1,1,2,1,0,1,1,2,0,2,1,1,2,0,2,0,1,1,0,1,2,0,2,1,2,2,1,2,0,1,1,2,0,1,0,2,0,1,1,0,-1,-1,-1," +367,367,17,NM_001704.3,chr6,+,68635281,69389506,68638675,69388891,32,"68635281,68637390,68638660,68930558,68936518,68943829,68956023,68956644,68974762,68975233,68993767,69014037,69018390,69048184,69049270,69062933,69075994,69233289,69235031,69239123,69324871,69327819,69330505,69332922,69338915,69339332,69354232,69355820,69360868,69372405,69382830,69388702,","68636000,68637562,68639432,68930669,68936680,68943994,68956188,68956809,68974864,68975340,68993962,69014106,69018499,69048334,69049346,69063036,69076038,69233416,69235135,69239226,69325022,69327889,69330572,69333008,69339014,69339504,69354328,69355860,69361512,69372441,69382935,69389506,",0,ADGRB3,cmpl,cmpl,"-1,-1,0,1,1,1,1,1,1,1,0,0,0,1,1,2,0,2,0,2,0,1,2,0,2,2,0,0,1,0,0,0," 
+9132,9132,142,XM_047419077.1,chr6,+,72622063,73198853,72622189,73195414,12,"72622063,73003907,73041935,73077321,73077761,73111307,73120482,73133420,73169745,73190572,73192564,73194451,","72622587,73003998,73042062,73077497,73077887,73111403,73120577,73133641,73169854,73190704,73192691,73198853,",0,KCNQ5,cmpl,cmpl,"0,2,0,1,0,0,0,2,1,2,2,0," +7475,7475,21,NM_021956.5,chr6,+,101393707,102070083,101399277,102068511,17,"101393707,101398984,101621948,101626379,101676622,101682552,101686179,101799647,101802330,101818369,101859286,101889639,101924600,101928414,102035340,102055329,102068346,","101393837,101399392,101622116,101626637,101676804,101682606,101686353,101799791,101802438,101818483,101859493,101889863,101924719,101928632,102035566,102055580,102070083,",0,GRIK2,cmpl,cmpl,"-1,0,1,1,1,0,0,0,0,0,0,0,2,1,0,1,0," +13268,13268,23,XM_017010318.3,chr6,+,123803864,124825640,123804200,124823229,6,"123803864,124283004,124658185,124791338,124818386,124823219,","123804254,124283142,124658386,124791399,124818468,124825640,",0,NKAIN2,cmpl,cmpl,"0,0,0,0,1,2," +16473,16473,195,NM_001291981.2,chr6,-,127968784,128520599,127970226,128520358,33,"127968784,127973021,127973663,127976656,127976922,127981115,127982830,127983241,127985720,127990768,127991293,127992672,127995248,127995461,127996900,127998719,128001195,128003196,128005083,128009129,128064757,128067518,128078812,128082436,128083714,128089689,128184431,128218921,128240034,128242520,128322038,128397565,128520258,","127970280,127973157,127973827,127976782,127977054,127981289,127982980,127983377,127985875,127990885,127991391,127992709,127995266,127995538,127996988,127998904,128001207,128003232,128005244,128009268,128064794,128067792,128078918,128082638,128083824,128089992,128184725,128219096,128240150,128242602,128322310,128397688,128520599,",0,PTPRK,cmpl,cmpl,"0,2,0,0,0,0,0,2,0,0,1,0,0,1,0,1,1,1,2,1,0,2,1,0,1,1,1,0,1,0,1,1,0," 
+10051,10051,24,NM_001079823.2,chr6,+,128883137,129516566,128883245,129516347,64,"128883137,129049917,129059783,129098172,129143900,129146958,129148978,129154504,129165575,129177705,129190204,129192679,129250111,129252083,129260710,129267105,129270623,129280060,129287846,129291613,129297684,129300735,129312860,129314654,129315475,129315761,129316037,129320537,129328277,129342342,129349297,129353163,129366218,129369891,129383121,129391490,129393044,129401223,129402323,129403820,129427751,129438645,129440815,129443062,129445666,129452987,129454154,129456334,129460199,129464289,129465144,129473213,129478692,129481262,129486473,129491900,129492314,129502658,129503090,129505199,129507488,129512362,129514372,129516189,","128883357,129050088,129059896,129098415,129144080,129147048,129149096,129154683,129165675,129177866,129190345,129192853,129250213,129252295,129260822,129267219,129270751,129280147,129288058,129291720,129297865,129300872,129313097,129314798,129315655,129315950,129316171,129320655,129328412,129342467,129349384,129353357,129366361,129369990,129383233,129391653,129393255,129401340,129402487,129403959,129427854,129438762,129440998,129443068,129445821,129453131,129454288,129456494,129460324,129464452,129465289,129473352,129478813,129481439,129486622,129492077,129492483,129502771,129503280,129505355,129507642,129512493,129514595,129516566,",0,LAMA2,cmpl,cmpl,"0,1,1,0,0,0,0,1,0,1,0,0,0,0,2,0,0,2,2,1,0,1,0,0,0,0,0,2,0,0,2,2,1,0,0,1,2,0,0,2,0,1,1,1,1,0,0,2,0,2,0,1,2,0,0,2,2,0,2,0,0,1,0,1," 
+21962,21962,26,NM_007124.3,chr6,+,144285334,144853034,144291828,144850997,75,"144285334,144291736,144403122,144421877,144423548,144423985,144426286,144428777,144429580,144435934,144437564,144438744,144440351,144444280,144447210,144447601,144448599,144451369,144453781,144458769,144459173,144461196,144462653,144473719,144474603,144479811,144482208,144485384,144487547,144488672,144490070,144490928,144493300,144499256,144510943,144513908,144514649,144516228,144516810,144521979,144523015,144531051,144533084,144537581,144539293,144542794,144548639,144550964,144554687,144557156,144577098,144678405,144700086,144730356,144748245,144751805,144754719,144757928,144771906,144774289,144781921,144789193,144793833,144797823,144803035,144820881,144827347,144827610,144828789,144835779,144836300,144839172,144840739,144846804,144850988,","144285821,144291907,144403184,144421970,144423626,144424078,144426459,144428893,144429741,144436138,144437746,144438895,144440471,144444382,144447318,144447781,144448769,144451493,144453869,144459011,144459354,144461342,144462866,144473833,144474759,144479982,144482388,144485519,144487697,144488834,144490199,144491102,144493456,144499427,144511123,144514037,144514820,144516387,144516948,144522171,144523188,144531202,144533260,144537717,144539443,144542870,144548854,144551082,144554893,144557311,144577288,144678578,144700243,144730486,144748514,144751952,144754798,144757989,144771968,144774364,144782123,144789279,144793991,144797990,144803147,144821018,144827386,144827676,144828855,144835938,144836541,144839284,144840832,144846827,144853034,",0,UTRN,cmpl,cmpl,"-1,0,1,0,0,0,0,2,1,0,0,2,0,0,0,0,0,2,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,2,0,0,1,0,1,0,2,0,2,0,1,0,0,1,2,1,1,2,1,0,2,0,2,2,2,2,2,0,1,1,0," 
+16072,16072,28,NM_004562.3,chr6,-,161347416,162727766,161350098,162727668,12,"161347416,161360087,161386793,161548853,161569354,161785771,161973301,162054090,162201130,162262524,162443309,162727661,","161350211,161360205,161386877,161549003,161569416,161785908,161973417,162054174,162201252,162262765,162443473,162727766,",0,PRKN,cmpl,cmpl,"1,0,0,0,1,2,0,0,1,0,1,0," +14617,14617,228,NM_001080378.2,chr6,+,162727131,163315500,162728235,163314987,6,"162727131,162728159,162814146,163062149,163089258,163314826,","162727279,162728391,162814281,163062321,163089408,163315500,",0,PACRG,cmpl,cmpl,"-1,0,0,0,1,1," +14946,14946,28,XM_017010194.3,chr6,-,165327288,165988052,165333121,165910865,24,"165327288,165336122,165339277,165343390,165379193,165388297,165392645,165395180,165396316,165413500,165416188,165418634,165428657,165430286,165431421,165432973,165435236,165448927,165450241,165482314,165543439,165711018,165910807,165986107,","165333127,165336215,165339358,165343502,165379366,165388453,165392796,165395264,165396459,165413687,165416281,165418777,165428709,165430345,165431472,165433129,165435377,165448977,165450362,165482343,165543568,165711153,165910932,165988052,",0,PDE10A,cmpl,cmpl,"0,0,0,2,0,0,2,2,0,2,2,0,2,0,0,0,0,1,0,1,1,1,0,-1," +16474,16474,73,NM_001291981.2,chr6_KI270801v1_alt,-,313982,869933,315424,869692,33,"313982,318219,318861,321854,322120,326313,328028,328439,330918,335966,336491,337870,340446,340659,342098,343917,346393,348394,350281,358462,414091,416852,428146,431770,433048,439023,533765,568255,589368,591854,671372,746899,869592,","315478,318355,319025,321980,322252,326487,328178,328575,331073,336083,336589,337907,340464,340736,342186,344102,346405,348430,350442,358601,414128,417126,428252,431972,433158,439326,534059,568430,589484,591936,671644,747022,869933,",0,PTPRK,cmpl,cmpl,"0,2,0,0,0,0,0,2,0,0,1,0,0,1,0,1,1,1,2,1,0,2,1,0,1,1,1,0,1,0,1,1,0," 
+17898,17898,9,NM_152744.4,chr7,+,3301251,4269000,3301586,4265384,45,"3301251,3619079,3639003,3641957,3821449,3950922,3951729,3958930,3962656,3967317,3969256,3971465,3974368,3987185,4010965,4012094,4017170,4049347,4051637,4067837,4076997,4079462,4110662,4113288,4114036,4127380,4129907,4132324,4145721,4149261,4158447,4161785,4174221,4175774,4178484,4205878,4208098,4210024,4220108,4221238,4233254,4237646,4241792,4245675,4265123,","3301884,3619239,3639110,3642105,3821583,3951034,3951920,3959014,3962851,3967434,3969424,3971568,3974545,3987322,4011113,4012235,4017352,4049463,4051830,4067936,4077189,4079584,4110772,4113439,4114274,4127496,4130097,4132423,4145916,4149463,4158551,4161856,4174357,4175834,4178586,4205994,4208285,4210162,4220270,4221364,4233419,4237784,4241913,4245805,4269000,",0,SDK1,cmpl,cmpl,"0,1,2,1,2,1,2,1,1,1,1,1,2,2,1,2,2,1,0,1,1,1,0,2,0,1,0,1,1,1,2,1,0,1,1,1,0,1,1,1,1,1,1,2,0," +4755,4755,10,XM_011515154.3,chr7,-,14145048,14974858,14149130,14841263,26,"14145048,14176838,14178030,14338514,14345300,14478160,14574211,14580861,14583051,14607433,14613339,14621377,14630235,14672928,14682552,14682752,14685244,14694074,14698094,14701680,14718541,14736040,14753927,14757654,14841193,14974695,","14149238,14176899,14178151,14338710,14345391,14478225,14574372,14580951,14583137,14607508,14613413,14621494,14630271,14673027,14682669,14682841,14685362,14694194,14698169,14701730,14718685,14736194,14753948,14757731,14841450,14974858,",0,DGKB,cmpl,cmpl,"0,2,1,0,2,0,1,1,2,2,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,-1," 
+7833,7833,11,NM_001321868.2,chr7,+,18086824,19002416,18162324,18996062,26,"18086824,18162228,18496261,18585280,18590344,18591515,18593907,18634626,18644670,18647784,18648465,18666212,18727579,18749004,18762156,18767105,18793344,18829160,18829460,18835466,18835899,18874477,18935808,18954145,18975805,18996022,","18087213,18162349,18496324,18585522,18590486,18591642,18594029,18634742,18644793,18647998,18648683,18666476,18727757,18749138,18762277,18767155,18793452,18829216,18829548,18835586,18835997,18874596,18935942,18954230,18975953,19002416,",0,HDAC9,cmpl,cmpl,"-1,0,1,1,0,1,2,1,0,0,1,0,0,1,0,1,0,0,2,0,0,2,1,0,1,2," +3972,3972,12,NM_182899.5,chr7,+,28299320,28825894,28494929,28819279,10,"28299320,28494905,28507615,28570364,28718752,28724221,28804198,28809186,28818070,28819115,","28299441,28494999,28507737,28570537,28718879,28724332,28804522,28809414,28818179,28825894,",0,CREB5,cmpl,cmpl,"-1,0,1,0,2,0,0,0,0,1," +14951,14951,103,NM_001322059.2,chr7,-,31616776,32428224,31790245,32428131,18,"31783509,31809030,31815923,31823072,31824866,31828291,31837179,31837869,31847967,31850640,31864941,31873291,31877969,31878995,31880746,32169784,32209488,32427821,","31790259,31809108,31816154,31823248,31824987,31828373,31837300,31837971,31848096,31850741,31865082,31873408,31878036,31879178,31880860,32169956,32209539,32428224,",0,PDE1C,cmpl,cmpl,"1,1,1,2,1,0,2,2,2,0,0,0,2,2,2,1,1,0," +5490,5490,108,XM_011515654.3,chr7,-,36852905,37449326,36855550,37342690,23,"36852905,36861658,36870392,36878009,36887559,36894853,37013298,37096618,37133129,37211385,37213334,37216644,37222614,37224878,37233094,37244355,37259180,37271831,37314849,37315919,37342612,37428790,37448859,","36855751,36861736,36870475,36878117,36887672,36895017,37013435,37096727,37133234,37211517,37213457,37216695,37222693,37225030,37233194,37244391,37259350,37271882,37314922,37315960,37342763,37429756,37449326,",0,ELMO1,cmpl,cmpl,"0,0,1,1,2,0,1,0,0,0,0,0,2,0,2,2,0,0,2,0,0,-1,-1," 
+19646,19646,111,XM_047420843.1,chr7,+,40135004,40882597,40135020,40881766,14,"40135004,40180946,40181954,40188494,40189543,40194939,40237634,40274512,40316759,40449286,40459100,40496283,40749433,40881680,","40135120,40180998,40182028,40188580,40189594,40195060,40237726,40274656,40316855,40449358,40459198,40496386,40749497,40882597,",0,SUGCT,cmpl,cmpl,"0,1,2,1,0,0,1,0,0,0,0,2,0,1," +1486,1486,17,XM_047420162.1,chr7,+,69598474,70793506,69599653,70790996,16,"69598474,69899285,70118131,70134535,70435751,70698568,70762869,70764754,70766113,70771548,70777102,70781614,70784941,70785954,70787208,70789747,","69599962,69899498,70118233,70134571,70435781,70698620,70763341,70765005,70766334,70771644,70777174,70781756,70785019,70786038,70787431,70793506,",0,AUTS2,cmpl,cmpl,"0,0,0,0,0,0,1,2,1,0,0,0,1,1,1,2," +6756,6756,17,NM_022479.3,chr7,+,71132143,71713599,71132802,71712146,11,"71132143,71335549,71388234,71415888,71420907,71571284,71665410,71669971,71677210,71710760,71712017,","71133040,71335733,71388401,71416063,71421105,71571402,71665596,71670109,71677306,71710928,71713599,",0,GALNT17,cmpl,cmpl,"0,1,2,1,2,2,0,0,0,0,0," +2354,2354,17,XM_017012677.2,chr7,-,71779490,72504279,71787774,72504113,6,"71779490,71810335,72023656,72106150,72278685,72504111,","71787902,71810492,72023769,72106294,72278810,72504279,",0,CALN1,cmpl,cmpl,"1,0,1,1,2,0," +11575,11575,18,XM_017012840.3,chr7,-,78017054,79453667,78019314,79453320,25,"78017054,78026067,78065573,78078946,78125693,78127196,78132888,78135020,78160024,78167915,78178010,78185628,78194873,78201161,78255942,78343777,78345443,78345921,78369155,78489760,78501576,78521429,78627119,79007089,79453019,","78019976,78026104,78065662,78079085,78125837,78127416,78133060,78135206,78160273,78168108,78178102,78185670,78195063,78201193,78256581,78343960,78345572,78346043,78369213,78489840,78501787,78521645,78627239,79007206,79453667,",0,MAGI2,cmpl,cmpl,"1,0,1,0,0,2,1,1,1,0,1,1,0,1,1,1,1,2,1,2,1,1,1,1,0," 
+17964,17964,153,XM_047419751.1,chr7,-,83955776,84492725,83961370,84194586,21,"83955776,83963204,83977131,83981320,83985435,84001954,84005338,84007352,84011021,84011182,84014208,84046323,84060464,84110469,84129122,84134793,84194474,84295041,84307206,84371823,84492459,","83961826,83963347,83977196,83981478,83985477,84002046,84005558,84007497,84011091,84011297,84014351,84046443,84060558,84110589,84129185,84134951,84194668,84295131,84307292,84371900,84492725,",0,SEMA3A,cmpl,cmpl,"0,1,2,0,0,1,0,2,1,0,1,1,0,0,0,1,0,-1,-1,-1,-1," +23192,23192,19,NM_181646.5,chr7,+,88759699,89338528,88759976,89337032,4,"88759699,89218154,89327343,89333362,","88760084,89218295,89327474,89338528,",0,ZNF804B,cmpl,cmpl,"0,0,0,2," +2981,2981,159,NM_001287135.2,chr7,+,90596320,91210590,90596627,91118180,15,"90596320,90604217,90726566,90747680,90790572,90863174,90899290,90917600,90955696,90984147,91045896,91079431,91112541,91118064,91207164,","90596718,90604249,90726812,90747775,90790652,90863269,90899353,90917724,90955817,90984241,91045960,91079480,91112681,91118208,91210590,",0,CDK14,cmpl,cmpl,"0,1,0,0,2,1,0,0,1,2,0,1,2,1,-1," +10240,10240,21,NM_199000.3,chr7,+,104328602,104908561,104328779,104906215,3,"104328602,104736674,104906186,","104329224,104736911,104908561,",0,LHFPL3,cmpl,cmpl,"0,1,1," +8697,8697,22,XM_047420927.1,chr7,-,110662643,111562492,110663711,111521447,7,"110662643,110963499,111417258,111487237,111521312,111532517,111561850,","110663721,110963565,111417342,111487341,111521449,111532585,111562492,",0,IMMP2L,cmpl,cmpl,"2,2,2,0,0,-1,-1," 
+2329,2329,23,XM_017012796.3,chr7,-,122318410,122886460,122320164,122663539,33,"122318410,122325476,122345573,122360787,122360929,122379367,122386294,122387025,122388582,122393195,122393440,122407539,122414067,122416060,122436359,122438340,122441511,122451373,122471374,122474380,122480851,122490080,122491311,122513248,122554549,122581178,122615180,122621480,122629247,122663236,122736954,122885393,122885998,","122320338,122325581,122345681,122360820,122361013,122379442,122386309,122387173,122388738,122393315,122393582,122407696,122414076,122416164,122436380,122438464,122441575,122451475,122471562,122474517,122480860,122490281,122491420,122513315,122554689,122581290,122615299,122621717,122629328,122663569,122737068,122885445,122886460,",0,CADPS2,cmpl,cmpl,"0,0,0,0,0,0,0,2,2,2,1,0,0,1,1,0,2,2,0,1,1,1,0,2,0,2,0,0,0,0,-1,-1,-1," +7507,7507,24,NM_001371083.1,chr7,-,126438597,127252941,126439118,127243204,12,"126438597,126446125,126532951,126609361,126769864,126902541,126903971,126904547,127106495,127242694,127248219,127252796,","126439168,126446372,126533887,126609498,126770065,126902679,126904126,126904683,127106712,127243515,127248323,127252941,",0,GRM8,cmpl,cmpl,"1,0,0,1,1,1,2,1,0,0,-1,-1," 
+15509,15509,199,XM_047421017.1,chr7,-,132123339,132648688,132130478,132508693,33,"132123339,132133048,132140598,132145118,132146509,132147899,132148542,132159472,132164141,132165133,132168303,132174777,132179686,132180585,132181380,132182096,132185298,132187470,132194061,132198484,132202645,132203322,132210942,132223526,132226160,132227450,132228345,132241065,132298090,132489291,132507505,132565464,132576421,","132130574,132133199,132140811,132145288,132146700,132147999,132148646,132159632,132164288,132165200,132168572,132174920,132179921,132180732,132181620,132182190,132185463,132187607,132194179,132198636,132202836,132203419,132211143,132223641,132226260,132227604,132228469,132241166,132298222,132489474,132508779,132565647,132576470,",0,PLXNA4,cmpl,cmpl,"0,2,2,0,1,0,1,0,0,2,0,1,0,0,0,2,2,0,2,0,1,0,0,2,1,0,2,0,0,0,0,-1,-1," +5814,5814,200,NM_021807.4,chr7,+,133253077,134065761,133253101,134064528,18,"133253077,133274981,133288921,133305876,133317283,133356329,133374827,133475327,133480049,133630044,133817324,133895598,133917582,133937890,133997491,134004911,134007675,134064290,","133253187,133275171,133289116,133306061,133317390,133356573,133375002,133475473,133480138,133630141,133817544,133895735,133917738,133938069,133997633,134005090,134007835,134065761,",0,EXOC4,cmpl,cmpl,"0,2,0,0,2,1,2,0,2,1,2,0,2,2,1,2,1,2," +3707,3707,26,NM_014141.6,chr7,+,146116800,148420998,146116876,148415616,24,"146116800,146774270,146839710,147043906,147108146,147120978,147128692,147132244,147300140,147395608,147485934,147562137,147639105,147903564,147977861,148118117,148147490,148172241,148217287,148229645,148267032,148383648,148409390,148415416,","146116973,146774381,146839904,147044054,147108350,147121163,147128836,147132509,147300290,147395780,147486041,147562257,147639306,147903721,147977989,148118288,148147709,148172478,148217524,148229779,148267126,148383888,148409471,148420998,",0,CNTNAP2,cmpl,cmpl,"0,1,1,0,1,1,0,0,1,1,2,1,1,1,2,1,1,1,1,1,0,1,1,1," 
+5068,5068,27,NM_001364498.2,chr7,+,153748132,154894285,153748948,154892480,27,"153748132,153748754,154446213,154474938,154540531,154566841,154637820,154669359,154727766,154769416,154772844,154794078,154795844,154801354,154803863,154804916,154806993,154853779,154867994,154872623,154875905,154880887,154885632,154887675,154889271,154889456,154892333,","153748319,153749008,154446328,154475037,154540626,154566916,154637873,154669441,154727887,154769571,154772942,154794202,154795883,154801462,154803955,154804964,154807112,154853827,154868093,154872693,154876100,154880942,154885744,154887734,154889344,154889530,154894285,",0,DPP6,cmpl,cmpl,"-1,0,0,1,1,0,0,2,0,1,0,2,0,0,0,2,2,1,1,1,2,2,0,1,0,1,0," +16477,16477,27,NM_002847.5,chr7,-,157539055,158587823,157540713,158587669,23,"157539055,157548945,157568901,157571439,157576612,157578020,157595237,157604001,157621361,157656356,157682724,157898672,158081297,158110828,158133676,158136654,158138293,158166930,158192326,158205170,158316818,158489734,158587557,","157540785,157549019,157568966,157571493,157576779,157578140,157595315,157604075,157621509,157656551,157682937,157898737,158081377,158110915,158134059,158136695,158138515,158167291,158192495,158205273,158316932,158489785,158587823,",0,PTPRN2,cmpl,cmpl,"0,1,2,2,0,0,0,1,0,0,0,1,2,2,0,1,1,0,2,1,1,1,0," +4866,4866,9,NM_001346810.2,chr8,+,737627,1708476,737807,1701406,15,"737627,907911,1258850,1501365,1548625,1565682,1626739,1632826,1668328,1669742,1676532,1678213,1691534,1697146,1701187,","737825,907966,1258883,1501431,1549683,1565894,1626887,1633046,1668678,1669784,1676618,1678629,1691626,1697299,1708476,",0,DLGAP2,cmpl,cmpl,"0,0,1,1,1,0,2,0,1,0,0,2,1,0,0," 
+4078,4078,9,NM_033225.6,chr8,-,2935360,4994914,2938584,4994416,70,"2935360,2942471,2949298,2950230,2951113,2954223,2955588,2957695,2961140,2962465,2963221,2965774,2966569,2973116,2974450,2978611,2998010,2999957,3018476,3029318,3052461,3087096,3091515,3096848,3106527,3107717,3108602,3110157,3118398,3142464,3151396,3157896,3162158,3181109,3187868,3188886,3189911,3199713,3201611,3205503,3214496,3219254,3223728,3230039,3284143,3307694,3308311,3343293,3347991,3359151,3367031,3369253,3387493,3396193,3399390,3406026,3407898,3409422,3468711,3493622,3574944,3586135,3616709,3708413,3753929,3997902,4031904,4419952,4637341,4994331,","2938744,2942604,2949386,2950343,2951275,2954268,2955768,2957807,2961214,2962639,2963395,2965954,2966746,2973299,2974624,2978800,2998184,3000131,3018650,3029513,3052647,3087285,3091662,3097037,3106641,3107798,3108748,3110335,3118587,3142674,3151513,3157966,3162277,3181214,3187965,3189011,3190115,3199809,3201725,3205620,3214691,3219442,3223867,3230231,3284346,3307821,3308503,3343450,3348161,3359340,3367247,3369370,3387682,3396381,3399529,3406221,3408225,3409605,3468824,3493726,3575066,3586260,3616797,3708491,3754042,3998110,4032099,4420065,4637558,4994914,",0,CSMD1,cmpl,cmpl,"2,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,0,1,1,0,1,1,1,1,1,1,2,1,1,2,1,1,0,1,1,1,1,1,2,1,1,1,1,2,0,1,2,1,1,2,1,1,2,1,0," +18146,18146,10,NM_139167.4,chr8,-,14084844,15238431,14090442,15237623,8,"14084844,14102375,14108162,14164579,14237591,14324102,14554731,15237584,","14090637,14102499,14108235,14164702,14237679,14324204,14554926,15238431,",0,SGCZ,cmpl,cmpl,"0,2,1,1,0,0,0,0," 
+16276,16276,11,NM_001412866.1,chr8,-,18527302,19084805,18535742,19084529,17,"18527302,18556208,18572527,18575127,18600363,18632612,18655641,18735234,18765448,18799294,18801269,18804521,18804703,18867673,18871625,18936033,19084205,","18535958,18556352,18572672,18575285,18600434,18632806,18655685,18735315,18765538,18799353,18801382,18804602,18804898,18868069,18872733,18936142,19084805,",0,PSD3,cmpl,cmpl,"0,0,2,0,1,2,0,0,0,1,2,2,2,2,1,0,0," +13576,13576,12,NM_001322202.2,chr8,+,31639244,32774046,32754445,32764402,9,"31639244,32595827,32605561,32727948,32754371,32756402,32759305,32760199,32763747,","31639431,32596005,32605683,32728078,32754474,32756529,32759436,32760406,32774046,",0,NRG1,cmpl,cmpl,"-1,-1,-1,-1,0,2,0,2,2," +21803,21803,13,XM_047421381.1,chr8,+,35235474,35796540,35235784,35790563,15,"35235474,35549291,35568097,35595553,35683546,35686544,35722209,35726151,35731011,35748526,35750581,35759319,35766901,35774298,35790358,","35235887,35549510,35568241,35595657,35683727,35686709,35722356,35726529,35731096,35748695,35750809,35759469,35767066,35774477,35796540,",0,UNC5D,cmpl,cmpl,"0,1,1,1,0,1,1,1,1,2,0,0,0,0,2," +18992,18992,1,NM_001287813.3,chr8,+,49909795,50796692,50394238,50792829,20,"49909795,49910891,50172560,50394211,50402209,50438542,50449667,50450555,50450687,50502777,50530176,50536677,50553049,50590878,50656908,50658591,50704599,50708885,50752000,50792670,","49909904,49911044,50172635,50394265,50402344,50438599,50449725,50450599,50450729,50502880,50530259,50536808,50553179,50590917,50657025,50658663,50704752,50708978,50752111,50796692,",0,SNTG1,cmpl,cmpl,"-1,-1,-1,0,0,0,0,1,0,0,1,0,2,0,0,0,0,0,0,0," +3606,3606,131,XM_024447079.2,chr8,+,60964847,61501629,61299827,61499542,9,"60964847,60966169,61131769,61292317,61299676,61376604,61454140,61458306,61499454,","60965361,60966237,61131860,61292454,61300282,61376779,61454251,61458542,61501629,",0,CLVS1,cmpl,cmpl,"-1,-1,-1,-1,0,2,0,0,2," 
+13269,13269,16,XM_017013359.2,chr8,+,62248853,62999652,62249073,62999334,6,"62248853,62579538,62589713,62746931,62918452,62999230,","62249127,62579676,62589794,62747129,62918513,62999652,",0,NKAIN3,cmpl,cmpl,"0,0,0,0,0,1," +16716,16716,153,XM_024447066.2,chr8,+,84182786,84921844,84529321,84920911,8,"84182786,84529298,84774578,84849979,84862295,84873283,84887603,84920893,","84182944,84529577,84774654,84850027,84862453,84873397,84887776,84921844,",0,RALYL,cmpl,cmpl,"-1,0,1,2,2,1,1,0," +3636,3636,19,XM_017013149.2,chr8,+,86866414,87428652,86866495,87428637,11,"86866438,86887541,86905080,86939595,87205992,87236918,87284677,87286538,87351684,87353635,87428545,","86866583,86887611,86905194,86939754,87206138,87237112,87284815,87286671,87351794,87353786,87428652,",0,CNBD1,cmpl,cmpl,"0,1,2,2,2,1,0,0,1,0,1," +19557,19557,20,XM_017013756.2,chr8,-,98387878,98942610,98434253,98942470,13,"98429935,98437102,98526741,98547968,98579663,98596031,98706466,98707146,98749275,98767242,98774738,98883646,98942377,","98434318,98437208,98526917,98548161,98579789,98596169,98706634,98707311,98749390,98767371,98774819,98883834,98942610,",0,STK3,cmpl,cmpl,"1,0,1,0,0,0,0,0,2,2,2,0,0," 
+22072,22072,20,NM_017890.5,chr8,+,99013273,99877580,99013788,99875666,62,"99013273,99013759,99038422,99096311,99102952,99111097,99115699,99121176,99134631,99135014,99135595,99136664,99142973,99147840,99156548,99170038,99192875,99274197,99275080,99384207,99391556,99431536,99442400,99467413,99481598,99501686,99502835,99507769,99511103,99520898,99556449,99575657,99577489,99641810,99661353,99699524,99717170,99720344,99720862,99766773,99776774,99778681,99784314,99809374,99817539,99818450,99818712,99819411,99819920,99821293,99823831,99832368,99835196,99835538,99848775,99853450,99859303,99861775,99868288,99870784,99871447,99875417,","99013347,99013935,99038566,99096432,99103120,99111279,99115874,99121445,99134727,99135137,99135733,99136752,99143165,99148010,99156743,99170163,99193057,99274332,99275254,99384317,99391704,99431664,99442635,99467634,99481802,99501858,99502950,99507911,99511512,99521010,99556653,99575784,99577633,99642498,99661491,99699932,99717373,99720552,99721047,99766970,99776956,99779031,99784476,99809530,99817803,99818534,99818888,99819582,99820122,99821482,99823978,99832652,99835324,99835738,99848894,99854256,99859480,99861946,99868465,99870887,99871697,99877580,",0,VPS13B,cmpl,cmpl,"-1,0,0,0,1,1,0,1,0,0,0,0,1,1,0,0,2,1,1,1,0,1,0,1,0,0,1,2,0,1,2,2,0,0,1,1,1,0,1,0,2,1,0,0,0,0,0,2,2,0,0,0,2,1,0,2,1,1,1,1,2,0," +17130,17130,21,NM_001348497.2,chr8,+,103500609,104256094,103500886,104251820,23,"103500609,103697085,103766226,103885297,103910320,103912052,103915494,103916413,103918440,103921671,103931262,103936550,103942772,103961064,103975349,103989304,104014505,104015423,104244915,104248700,104249486,104251023,104251601,","103501062,103697296,103766537,103886223,103910529,103912172,103915594,103916537,103918487,103921784,103931393,103936722,103942926,103961133,103975506,103989421,104014615,104015456,104245057,104248813,104249588,104251163,104256094,",0,RIMS2,cmpl,cmpl,"0,2,0,2,1,0,0,1,2,1,0,2,0,1,1,2,2,1,1,2,1,1,0," 
+4080,4080,22,NM_198123.2,chr8,-,112222927,113436939,112224770,113436854,71,"112222927,112228755,112231544,112234364,112237189,112241719,112244393,112247019,112254252,112255253,112263638,112265410,112281173,112287063,112289364,112291509,112292536,112295832,112301792,112304720,112306006,112310977,112313905,112314428,112318836,112319900,112335328,112336651,112337542,112341476,112346096,112351174,112352415,112380351,112383566,112390663,112406523,112408317,112408918,112472590,112492488,112503789,112506690,112517033,112550670,112552593,112556762,112573500,112587065,112636816,112638695,112645108,112650160,112656153,112666276,112682441,112685405,112689867,112800161,112829685,112859144,112921626,112947789,112954683,112975836,113019066,113098755,113173721,113278591,113314570,113436676,","112224930,112228891,112231632,112234477,112237348,112241785,112244573,112247131,112254326,112255427,112263812,112265590,112281350,112287246,112289538,112291695,112292710,112296006,112301966,112304915,112306192,112311166,112314052,112314617,112318950,112319981,112335474,112336829,112337731,112341686,112346213,112351244,112352534,112380456,112383663,112390788,112406727,112408413,112409032,112472707,112492683,112503977,112506829,112517225,112550873,112552720,112556954,112573657,112587235,112637005,112638911,112645225,112650349,112656341,112666415,112682636,112685732,112690050,112800274,112829789,112859266,112921751,112947877,112954761,112976148,113019179,113098963,113173916,113278704,113314793,113436939,",0,CSMD3,cmpl,cmpl,"2,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,0,1,1,0,1,1,1,1,1,1,2,1,1,2,1,1,0,1,1,1,1,1,2,1,1,1,1,2,0,1,2,1,1,1,2,1,1,2,1,0," 
+21106,21106,206,NM_031466.8,chr8,-,139727724,140458579,139731060,140451373,23,"139727724,139731978,139885878,139910146,139988725,140023936,140221458,140252776,140275657,140283888,140287607,140290992,140300468,140311247,140360049,140370963,140397619,140405576,140426614,140435111,140439051,140450789,140458280,","139731228,139732202,139885969,139910300,139988836,140024079,140221583,140252929,140275821,140284021,140287734,140291078,140300614,140311374,140360193,140371180,140397745,140405698,140426641,140435240,140439197,140451383,140458579,",0,TRAPPC9,cmpl,cmpl,"0,1,0,2,2,0,1,1,2,1,0,1,2,1,1,0,0,1,1,1,2,0,-1," +7037,7037,9,XM_047422890.1,chr9,-,3824126,4490465,3828271,4348261,12,"3824126,3829309,3856008,3879426,3898690,3932359,3937027,4117767,4125733,4347080,4348231,4489627,","3828408,3829492,3856184,3879595,3898835,3932470,3937189,4118881,4125941,4347183,4348412,4490465,",0,GLIS3,cmpl,cmpl,"1,1,2,1,0,0,0,2,1,0,0,-1," +16467,16467,1,NM_002839.4,chr9,-,8314245,10613002,8317873,8733843,46,"8314245,8319830,8331581,8338921,8340342,8341089,8341692,8375935,8376606,8389231,8404536,8436591,8449724,8460410,8465465,8470994,8484118,8485226,8485761,8492861,8497241,8499646,8500753,8504260,8507300,8517847,8521276,8523512,8524924,8526626,8527344,8528590,8633316,8636698,8733779,9018696,9183303,9397448,9574731,9734532,9766809,9938506,10033717,10340962,10612397,10612687,","8317942,8319966,8331736,8339047,8340469,8341268,8341978,8376090,8376726,8389407,8404660,8436689,8449837,8460571,8465675,8471085,8484378,8485324,8486349,8492979,8497268,8499840,8501059,8504405,8507434,8518429,8521546,8523524,8525035,8526644,8527353,8528779,8633458,8636844,8733946,9018735,9183363,9397482,9574781,9734571,9766851,9938610,10033790,10341017,10612505,10613002,",0,PTPRD,cmpl,cmpl,"0,2,0,0,2,0,2,0,0,1,0,1,2,0,0,2,0,1,1,0,0,1,1,0,1,1,1,1,1,1,1,1,0,1,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1," 
+326,326,11,XM_017015312.3,chr9,+,17906632,18910950,17906826,18908548,29,"17906632,18163861,18504828,18533246,18574029,18622242,18635942,18639253,18657638,18661934,18680311,18681811,18684715,18706746,18721535,18753297,18770601,18775742,18776780,18795396,18817108,18826283,18829842,18887830,18889567,18892388,18905781,18906691,18908441,","17906922,18163981,18504956,18533292,18574266,18622369,18636017,18639411,18657750,18662073,18680516,18681959,18684800,18707048,18721665,18753508,18770781,18775896,18777906,18795524,18817237,18826463,18829977,18888043,18889748,18892596,18905891,18906912,18910950,",0,ADAMTSL1,cmpl,cmpl,"0,0,0,2,0,0,1,1,0,1,2,0,1,2,1,2,0,0,1,2,1,1,1,1,1,2,0,2,1," +18445,18445,11,XM_017014592.2,chr9,-,19507454,20307892,19516152,19786866,15,"19507454,19520893,19528048,19550136,19573350,19576923,19597228,19619583,19622260,19785936,19849648,19891130,20004249,20301225,20307652,","19516402,19521060,19528138,19550268,19573469,19577022,19597279,19619692,19622299,19787019,19849787,19891228,20004387,20301364,20307892,",0,SLC24A2,cmpl,cmpl,"2,0,0,0,1,1,1,0,0,0,-1,-1,-1,-1,-1," +10335,10335,12,XM_047422812.1,chr9,-,27937616,29213601,27948850,27950671,8,"27937616,28295207,28372835,28475939,28670199,28913791,28947817,29212921,","27950706,28295366,28372868,28476025,28670280,28913874,28947914,29213601,",0,LINGO2,cmpl,cmpl,"0,-1,-1,-1,-1,-1,-1,-1," 
+21314,21314,17,NM_001366142.2,chr9,-,70529059,71446971,70535428,71446835,27,"70529059,70537121,70549541,70552843,70553159,70591030,70598418,70603341,70610608,70615907,70618866,70620075,70621243,70625190,70625481,70635210,70639059,70640559,70681505,70761600,70784104,70827846,70843002,70846377,70862907,70864431,71446652,","70535451,70537405,70549674,70553043,70553310,70591205,70598670,70603470,70610749,70616075,70619095,70620365,70621273,70625331,70625517,70635261,70639194,70640660,70681578,70761724,70784279,70828018,70843127,70846591,70863112,70864511,71446971,",0,TRPM3,cmpl,cmpl,"1,2,1,2,1,0,0,0,0,0,2,0,0,0,0,0,0,1,0,2,1,0,1,0,2,0,0," +14665,14665,22,XM_047423419.1,chr9,+,109640786,110172512,109780488,110168421,8,"109780197,109867490,109880550,109923734,109925060,110015953,110136126,110168398,","109780533,109867571,109880681,109923849,109925082,110016039,110138539,110172512,",0,PALM2AKAP2,cmpl,cmpl,"0,0,0,2,0,1,0,1," +1289,1289,184,NM_001365068.1,chr9,-,116423111,117415057,116425850,117414938,23,"116423111,116440608,116442452,116487358,116618323,116620309,116651527,116725770,116728991,116733398,116805631,116820616,116863582,116975207,116976113,116976700,117008091,117039818,117096043,117141325,117214357,117291325,117414496,","116426088,116440792,116442553,116487500,116618472,116620443,116651793,116725950,116729096,116733523,116805820,116820783,116863733,116975345,116976188,116976785,117008259,117039965,117096151,117141478,117214742,117291513,117415057,",0,ASTN2,cmpl,cmpl,"2,1,2,1,2,0,1,1,1,2,2,0,2,2,2,1,1,1,1,1,0,1,0," 
+4709,4709,23,NM_001352964.2,chr9,-,123379657,123930126,123381431,123929905,24,"123379657,123383654,123387729,123403401,123411775,123440359,123450692,123452275,123454738,123457347,123457792,123557569,123583168,123609435,123630375,123652012,123667025,123671290,123676719,123757702,123769513,123792586,123878950,123929888,","123382625,123383913,123387858,123403490,123411829,123440491,123450749,123452347,123454779,123457435,123457897,123557695,123583270,123609481,123630476,123652123,123667079,123671371,123676789,123757822,123769563,123792630,123879021,123930126,",0,DENND1A,cmpl,cmpl,"0,2,2,0,0,0,0,0,1,0,0,0,0,2,0,0,0,0,2,2,0,1,2,0," +1070,1070,10,NM_013427.3,chrX,-,11137543,11665920,11138862,11664828,13,"11137543,11142232,11143979,11156528,11169504,11178099,11179301,11182062,11186235,11188727,11196924,11254547,11664240,","11139530,11142313,11144248,11156626,11169684,11178248,11179452,11182118,11186431,11188984,11196996,11254707,11665920,",0,ARHGAP6,cmpl,cmpl,"1,1,2,0,0,1,0,1,0,1,1,0,0," +6580,6580,10,NM_001368395.3,chrX,+,11822438,12724523,11865188,12721858,19,"11822438,11865085,11877894,12498679,12609720,12614778,12674862,12683482,12686096,12690194,12694334,12701873,12704358,12706825,12707468,12710398,12716068,12717500,12720533,","11822715,11865216,11878018,12498796,12609881,12614881,12674908,12683587,12686204,12690326,12694454,12702010,12704485,12706915,12707651,12710537,12717133,12718790,12724523,",0,FRMPD4,cmpl,cmpl,"-1,0,1,2,2,1,2,0,0,0,0,0,2,0,0,0,1,1,1," +8637,8637,12,NM_014271.4,chrX,+,28587445,29956718,28789343,29955820,11,"28587445,28789319,29282937,29396257,29399154,29668429,29917463,29919948,29941650,29954521,29955101,","28588047,28789425,29283217,29396444,29399308,29668504,29917596,29920094,29941794,29954692,29956718,",0,IL1RAPL1,cmpl,cmpl,"-1,0,1,2,0,1,1,2,1,1,1," 
+4894,4894,12,NM_000109.4,chrX,-,31119221,33339388,31121918,33339265,79,"31119221,31126641,31134101,31146290,31147274,31169442,31172347,31173538,31177931,31178668,31180369,31182737,31203960,31206581,31209497,31223046,31260954,31323597,31348555,31444480,31478105,31478982,31496787,31507280,31627672,31657989,31679374,31729630,31773959,31819974,31836717,31875187,31929595,31932079,31968338,32216915,32287528,32310081,32342099,32343133,32345942,32348405,32362787,32364581,32365019,32380509,32386309,32389500,32390070,32411751,32438240,32441179,32448455,32454661,32463438,32464585,32468497,32472163,32484918,32491276,32501754,32518007,32545158,32565701,32573529,32573744,32595756,32614302,32644131,32644963,32697869,32699111,32809492,32816467,32823294,32844782,32849727,33020138,33339258,","31121930,31126673,31134194,31146414,31147518,31169601,31172413,31173604,31177970,31178805,31180481,31182904,31204118,31206667,31209699,31223121,31261016,31323658,31348634,31444627,31478374,31479103,31496944,31507453,31627862,31658144,31679586,31729748,31774192,31820083,31836819,31875373,31929745,31932227,31968514,32217063,32287701,32310276,32342282,32343286,32346080,32348528,32362958,32364710,32365199,32380680,32386465,32389674,32390181,32411913,32438390,32441314,32448638,32454832,32463594,32464699,32468710,32472309,32485099,32491518,32501842,32518131,32545334,32565881,32573637,32573846,32595876,32614453,32644313,32645152,32697998,32699293,32809611,32816640,32823387,32844860,32849820,33020200,33339388,",0,DMD,cmpl,cmpl,"0,1,1,0,2,2,2,2,2,0,2,0,1,2,1,1,2,1,0,0,1,0,2,0,2,0,1,0,1,0,0,0,0,2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,2,0,0,0,0,0,2,0,0,0,1,2,0,0,0,0,1,0," +4421,4421,155,XM_017029255.2,chrX,+,86148450,86832602,86148620,86832155,11,"86148450,86514278,86651035,86695020,86714547,86739746,86812855,86813129,86814687,86816033,86832105,","86149108,86514391,86651167,86695179,86714720,86739882,86813004,86813277,86814834,86816099,86832602,",0,DACH2,cmpl,cmpl,"0,2,1,1,1,0,1,0,1,1,1," 
+14785,14785,2,XM_011530914.3,chrX,+,91779374,92623230,91835504,92618940,8,"91779374,91835460,91876780,92201374,92263113,92387734,92468298,92618263,","91779684,91836044,91879273,92201455,92263143,92387933,92468322,92623230,",0,PCDH11X,cmpl,cmpl,"-1,0,0,0,0,0,1,1," +4821,4821,20,NM_006729.5,chrX,+,96684841,97604997,96685058,97599317,27,"96684841,96735757,96738585,96758153,96881578,96912327,96912482,96916437,96918508,96930732,96937232,96939265,96942017,96945526,96948934,96957827,96965092,97072940,97075166,97099693,97114725,97141664,97247714,97348115,97383908,97429649,97599252,","96685190,96735790,96738762,96758258,96881718,96912402,96912552,96916574,96918617,96930843,96937351,96939382,96942136,96945591,96949039,96958148,96965207,97073042,97075261,97099795,97114965,97141794,97247839,97348280,97384044,97429745,97604997,",0,DIAPH2,cmpl,cmpl,"0,0,0,0,0,2,2,0,2,0,0,2,2,1,0,0,0,1,1,0,0,0,1,0,0,1,1," +8638,8638,21,NM_017416.2,chrX,+,104566198,105767829,104658913,105767661,11,"104566198,104658894,105195474,105233817,105267387,105484312,105717366,105740545,105748959,105755176,105766963,","104567051,104658995,105195748,105234004,105267541,105484387,105717496,105740691,105749103,105755347,105767829,",0,IL1RAPL2,cmpl,cmpl,"-1,0,1,2,0,1,1,2,1,1,1," 
+20211,20211,23,XM_017029213.3,chrX,-,124375902,125204312,124380535,124963753,35,"124375902,124382669,124383633,124385676,124392051,124405030,124406316,124420310,124422271,124453336,124470724,124481731,124487208,124497015,124503559,124520516,124523363,124529863,124546873,124561670,124563748,124565374,124641790,124645142,124646708,124651913,124653583,124671682,124705012,124736956,124894295,124895980,124963536,125185642,125203799,","124381294,124382812,124384854,124386064,124392348,124405266,124406489,124420821,124422638,124453491,124470823,124481964,124487229,124497265,124503703,124520784,124523625,124529983,124547090,124561817,124563772,124565560,124641991,124645337,124646810,124652124,124653783,124671835,124705251,124737197,124894352,124896241,124963817,125185709,125204312,",0,TENM1,cmpl,cmpl,"0,1,1,0,0,1,2,1,0,1,1,2,2,1,1,0,2,2,1,1,1,1,1,1,1,0,1,1,2,1,1,1,0,-1,-1," +6323,6323,205,NM_001139501.2,chrX,-,138614726,139205023,138632849,138857641,8,"138614726,138635456,138702983,138708817,138857511,138990507,139204058,139204920,","138632986,138635655,138703087,138708928,138857679,138990617,139204271,139205023,",0,FGF13,cmpl,cmpl,"1,0,1,1,0,-1,-1,-1," +14786,14786,9,XM_017030080.2,chrY,+,5000295,5742228,5032696,5737942,10,"5000295,5030898,5031214,5031905,5032669,5056888,5098214,5501056,5581774,5737271,","5000605,5031067,5031346,5032011,5032728,5057459,5100707,5501255,5581798,5742228,",0,PCDH11Y,cmpl,cmpl,"-1,-1,-1,-1,0,2,0,0,1,1," diff --git a/nextflow.config b/nextflow.config new file mode 100644 index 0000000..79c7888 --- /dev/null +++ b/nextflow.config @@ -0,0 +1,98 @@ +manifest { + name = 'digital patients' + author = 'omic' + recurseSubmodules = true + homePage = 'https://gitlab.com/omic/next/registry/pipelines/digitalpatients' + description = 'generative digital patients and multi-omics pipeline' + mainScript = 'main.nf' + nextflowVersion = '!>=21.04.3' + defaultBranch = 'master' +} + +// docker { +// enabled = true +// temp = 'auto' +// } + +// process { +// 
withLabel: 'gpu_process' { +// containerOptions = '--gpus all --rm' +// } +// +// withLabel: 'cpu_process' { +// containerOptions = '--rm' +// } +// } + +def sharedPod = [ + [env: 'NXF_DEBUG', value: '0'], + [label: 'omic-app', value: 'digitalpatients'], + [imagePullSecret: 'gitlab-registry-secret'], + [volumeClaim: 'avatar-new', mountPath: '/mnt/Avatar/'], +] + +profiles { + docker { + docker.enabled = true + + process { + executor = 'local' + + withLabel: 'gpu_process' { + maxForks = 1 // Only one GPU task at a time on single GPU system + containerOptions = '--gpus all --rm' + } + + withLabel: 'cpu_process' { + containerOptions = '--rm' + } + } + } + + k8s { + process { + + executor = 'k8s' + namespace = 'bioinformatics' + debug = true + + pod = sharedPod + + withLabel: 'gpu_process' { + + pod = sharedPod + [ + [nodeSelector: [gpu: 'yes', 'gpu-type': 'geforce-rtx-3090']] + ] + } + } + + workDir = "/mnt/dreamdock-data/digital-patient-data/work" + + k8s { + serviceAccount = 'nextflow-sa' + namespace = 'bioinformatics' + storageClaimName = 'dreamdock-data' + storageMountPath = '/mnt/dreamdock-data' + + pullPolicy = 'IfNotPresent' + cleanup = true // automatically delete successfully completed pods + // cleanup = false // keep completed pods, e.g. for debugging 
+ + + // RUN AS DIFFERENT USERS + // securityContext = [fsGroup: 1000] + // securityContext = [ + // runAsUser: 1000, + // fsGroup: 1000, + // runAsNonRoot: true + // ] + } + + + + // Use container image + // process.container = 'harbor.cluster.omic.ai/omic/faiss-indexer:latest' + + } +} + diff --git a/params.json b/params.json new file mode 100644 index 0000000..152470f --- /dev/null +++ b/params.json @@ -0,0 +1,325 @@ +{ + "params": { + "// SYNTHEA PARAMETERS": {}, + "n_pat": { + "type": "integer", + "description": "Number of patients to generate (must be >= 2)", + "default": 2, + "required": true, + "pipeline_io": "parameter", + "var_name": "params.n_pat", + "examples": [2, 10, 100], + "pattern": "^[0-9]+$", + "validation": { + "min": 2 + }, + "notes": "Minimum of 2 patients required (one male, one female)" + }, + "percent_male": { + "type": "number", + "description": "Fraction of male patients (between 0 and 1)", + "default": 0.5, + "required": true, + "pipeline_io": "parameter", + "var_name": "params.percent_male", + "examples": [0.0, 0.5, 1.0], + "pattern": "^0(\\.\\d+)?|1(\\.0)?$", + "validation": { + "min": 0.0, + "max": 1.0 + }, + "notes": "Value must be between 0 and 1 inclusive" + }, + "imputed_store": { + "type": "folder", + "description": "Path to imputed UKBB data", + "default": "/mnt/Avatar/imputed/ukbb/imputed", + "required": true, + "pipeline_io": "input", + "var_name": "params.imputed_store", + "examples": [ + "/mnt/Avatar/imputed/ukbb/imputed", + "/rosalind/ukbb/imputed" + ], + "pattern": ".*", + "validation": {}, + "notes": "Directory containing imputed UKBB data" + }, + "pheno_store": { + "type": "file", + "description": "Path to phenotype data file", + "default": "/mnt/Avatar/dd/synthea/metadata/ukbb_phenotypes_filtered.csv", + "required": true, + "pipeline_io": "input", + "var_name": "params.pheno_store", + "examples": [ + "/mnt/Avatar/dd/synthea/metadata/ukbb_phenotypes_filtered.csv" + ], + "pattern": ".*\\.csv$", + "validation": {}, + 
"notes": "CSV file containing filtered UKBB phenotype data" + }, + "disease": { + "type": "string", + "description": "Disease or condition to simulate", + "default": "schizophrenia", + "required": true, + "pipeline_io": "parameter", + "var_name": "params.disease", + "examples": [ + "schizophrenia", + "healthy", + "leukaemia", + "Purpura and other haemorrhagic conditions" + ], + "pattern": ".*", + "enum": [ + "schizophrenia", + "healthy", + "leukaemia", + "Purpura and other haemorrhagic conditions" + ], + "validation": {}, + "notes": "Use 'healthy' for healthy individuals or specify a disease condition" + }, + "healthy_dir": { + "type": "folder", + "description": "Path to healthy patient data directory", + "default": "/Workspace/next/registry/pipelines/digital_patient", + "required": true, + "pipeline_io": "input", + "var_name": "params.healthy_dir", + "examples": [ + "/Workspace/next/registry/pipelines/digital_patient", + "/mnt/Avatar/digital_patient" + ], + "pattern": ".*", + "validation": {}, + "notes": "Directory containing data for healthy patient generation" + }, + "outdir": { + "type": "folder", + "description": "Output directory for results", + "default": "/mnt/omic-next-apis/wes/digital_patients", + "required": true, + "pipeline_io": "output", + "var_name": "params.outdir", + "examples": [ + "/mnt/OmicNAS/dd/digital_patient", + "/path/to/custom/output" + ], + "pattern": ".*", + "validation": {}, + "notes": "Directory where all pipeline results will be stored" + }, + "// BORZOI PARAMETERS": {}, + "container_borzoi": { + "type": "string", + "description": "Borzoi container image", + "default": "borzoi:latest", + "required": true, + "pipeline_io": "parameter", + "var_name": "params.container_borzoi", + "examples": [ + "borzoi:latest", + "borzoi:v1.0" + ], + "pattern": ".*", + "validation": {}, + "notes": "Docker container image for Borzoi module" + }, + "containerOptions": { + "type": "string", + "description": "Container runtime options", + "default": "--gpus 
all --rm -v /mnt:/mnt", + "required": false, + "pipeline_io": "parameter", + "var_name": "params.containerOptions", + "examples": [ + "--gpus all --rm -v /mnt:/mnt", + "--rm -v /data:/data" + ], + "pattern": ".*", + "validation": {}, + "notes": "Docker container runtime options for GPU usage and volume mounts" + }, + "project_name": { + "type": "string", + "description": "Project identifier", + "default": "test", + "required": true, + "pipeline_io": "parameter", + "var_name": "params.project_name", + "examples": [ + "test", + "production", + "schizophrenia_study" + ], + "pattern": ".*", + "validation": {}, + "notes": "Identifier for the digital patient project" + }, + "mane": { + "type": "file", + "description": "Path to MANE reference file", + "default": "/Workspace/next/registry/pipelines/digital_patient/MANE.GRCh38.v1.3.update.tsv", + "required": true, + "pipeline_io": "input", + "var_name": "params.mane", + "examples": [ + "/Workspace/next/registry/pipelines/digital_patient/MANE.GRCh38.v1.3.update.tsv" + ], + "pattern": ".*\\.tsv$", + "validation": {}, + "notes": "MANE transcripts reference file in TSV format" + }, + "// VCF2PROT PARAMETERS": {}, + "container_vcf2prot": { + "type": "string", + "description": "VCF2PROT container image", + "default": "vcf2prot:latest", + "required": true, + "pipeline_io": "parameter", + "var_name": "params.container_vcf2prot", + "examples": [ + "vcf2prot:latest", + "vcf2prot:v1.0" + ], + "pattern": ".*", + "validation": {}, + "notes": "Docker container image for VCF2PROT module" + }, + "// RNA2PROTEINEXPRESSION PARAMETERS": {}, + "container_rna2protexpression": { + "type": "string", + "description": "RNA2PROTEINEXPRESSION container image", + "default": "rna2protexpression:latest", + "required": true, + "pipeline_io": "parameter", + "var_name": "params.container_rna2protexpression", + "examples": [ + "rna2protexpression:latest", + "rna2protexpression:v1.0" + ], + "pattern": ".*", + "validation": {}, + "notes": "Docker container 
image for RNA2PROTEINEXPRESSION module" + }, + "containerOptions_rna2protexpression": { + "type": "string", + "description": "Container options for RNA2PROTEINEXPRESSION", + "default": "--gpus all --rm -v /mnt:/mnt -v /dbs:/dbs", + "required": false, + "pipeline_io": "parameter", + "var_name": "params.containerOptions_rna2protexpression", + "examples": [ + "--gpus all --rm -v /mnt:/mnt -v /dbs:/dbs" + ], + "pattern": ".*", + "validation": {}, + "notes": "Docker container runtime options for RNA2PROTEINEXPRESSION with GPU and database volume mounts" + }, + "// CORTO PARAMETERS": {}, + "container_corto": { + "type": "string", + "description": "CORTO container image", + "default": "corto:latest", + "required": true, + "pipeline_io": "parameter", + "var_name": "params.container_corto", + "examples": [ + "corto:latest", + "corto:v1.0" + ], + "pattern": ".*", + "validation": {}, + "notes": "Docker container image for CORTO module" + }, + "regulon": { + "type": "file", + "description": "Path to regulon RDA file", + "default": "/Workspace/next/registry/pipelines/digital_patient/regulon.rda", + "required": true, + "pipeline_io": "input", + "var_name": "params.regulon", + "examples": [ + "/Workspace/next/registry/pipelines/digital_patient/regulon.rda" + ], + "pattern": ".*\\.rda$", + "validation": {}, + "notes": "Regulon data file in RDA format for CORTO module" + }, + "// CIBERSORT PARAMETERS": {}, + "cibersortx_username": { + "type": "string", + "description": "CIBERSORTx username for authentication", + "default": "gabriel.richman.2009@anderson.ucla.edu", + "required": true, + "pipeline_io": "parameter", + "var_name": "params.cibersortx_username", + "examples": [ + "gabriel.richman.2009@anderson.ucla.edu" + ], + "pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$", + "validation": {}, + "notes": "Username (email) for CIBERSORTx authentication" + }, + "cibersortx_token": { + "type": "string", + "description": "CIBERSORTx authentication token", + "default": 
"b5b39e563fb947df4cfd4843d40fdb99", + "required": true, + "pipeline_io": "parameter", + "var_name": "params.cibersortx_token", + "examples": [ + "b5b39e563fb947df4cfd4843d40fdb99" + ], + "pattern": "^[a-f0-9]{32}$", + "validation": {}, + "notes": "Authentication token for CIBERSORTx API access" + }, + "container_ecotyper": { + "type": "string", + "description": "ECOTyper container image", + "default": "ecotyper:latest", + "required": true, + "pipeline_io": "parameter", + "var_name": "params.container_ecotyper", + "examples": [ + "ecotyper:latest", + "ecotyper:v1.0" + ], + "pattern": ".*", + "validation": {}, + "notes": "Docker container image for ECOTyper module" + }, + "signature_matrix": { + "type": "file", + "description": "Path to signature matrix file", + "default": "/Workspace/next/registry/pipelines/digital_patient/LM22_sourceGEP_ensg.txt", + "required": true, + "pipeline_io": "input", + "var_name": "params.signature_matrix", + "examples": [ + "/Workspace/next/registry/pipelines/digital_patient/LM22_sourceGEP_ensg.txt" + ], + "pattern": ".*\\.txt$", + "validation": {}, + "notes": "Signature matrix file for CIBERSORTx analysis" + }, + "ecotyper_outdir": { + "type": "folder", + "description": "Output directory for ECOTyper results", + "default": "/mnt/omic-next-apis/wes/digital-patients/ecotyper", + "required": true, + "pipeline_io": "output", + "var_name": "params.ecotyper_outdir", + "examples": [ + "/mnt/OmicNAS/olamide/ecotyper/results/ecotyper" + ], + "pattern": ".*", + "validation": {}, + "notes": "Directory where ECOTyper results will be stored" + } + } +} diff --git a/prot_bigger.csv b/prot_bigger.csv new file mode 100644 index 0000000..cb9f351 --- /dev/null +++ b/prot_bigger.csv @@ -0,0 +1,229 @@ +,Unnamed: 0,#NCBI_GeneID,Ensembl_Gene,HGNC_ID,symbol,name,RefSeq_nuc,RefSeq_prot,Ensembl_nuc,Ensembl_prot,MANE_status,GRCh38_chr,chr_start,chr_end,chr_strand,chrom +8141,8141,GeneID:23261,ENSG00000171735.21,HGNC:18806,CAMTA1,calmodulin binding transcription 
activator 1,NM_015215.4,NP_056030.1,ENST00000303635.12,ENSP00000306522.6,MANE Select,NC_000001.11,6785454,7769706,+,chr1 +14385,14385,GeneID:114784,ENSG00000121904.19,HGNC:19290,CSMD2,CUB and Sushi multiple domains 2,NM_001281956.2,NP_001268885.1,ENST00000373381.9,ENSP00000362479.4,MANE Select,NC_000001.11,33513998,34165230,-,chr1 +13673,13673,GeneID:84871,ENSG00000186094.18,HGNC:25892,AGBL4,AGBL carboxypeptidase 4,NM_032785.4,NP_116174.3,ENST00000371839.6,ENSP00000360905.1,MANE Select,NC_000001.11,48532854,50023954,-,chr1 +7622,7622,GeneID:11124,ENSG00000185104.21,HGNC:3578,FAF1,Fas associated factor 1,NM_007051.3,NP_008982.1,ENST00000396153.7,ENSP00000379457.2,MANE Select,NC_000001.11,50437028,50960267,-,chr1 +3420,3420,GeneID:5142,ENSG00000184588.18,HGNC:8781,PDE4B,phosphodiesterase 4B,NM_002600.4,NP_002591.2,ENST00000341517.9,ENSP00000342637.4,MANE Select,NC_000001.11,65792895,66374574,+,chr1 +11534,11534,GeneID:57554,ENSG00000033122.21,HGNC:18531,LRRC7,leucine rich repeat containing 7,NM_001370785.2,NP_001357714.1,ENST00000651989.2,ENSP00000498937.2,MANE Select,NC_000001.11,69567922,70144364,+,chr1 +16804,16804,GeneID:257194,ENSG00000172260.15,HGNC:17302,NEGR1,neuronal growth regulator 1,NM_173808.3,NP_776169.2,ENST00000357731.10,ENSP00000350364.4,MANE Select,NC_000001.11,71395943,72282539,-,chr1 +16775,16775,GeneID:256435,ENSG00000184005.12,HGNC:19343,ST6GALNAC3,"ST6 N-acetylgalactosaminide alpha-2,6-sialyltransferase 3",NM_152996.4,NP_694541.2,ENST00000328299.4,ENSP00000329214.3,MANE Select,NC_000001.11,76074746,76634603,+,chr1 +1263,1263,GeneID:1806,ENSG00000188641.14,HGNC:3012,DPYD,dihydropyrimidine dehydrogenase,NM_000110.4,NP_000101.2,ENST00000370192.8,ENSP00000359211.3,MANE Select,NC_000001.11,97077743,97921034,-,chr1 +8692,8692,GeneID:26052,ENSG00000197959.15,HGNC:29125,DNM3,dynamin 3,NM_015569.5,NP_056384.2,ENST00000627582.3,ENSP00000486701.1,MANE Select,NC_000001.11,171841498,172412717,+,chr1 
+6668,6668,GeneID:9910,ENSG00000152061.24,HGNC:24663,RABGAP1L,RAB GTPase activating protein 1 like,NM_001366446.1,NP_001353375.1,ENST00000681986.1,ENSP00000507884.1,MANE Select,NC_000001.11,174159520,174995308,+,chr1 +5070,5070,GeneID:7399,ENSG00000042781.14,HGNC:12601,USH2A,usherin,NM_206933.4,NP_996816.3,ENST00000307340.8,ENSP00000305941.3,MANE Select,NC_000001.11,215622891,216423448,-,chr1 +4221,4221,GeneID:6262,ENSG00000198626.19,HGNC:10484,RYR2,ryanodine receptor 2,NM_001035.3,NP_001026.2,ENST00000366574.7,ENSP00000355533.2,MANE Select,NC_000001.11,237042184,237833988,+,chr1 +811,811,GeneID:1131,ENSG00000133019.12,HGNC:1952,CHRM3,cholinergic receptor muscarinic 3,NM_001375978.1,NP_001362907.1,ENST00000676153.1,ENSP00000502667.1,MANE Select,NC_000001.11,239386568,239915450,+,chr1 +4071,4071,GeneID:6000,ENSG00000182901.18,HGNC:10003,RGS7,regulator of G protein signaling 7,NM_001364886.1,NP_001351815.1,ENST00000440928.6,ENSP00000404399.2,MANE Select,NC_000001.11,240775514,241357205,-,chr1 +10459,10459,GeneID:55083,ENSG00000162849.16,HGNC:25484,KIF26B,kinesin family member 26B,NM_018012.4,NP_060482.2,ENST00000407071.7,ENSP00000385545.2,MANE Select,NC_000001.11,245154985,245709432,+,chr1 +12068,12068,GeneID:64754,ENSG00000185420.19,HGNC:15513,SMYD3,SET and MYND domain containing 3,NM_001167740.2,NP_001161212.1,ENST00000490107.6,ENSP00000419184.2,MANE Select,NC_000001.11,245749347,246507279,-,chr1 +67,67,GeneID:105,ENSG00000185736.16,HGNC:227,ADARB2,adenosine deaminase RNA specific B2 (inactive),NM_018702.4,NP_061172.1,ENST00000381312.6,ENSP00000370713.1,MANE Select,NC_000010.11,1177313,1737525,-,chr10 +10839,10839,GeneID:55691,ENSG00000151474.23,HGNC:25491,FRMD4A,FERM domain containing 4A,NM_018027.5,NP_060497.3,ENST00000357447.7,ENSP00000350032.2,MANE Select,NC_000010.11,13643706,14330924,-,chr10 +17391,17391,GeneID:340895,ENSG00000204740.11,HGNC:24331,MALRD1,MAM and LDL receptor class A domain containing 
1,NM_001142308.3,NP_001135780.2,ENST00000454679.7,ENSP00000412763.3,MANE Select,NC_000010.11,19048801,19734478,+,chr10 +11129,11129,GeneID:56288,ENSG00000148498.17,HGNC:16051,PARD3,par-3 family cell polarity regulator,NM_001184785.2,NP_001171714.1,ENST00000374788.8,ENSP00000363920.3,MANE Select,NC_000010.11,34109561,34815296,-,chr10 +3761,3761,GeneID:5592,ENSG00000185532.20,HGNC:9414,PRKG1,protein kinase cGMP-dependent 1,NM_006258.4,NP_006249.1,ENST00000373980.11,ENSP00000363092.5,MANE Select,NC_000010.11,51074487,52298350,+,chr10 +12195,12195,GeneID:65217,ENSG00000150275.20,HGNC:14674,PCDH15,protocadherin related 15,NM_001384140.1,NP_001371069.1,ENST00000644397.2,ENSP00000495195.1,MANE Select,NC_000010.11,53802771,54801231,-,chr10 +12194,12194,GeneID:65217,ENSG00000150275.20,HGNC:14674,PCDH15,protocadherin related 15,NM_033056.4,NP_149045.3,ENST00000320301.11,ENSP00000322604.6,MANE Plus Clinical,NC_000010.11,53821099,54801231,-,chr10 +9209,9209,GeneID:29119,ENSG00000183230.18,HGNC:2511,CTNNA3,catenin alpha 3,NM_013266.4,NP_037398.2,ENST00000433211.7,ENSP00000389714.1,MANE Select,NC_000010.11,65912523,67696195,-,chr10 +89,89,GeneID:132,ENSG00000156110.15,HGNC:257,ADK,adenosine kinase,NM_006721.4,NP_006712.2,ENST00000539909.6,ENSP00000443965.2,MANE Select,NC_000010.11,74151221,74709290,+,chr10 +13265,13265,GeneID:83938,ENSG00000148655.16,HGNC:23405,LRMDA,leucine rich melanocyte differentiation associated,NM_001305581.2,NP_001292510.1,ENST00000611255.5,ENSP00000480240.1,MANE Select,NC_000010.11,75431624,76560168,+,chr10 +2556,2556,GeneID:3778,ENSG00000156113.25,HGNC:6284,KCNMA1,potassium calcium-activated channel subfamily M alpha 1,NM_001161352.2,NP_001154824.1,ENST00000286628.14,ENSP00000286628.8,MANE Select,NC_000010.11,76884882,77637808,-,chr10 +7315,7315,GeneID:10718,ENSG00000185737.13,HGNC:7999,NRG3,neuregulin 3,NM_001010848.4,NP_001010848.2,ENST00000372141.7,ENSP00000361214.2,MANE Select,NC_000010.11,81875194,82987178,+,chr10 
+1955,1955,GeneID:2894,ENSG00000182771.19,HGNC:4575,GRID1,glutamate ionotropic receptor delta type subunit 1,NM_017551.3,NP_060021.1,ENST00000327946.12,ENSP00000330148.7,MANE Select,NC_000010.11,85599552,86366795,-,chr10 +11810,11810,GeneID:60495,ENSG00000172987.14,HGNC:18374,HPSE2,heparanase 2 (inactive),NM_021828.5,NP_068600.4,ENST00000370552.8,ENSP00000359583.3,MANE Select,NC_000010.11,98457077,99235852,-,chr10 +7919,7919,GeneID:22986,ENSG00000156395.14,HGNC:16699,SORCS3,sortilin related VPS10 domain containing receptor 3,NM_014978.3,NP_055793.1,ENST00000369701.8,ENSP00000358715.3,MANE Select,NC_000010.11,104641290,105265242,+,chr10 +14404,14404,GeneID:114815,ENSG00000108018.17,HGNC:16697,SORCS1,sortilin related VPS10 domain containing receptor 1,NM_052918.5,NP_443150.3,ENST00000263054.11,ENSP00000263054.5,MANE Select,NC_000010.11,106573663,107164706,-,chr10 +8676,8676,GeneID:26033,ENSG00000107518.18,HGNC:29063,ATRNL1,attractin like 1,NM_207303.4,NP_997186.1,ENST00000355044.8,ENSP00000347152.3,MANE Select,NC_000010.11,115093365,115948999,+,chr10 +1251,1251,GeneID:1793,ENSG00000150760.13,HGNC:2987,DOCK1,dedicator of cytokinesis 1,NM_001290223.2,NP_001277152.2,ENST00000623213.2,ENSP00000485033.1,MANE Select,NC_000010.11,126905428,127452516,+,chr10 +3152,3152,GeneID:4745,ENSG00000165973.19,HGNC:7750,NELL1,neural EGFL like 1,NM_006157.5,NP_006148.2,ENST00000357134.10,ENSP00000349654.5,MANE Select,NC_000011.10,20669586,21575686,+,chr11 +17268,17268,GeneID:338645,ENSG00000187398.12,HGNC:23206,LUZP2,leucine zipper protein 2,NM_001009909.4,NP_001009909.2,ENST00000336930.11,ENSP00000336817.6,MANE Select,NC_000011.10,24497053,25082638,+,chr11 +11635,11635,GeneID:57689,ENSG00000148948.8,HGNC:29317,LRRC4C,leucine rich repeat containing 4C,NM_001258419.2,NP_001245348.1,ENST00000528697.6,ENSP00000437132.1,MANE Select,NC_000011.10,40114209,41459652,-,chr11 +7899,7899,GeneID:22941,ENSG00000162105.21,HGNC:14295,SHANK2,SH3 and multiple ankyrin repeat domains 
2,NM_012309.5,NP_036441.2,ENST00000601538.6,ENSP00000469689.2,MANE Select,NC_000011.10,70467854,71252577,-,chr11 +8661,8661,GeneID:26011,ENSG00000149256.16,HGNC:29945,TENM4,teneurin transmembrane protein 4,NM_001098816.3,NP_001092286.2,ENST00000278550.12,ENSP00000278550.7,MANE Select,NC_000011.10,78652829,79441030,-,chr11 +1213,1213,GeneID:1740,ENSG00000150672.19,HGNC:2901,DLG2,discs large MAGUK scaffold protein 2,NM_001142699.3,NP_001136171.1,ENST00000376104.7,ENSP00000365272.2,MANE Select,NC_000011.10,83455173,85627344,-,chr11 +1975,1975,GeneID:2915,ENSG00000168959.15,HGNC:4597,GRM5,glutamate metabotropic receptor 5,NM_001143831.3,NP_001137303.1,ENST00000305447.5,ENSP00000306138.4,MANE Select,NC_000011.10,88504642,89065982,-,chr11 +14645,14645,GeneID:120114,ENSG00000165323.16,HGNC:23112,FAT3,FAT atypical cadherin 3,NM_001367949.2,NP_001354878.1,ENST00000525166.6,ENSP00000432586.2,MANE Select,NC_000011.10,92224818,92896473,+,chr11 +10000,10000,GeneID:53942,ENSG00000149972.12,HGNC:2175,CNTN5,contactin 5,NM_014361.4,NP_055176.1,ENST00000524871.6,ENSP00000435637.1,MANE Select,NC_000011.10,99020949,100358885,+,chr11 +13556,13556,GeneID:84623,ENSG00000149571.12,HGNC:23204,KIRREL3,kirre like nephrin family adhesion molecule 3,NM_032531.4,NP_115920.1,ENST00000525144.7,ENSP00000435466.2,MANE Select,NC_000011.10,126423358,127000770,-,chr11 +9432,9432,GeneID:50863,ENSG00000182667.15,HGNC:17941,NTM,neurotrimin,NM_001352005.2,NP_001338934.1,ENST00000683400.1,ENSP00000507313.1,MANE Select,NC_000011.10,131370615,132336822,+,chr11 +3302,3302,GeneID:4978,ENSG00000183715.15,HGNC:8143,OPCML,opioid binding protein/cell adhesion molecule like,NM_001012393.5,NP_001012393.1,ENST00000524381.6,ENSP00000434750.1,MANE Select,NC_000011.10,132414981,133532501,-,chr11 +540,540,GeneID:775,ENSG00000151067.23,HGNC:1390,CACNA1C,calcium voltage-gated channel subunit alpha1 C,NM_001167623.2,NP_001161095.1,ENST00000399603.6,ENSP00000382512.1,MANE Plus Clinical,NC_000012.12,2052987,2697950,+,chr12 
+541,541,GeneID:775,ENSG00000151067.23,HGNC:1390,CACNA1C,calcium voltage-gated channel subunit alpha1 C,NM_000719.7,NP_000710.5,ENST00000399655.6,ENSP00000382563.1,MANE Select,NC_000012.12,2052987,2697950,+,chr12 +13373,13373,GeneID:84216,ENSG00000139173.10,HGNC:25308,TMEM117,transmembrane protein 117,NM_032256.3,NP_115632.1,ENST00000266534.8,ENSP00000266534.3,MANE Select,NC_000012.12,43836081,44389758,+,chr12 +4697,4697,GeneID:6857,ENSG00000067715.15,HGNC:11509,SYT1,synaptotagmin 1,NM_005639.3,NP_005630.1,ENST00000261205.9,ENSP00000261205.4,MANE Select,NC_000012.12,78864774,79452008,+,chr12 +11203,11203,GeneID:56899,ENSG00000185046.21,HGNC:24600,ANKS1B,ankyrin repeat and sterile alpha motif domain containing 1B,NM_001352186.2,NP_001339115.1,ENST00000683438.2,ENSP00000508105.1,MANE Select,NC_000012.12,98743974,99984936,-,chr12 +14674,14674,GeneID:121256,ENSG00000151952.16,HGNC:29411,TMEM132D,transmembrane protein 132D,NM_133448.3,NP_597705.2,ENST00000422113.7,ENSP00000408581.2,MANE Select,NC_000012.12,129071726,129904025,-,chr12 +9935,9935,GeneID:51761,ENSG00000132932.19,HGNC:13533,ATP8A2,ATPase phospholipid transporting 8A2,NM_016529.6,NP_057613.4,ENST00000381655.7,ENSP00000371070.2,MANE Select,NC_000013.11,25371974,26025851,+,chr13 +8157,8157,GeneID:23281,ENSG00000132938.22,HGNC:20595,MTUS2,microtubule associated scaffold protein 2,NM_001033602.4,NP_001028774.3,ENST00000612955.6,ENSP00000483729.2,MANE Select,NC_000013.11,28820339,29505947,+,chr13 +8908,8908,GeneID:26960,ENSG00000172915.20,HGNC:7648,NBEA,neurobeachin,NM_001385012.1,NP_001371941.1,ENST00000379939.7,ENSP00000369271.2,MANE Select,NC_000013.11,34942270,35672736,+,chr13 +10447,10447,GeneID:55068,ENSG00000120658.14,HGNC:25474,ENOX1,ecto-NOX disulfide-thiol exchanger 1,NM_001347969.2,NP_001334898.1,ENST00000690772.1,ENSP00000509229.1,MANE Select,NC_000013.11,43213130,43786972,-,chr13 +3393,3393,GeneID:5101,ENSG00000184226.15,HGNC:8661,PCDH9,protocadherin 
9,NM_203487.3,NP_982354.1,ENST00000377865.7,ENSP00000367096.2,MANE Select,NC_000013.11,66302834,67230336,-,chr13 +1587,1587,GeneID:2262,ENSG00000179399.16,HGNC:4453,GPC5,glypican 5,NM_004466.6,NP_004457.1,ENST00000377067.9,ENSP00000366267.3,MANE Select,NC_000013.11,91398621,92867237,+,chr13 +6808,6808,GeneID:10082,ENSG00000183098.12,HGNC:4454,GPC6,glypican 6,NM_005708.5,NP_005699.1,ENST00000377047.9,ENSP00000366246.3,MANE Select,NC_000013.11,93226807,94408020,+,chr13 +16853,16853,GeneID:266722,ENSG00000185352.10,HGNC:19134,HS6ST3,heparan sulfate 6-O-sulfotransferase 3,NM_153456.4,NP_703157.2,ENST00000376705.4,ENSP00000365895.2,MANE Select,NC_000013.11,96090107,96839562,+,chr13 +18768,18768,GeneID:728215,ENSG00000204442.4,HGNC:33877,NALF1,NALCN channel auxiliary factor 1,NM_001080396.3,NP_001073865.1,ENST00000375915.4,ENSP00000365080.1,MANE Select,NC_000013.11,107163510,107867496,-,chr13 +7949,7949,GeneID:23026,ENSG00000041515.16,HGNC:29822,MYO16,myosin XVI,NM_001198950.3,NP_001185879.1,ENST00000457511.7,ENSP00000401633.3,MANE Select,NC_000013.11,108629611,109208005,+,chr13 +11902,11902,GeneID:64067,ENSG00000151322.20,HGNC:19311,NPAS3,neuronal PAS domain protein 3,NM_001164749.2,NP_001158221.1,ENST00000356141.9,ENSP00000348460.4,MANE Select,NC_000014.9,32938879,33804173,+,chr14 +15945,15945,GeneID:161357,ENSG00000139915.22,HGNC:19835,MDGA2,MAM domain containing glycosylphosphatidylinositol anchor 2,NM_001113498.3,NP_001106970.4,ENST00000399232.8,ENSP00000382178.4,MANE Select,NC_000014.9,46840092,47675605,-,chr14 +6946,6946,GeneID:10243,ENSG00000171723.16,HGNC:15465,GPHN,gephyrin,NM_020806.5,NP_065857.1,ENST00000478722.6,ENSP00000417901.1,MANE Select,NC_000014.9,66508147,67181803,+,chr14 +3981,3981,GeneID:5890,ENSG00000182185.19,HGNC:9822,RAD51B,RAD51 paralog B,NM_133510.4,NP_598194.1,ENST00000471583.6,ENSP00000418859.1,MANE Select,NC_000014.9,67819779,68478093,+,chr14 +6421,6421,GeneID:9628,ENSG00000182732.18,HGNC:10002,RGS6,regulator of G protein signaling 
6,NM_001204424.2,NP_001191353.1,ENST00000553525.6,ENSP00000451030.1,MANE Select,NC_000014.9,71932429,72566530,+,chr14 +6220,6220,GeneID:9369,ENSG00000021645.20,HGNC:8010,NRXN3,neurexin 3,NM_001330195.2,NP_001317124.1,ENST00000335750.7,ENSP00000338349.7,MANE Select,NC_000014.9,78170373,79868291,+,chr14 +1713,1713,GeneID:2567,ENSG00000182256.13,HGNC:4088,GABRG3,gamma-aminobutyric acid type A receptor subunit gamma3,NM_033223.5,NP_150092.2,ENST00000615808.5,ENSP00000479113.1,MANE Select,NC_000015.10,26971181,27541984,+,chr15 +4222,4222,GeneID:6263,ENSG00000198838.15,HGNC:10485,RYR3,ryanodine receptor 3,NM_001036.6,NP_001027.3,ENST00000634891.2,ENSP00000489262.1,MANE Select,NC_000015.10,33310967,33866102,+,chr15 +18310,18310,GeneID:440279,ENSG00000137766.18,HGNC:23149,UNC13C,unc-13 homolog C,NM_001080534.3,NP_001074003.1,ENST00000260323.16,ENSP00000260323.11,MANE Select,NC_000015.10,53978441,54628605,+,chr15 +4107,4107,GeneID:6095,ENSG00000069667.16,HGNC:10258,RORA,RAR related orphan receptor A,NM_134261.3,NP_599023.1,ENST00000335670.11,ENSP00000335087.6,MANE Select,NC_000015.10,60488284,61229302,-,chr15 +12646,12646,GeneID:79875,ENSG00000187720.15,HGNC:25835,THSD4,thrombospondin type 1 domain containing 4,NM_024817.3,NP_079093.2,ENST00000261862.8,ENSP00000261862.8,MANE Select,NC_000015.10,71115471,71783383,+,chr15 +9367,9367,GeneID:49855,ENSG00000140386.13,HGNC:13081,SCAPER,S-phase cyclin A associated protein in the ER,NM_020843.4,NP_065894.2,ENST00000563290.6,ENSP00000454973.1,MANE Select,NC_000015.10,76347904,76905340,-,chr15 +14748,14748,GeneID:123624,ENSG00000273540.5,HGNC:26504,AGBL1,AGBL carboxypeptidase 1,NM_001386094.1,NP_001373023.1,ENST00000614907.3,ENSP00000490608.2,MANE Select,NC_000015.10,86079871,86916156,+,chr15 +10183,10183,GeneID:54715,ENSG00000078328.23,HGNC:18222,RBFOX1,RNA binding fox-1 homolog 1,NM_018723.4,NP_061193.2,ENST00000550418.6,ENSP00000450031.1,MANE Select,NC_000016.10,6019024,7713340,+,chr16 
+14117,14117,GeneID:92017,ENSG00000048471.14,HGNC:30542,SNX29,sorting nexin 29,NM_032167.5,NP_115543.3,ENST00000566228.6,ENSP00000456480.1,MANE Select,NC_000016.10,11976734,12574287,+,chr16 +9924,9924,GeneID:51741,ENSG00000186153.19,HGNC:12799,WWOX,WW domain containing oxidoreductase,NM_016373.4,NP_057457.1,ENST00000566780.6,ENSP00000457230.1,MANE Select,NC_000016.10,78099654,79212667,+,chr16 +728,728,GeneID:1012,ENSG00000140945.17,HGNC:1753,CDH13,cadherin 13,NM_001257.5,NP_001248.1,ENST00000567109.6,ENSP00000479395.1,MANE Select,NC_000016.10,82626969,83800640,+,chr16 +11231,11231,GeneID:56934,ENSG00000154975.14,HGNC:1369,CA10,carbonic anhydrase 10,NM_020178.5,NP_064563.1,ENST00000451037.7,ENSP00000405388.2,MANE Select,NC_000017.11,51630320,52158714,-,chr17 +10248,10248,GeneID:54828,ENSG00000141376.24,HGNC:14347,BCAS3,BCAS3 microtubule associated cell migration factor,NM_017679.5,NP_060149.3,ENST00000407086.8,ENSP00000385323.2,MANE Select,NC_000017.11,60677851,61392831,+,chr17 +16293,16293,GeneID:201134,ENSG00000154240.18,HGNC:28514,CEP112,centrosomal protein 112,NM_001199165.4,NP_001186094.1,ENST00000535342.7,ENSP00000442784.2,MANE Select,NC_000017.11,65635537,66192133,-,chr17 +6124,6124,GeneID:9229,ENSG00000170579.19,HGNC:2905,DLGAP1,DLG associated protein 1,NM_004746.4,NP_004737.2,ENST00000315677.8,ENSP00000316377.3,MANE Select,NC_000018.10,3496032,4455307,-,chr18 +3920,3920,GeneID:5797,ENSG00000173482.17,HGNC:9675,PTPRM,protein tyrosine phosphatase receptor type M,NM_001105244.2,NP_001098714.1,ENST00000580170.6,ENSP00000463325.1,MANE Select,NC_000018.10,7567316,8406856,+,chr18 +1152,1152,GeneID:1630,ENSG00000187323.13,HGNC:2701,DCC,DCC netrin 1 receptor,NM_005215.4,NP_005206.2,ENST00000442544.7,ENSP00000389140.2,MANE Select,NC_000018.10,52340197,53535899,+,chr18 +7962,7962,GeneID:23040,ENSG00000186487.21,HGNC:7623,MYT1L,myelin transcription factor 1 like,NM_001303052.2,NP_001289981.1,ENST00000647738.2,ENSP00000497479.2,MANE 
Select,NC_000002.12,1789113,2331275,-,chr2 +163,163,GeneID:238,ENSG00000171094.18,HGNC:427,ALK,ALK receptor tyrosine kinase,NM_004304.5,NP_004295.2,ENST00000389048.8,ENSP00000373700.3,MANE Select,NC_000002.12,29192774,29921586,-,chr2 +3750,3750,GeneID:5581,ENSG00000171132.14,HGNC:9401,PRKCE,protein kinase C epsilon,NM_005400.3,NP_005391.1,ENST00000306156.8,ENSP00000306124.3,MANE Select,NC_000002.12,45651675,46187990,+,chr2 +6229,6229,GeneID:9378,ENSG00000179915.25,HGNC:8008,NRXN1,neurexin 1,NM_001330078.2,NP_001317007.1,ENST00000401669.7,ENSP00000385017.2,MANE Select,NC_000002.12,49918503,51032132,-,chr2 +8117,8117,GeneID:23233,ENSG00000144036.16,HGNC:17085,EXOC6B,exocyst complex component 6B,NM_015189.3,NP_056004.1,ENST00000272427.11,ENSP00000272427.7,MANE Select,NC_000002.12,72175984,72826033,-,chr2 +12756,12756,GeneID:80059,ENSG00000176204.14,HGNC:19411,LRRTM4,leucine rich repeat transmembrane neuronal 4,NM_001134745.3,NP_001128217.1,ENST00000409884.6,ENSP00000387297.1,MANE Select,NC_000002.12,76747685,77522376,-,chr2 +1062,1062,GeneID:1496,ENSG00000066032.19,HGNC:2510,CTNNA2,catenin alpha 2,NM_001282597.3,NP_001269526.1,ENST00000402739.9,ENSP00000384638.4,MANE Select,NC_000002.12,79513051,80648780,+,chr2 +2648,2648,GeneID:3899,ENSG00000144218.21,HGNC:6473,AFF3,ALF transcription elongation factor 3,NM_001386135.1,NP_001373064.1,ENST00000672756.2,ENSP00000500419.1,MANE Select,NC_000002.12,99545419,100142590,-,chr2 +11595,11595,GeneID:57628,ENSG00000175497.17,HGNC:20823,DPP10,dipeptidyl peptidase like 10,NM_020868.6,NP_065919.3,ENST00000410059.6,ENSP00000386565.1,MANE Select,NC_000002.12,114442641,115845780,+,chr2 +15024,15024,GeneID:129684,ENSG00000155052.15,HGNC:18748,CNTNAP5,contactin associated protein family member 5,NM_001367498.1,NP_001354427.1,ENST00000682447.1,ENSP00000508115.1,MANE Select,NC_000002.12,124025287,124921219,+,chr2 +17475,17475,GeneID:344148,ENSG00000176771.18,HGNC:29847,NCKAP5,NCK associated protein 
5,NM_207363.3,NP_997246.2,ENST00000409261.6,ENSP00000387128.1,MANE Select,NC_000002.12,132671788,133568463,-,chr2 +12907,12907,GeneID:80731,ENSG00000144229.13,HGNC:29348,THSD7B,thrombospondin type 1 domain containing 7B,NM_001316349.2,NP_001303278.1,ENST00000409968.6,ENSP00000387145.1,MANE Select,NC_000002.12,136765545,137677718,+,chr2 +9964,9964,GeneID:53353,ENSG00000168702.18,HGNC:6693,LRP1B,LDL receptor related protein 1B,NM_018557.3,NP_061027.2,ENST00000389484.8,ENSP00000374135.3,MANE Select,NC_000002.12,140231423,142131016,-,chr2 +10964,10964,GeneID:55843,ENSG00000075884.14,HGNC:21030,ARHGAP15,Rho GTPase activating protein 15,NM_018460.4,NP_060930.3,ENST00000295095.11,ENSP00000295095.6,MANE Select,NC_000002.12,143129419,143768352,+,chr2 +14403,14403,GeneID:114805,ENSG00000144278.15,HGNC:23242,GALNT13,polypeptide N-acetylgalactosaminyltransferase 13,NM_052917.4,NP_443149.2,ENST00000392825.8,ENSP00000376570.3,MANE Select,NC_000002.12,153871922,154453979,+,chr2 +5750,5750,GeneID:8708,ENSG00000172318.6,HGNC:916,B3GALT1,"beta-1,3-galactosyltransferase 1",NM_020981.4,NP_066191.1,ENST00000392690.4,ENSP00000376456.2,MANE Select,NC_000002.12,167293001,167874045,+,chr2 +14586,14586,GeneID:117583,ENSG00000116117.20,HGNC:14446,PARD3B,par-3 family cell polarity regulator beta,NM_001302769.2,NP_001289698.1,ENST00000406610.7,ENSP00000385848.2,MANE Select,NC_000002.12,204545475,205620162,+,chr2 +1447,1447,GeneID:2066,ENSG00000178568.16,HGNC:3432,ERBB4,erb-b2 receptor tyrosine kinase 4,NM_005235.3,NP_005226.1,ENST00000342788.9,ENSP00000342235.4,MANE Select,NC_000002.12,211375717,212538802,-,chr2 +12417,12417,GeneID:79582,ENSG00000144451.20,HGNC:23225,SPAG16,sperm associated antigen 16,NM_024532.5,NP_078808.3,ENST00000331683.10,ENSP00000332592.5,MANE Select,NC_000002.12,213284464,214410501,+,chr2 +14558,14558,GeneID:116987,ENSG00000157985.20,HGNC:16922,AGAP1,"ArfGAP with GTPase domain, ankyrin repeat and PH domain 
1",NM_001037131.3,NP_001032208.1,ENST00000304032.13,ENSP00000307634.7,MANE Select,NC_000002.12,235494043,236131793,+,chr2 +8120,8120,GeneID:23236,ENSG00000182621.19,HGNC:15917,PLCB1,phospholipase C beta 1,NM_015192.4,NP_056007.1,ENST00000338037.11,ENSP00000338185.6,MANE Select,NC_000020.11,8132266,8884900,+,chr20 +15336,15336,GeneID:140733,ENSG00000172264.19,HGNC:16126,MACROD2,mono-ADP ribosylhydrolase 2,NM_001351661.2,NP_001338590.1,ENST00000684519.1,ENSP00000507484.1,MANE Select,NC_000020.11,13995516,16053197,+,chr20 +7620,7620,GeneID:11122,ENSG00000196090.14,HGNC:9682,PTPRT,protein tyrosine phosphatase receptor type T,NM_007050.6,NP_008981.4,ENST00000373187.6,ENSP00000362283.1,MANE Select,NC_000020.11,42072756,43189906,-,chr20 +14987,14987,GeneID:128553,ENSG00000182463.17,HGNC:13010,TSHZ2,teashirt zinc finger homeobox 2,NM_173485.6,NP_775756.3,ENST00000371497.10,ENSP00000360552.3,MANE Select,NC_000020.11,52972358,53495330,+,chr20 +719,719,GeneID:1002,ENSG00000179242.16,HGNC:1763,CDH4,cadherin 4,NM_001794.5,NP_001785.2,ENST00000614565.5,ENSP00000484928.1,MANE Select,NC_000020.11,61252261,61940617,+,chr20 +3101,3101,GeneID:4685,ENSG00000154654.15,HGNC:7657,NCAM2,neural cell adhesion molecule 2,NM_004540.5,NP_004531.2,ENST00000400546.6,ENSP00000383392.1,MANE Select,NC_000021.9,20998409,21543329,+,chr21 +1281,1281,GeneID:1826,ENSG00000171587.15,HGNC:3039,DSCAM,DS cell adhesion molecule,NM_001389.5,NP_001380.2,ENST00000400454.6,ENSP00000383303.1,MANE Select,NC_000021.9,40010999,40847158,-,chr21 +8194,8194,GeneID:23331,ENSG00000100154.15,HGNC:29179,TTC28,tetratricopeptide repeat domain 28,NM_001145418.2,NP_001138890.1,ENST00000397906.7,ENSP00000381003.2,MANE Select,NC_000022.11,27978014,28679840,-,chr22 +5416,5416,GeneID:8224,ENSG00000185666.16,HGNC:11496,SYN3,synapsin III,NM_003490.4,NP_003481.3,ENST00000358763.7,ENSP00000351614.2,MANE Select,NC_000022.11,32507820,33058381,-,chr22 +6116,6116,GeneID:9215,ENSG00000133424.22,HGNC:6511,LARGE1,LARGE xylosyl- and 
glucuronyltransferase 1,NM_133642.5,NP_598397.1,ENST00000397394.8,ENSP00000380549.2,MANE Select,NC_000022.11,33272509,33920476,-,chr22 +15759,15759,GeneID:152330,ENSG00000144619.15,HGNC:2174,CNTN4,contactin 4,NM_175607.3,NP_783200.1,ENST00000418658.6,ENSP00000396010.1,MANE Select,NC_000003.12,2098866,3057959,+,chr3 +1977,1977,GeneID:2917,ENSG00000196277.17,HGNC:4599,GRM7,glutamate metabotropic receptor 7,NM_000844.4,NP_000835.1,ENST00000357716.9,ENSP00000350348.4,MANE Select,NC_000003.12,6861115,7741533,+,chr3 +6555,6555,GeneID:9779,ENSG00000131374.15,HGNC:19166,TBC1D5,TBC1 domain family member 5,NM_001349074.2,NP_001336003.1,ENST00000696125.1,ENSP00000512418.1,MANE Select,NC_000003.12,17157168,17742631,-,chr3 +9071,9071,GeneID:27303,ENSG00000144642.22,HGNC:13427,RBMS3,RNA binding motif single stranded interacting protein 3,NM_001003793.3,NP_001003793.1,ENST00000383767.7,ENSP00000373277.2,MANE Select,NC_000003.12,29281071,30010386,+,chr3 +10385,10385,GeneID:54986,ENSG00000168038.11,HGNC:15784,ULK4,unc-51 like kinase 4,NM_017886.4,NP_060356.2,ENST00000301831.9,ENSP00000301831.4,MANE Select,NC_000003.12,41246599,41962103,-,chr3 +1253,1253,GeneID:1795,ENSG00000088538.13,HGNC:2989,DOCK3,dedicator of cytokinesis 3,NM_004947.5,NP_004938.1,ENST00000266037.10,ENSP00000266037.8,MANE Select,NC_000003.12,50674927,51384198,+,chr3 +10932,10932,GeneID:55799,ENSG00000157445.16,HGNC:15460,CACNA2D3,calcium voltage-gated channel auxiliary subunit alpha2delta 3,NM_018398.3,NP_060868.2,ENST00000474759.6,ENSP00000419101.1,MANE Select,NC_000003.12,54122552,55074557,+,chr3 +8698,8698,GeneID:26059,ENSG00000187672.15,HGNC:31922,ERC2,ELKS/RAB6-interacting/CAST family member 2,NM_015576.3,NP_056391.1,ENST00000288221.11,ENSP00000288221.6,MANE Select,NC_000003.12,55508311,56468467,-,chr3 +1595,1595,GeneID:2272,ENSG00000189283.11,HGNC:3701,FHIT,fragile histidine triad diadenosine triphosphatase,NM_002012.4,NP_002003.1,ENST00000492590.6,ENSP00000418582.1,MANE 
Select,NC_000003.12,59747277,61251452,-,chr3 +3916,3916,GeneID:5793,ENSG00000144724.20,HGNC:9671,PTPRG,protein tyrosine phosphatase receptor type G,NM_002841.4,NP_002832.3,ENST00000474889.6,ENSP00000418112.1,MANE Select,NC_000003.12,61561571,62297609,+,chr3 +6121,6121,GeneID:9223,ENSG00000151276.24,HGNC:946,MAGI1,"membrane associated guanylate kinase, WW and PDZ domain containing 1",NM_001033057.2,NP_001028229.1,ENST00000402939.7,ENSP00000385450.2,MANE Select,NC_000003.12,65353526,66038918,-,chr3 +18262,18262,GeneID:407738,ENSG00000183662.12,HGNC:21587,TAFA1,TAFA chemokine like family member 1,NM_213609.4,NP_998774.2,ENST00000478136.6,ENSP00000418575.1,MANE Select,NC_000003.12,68004247,68545621,+,chr3 +8957,8957,GeneID:27086,ENSG00000114861.24,HGNC:3823,FOXP1,forkhead box P1,NM_001349338.3,NP_001336267.1,ENST00000649528.3,ENSP00000497369.1,MANE Select,NC_000003.12,70954708,71583728,-,chr3 +4104,4104,GeneID:6092,ENSG00000185008.19,HGNC:10250,ROBO2,roundabout guidance receptor 2,NM_001395656.1,NP_001382585.1,ENST00000696593.1,ENSP00000512738.1,MANE Select,NC_000003.12,77040099,77649964,+,chr3 +4103,4103,GeneID:6091,ENSG00000169855.21,HGNC:10249,ROBO1,roundabout guidance receptor 1,NM_002941.4,NP_002932.1,ENST00000464233.6,ENSP00000420321.1,MANE Select,NC_000003.12,78597239,79767998,-,chr3 +16659,16659,GeneID:253559,ENSG00000175161.14,HGNC:29849,CADM2,cell adhesion molecule 2,NM_001167675.2,NP_001161147.1,ENST00000383699.8,ENSP00000373200.3,MANE Select,NC_000003.12,84958989,86074429,+,chr3 +17100,17100,GeneID:285220,ENSG00000080224.18,HGNC:19296,EPHA6,EPH receptor A6,NM_001080448.3,NP_001073917.2,ENST00000389672.10,ENSP00000374323.5,MANE Select,NC_000003.12,96814594,97761532,+,chr3 +8736,8736,GeneID:26137,ENSG00000181722.18,HGNC:13503,ZBTB20,zinc finger and BTB domain containing 20,NM_001348800.3,NP_001335729.1,ENST00000675478.1,ENSP00000501561.1,MANE Select,NC_000003.12,114314500,115147288,-,chr3 +2743,2743,GeneID:4045,ENSG00000185565.13,HGNC:6705,LSAMP,limbic system 
associated membrane protein,NM_002338.5,NP_002329.2,ENST00000490035.7,ENSP00000419000.1,MANE Select,NC_000003.12,115802374,116445487,-,chr3 +5957,5957,GeneID:8997,ENSG00000160145.16,HGNC:4814,KALRN,kalirin RhoGEF kinase,NM_001388419.1,NP_001375348.1,ENST00000682506.1,ENSP00000508359.1,MANE Select,NC_000003.12,124033369,124726325,+,chr3 +11910,11910,GeneID:64084,ENSG00000158258.16,HGNC:17448,CLSTN2,calsyntenin 2,NM_022131.3,NP_071414.2,ENST00000458420.7,ENSP00000402460.2,MANE Select,NC_000003.12,139935185,140577397,+,chr3 +17098,17098,GeneID:285195,ENSG00000181804.15,HGNC:20653,SLC9A9,solute carrier family 9 member A9,NM_173653.4,NP_775924.1,ENST00000316549.11,ENSP00000320246.6,MANE Select,NC_000003.12,143265222,143848468,-,chr3 +9309,9309,GeneID:29970,ENSG00000151967.19,HGNC:15678,SCHIP1,schwannomin interacting protein 1,NM_014575.4,NP_055390.1,ENST00000638749.2,ENSP00000491030.1,MANE Select,NC_000003.12,159273244,159897359,+,chr3 +1482,1482,GeneID:2122,ENSG00000085276.19,HGNC:3498,MECOM,MDS1 and EVI1 complex locus,NM_004991.4,NP_004982.2,ENST00000651503.2,ENSP00000498411.1,MANE Select,NC_000003.12,169083507,169663712,-,chr3 +7839,7839,GeneID:22871,ENSG00000169760.18,HGNC:14291,NLGN1,neuroligin 1,NM_001365925.2,NP_001352854.1,ENST00000695368.1,ENSP00000511841.1,MANE Select,NC_000003.12,173397744,174294372,+,chr3 +16707,16707,GeneID:254827,ENSG00000177694.16,HGNC:23219,NAALADL2,N-acetylated alpha-linked acidic dipeptidase like 2,NM_207015.3,NP_996898.2,ENST00000454872.6,ENSP00000404705.1,MANE Select,NC_000003.12,174859334,175810548,+,chr3 +2733,2733,GeneID:4026,ENSG00000145012.15,HGNC:6679,LPP,LIM domain containing preferred translocation partner in lipoma,NM_001375462.1,NP_001362391.1,ENST00000617246.5,ENSP00000478901.1,MANE Select,NC_000003.12,188154157,188890671,+,chr3 +1582,1582,GeneID:2257,ENSG00000114279.15,HGNC:3668,FGF12,fibroblast growth factor 12,NM_004113.6,NP_004104.3,ENST00000445105.7,ENSP00000393686.1,MANE Select,NC_000003.12,192139390,192727541,-,chr3 
+11519,11519,GeneID:57537,ENSG00000184985.17,HGNC:16698,SORCS2,sortilin related VPS10 domain containing receptor 2,NM_020777.3,NP_065828.2,ENST00000507866.6,ENSP00000422185.2,MANE Select,NC_000004.12,7192538,7742827,+,chr4 +12879,12879,GeneID:80333,ENSG00000185774.17,HGNC:30083,KCNIP4,potassium voltage-gated channel interacting protein 4,NM_025221.6,NP_079497.2,ENST00000382152.7,ENSP00000371587.2,MANE Select,NC_000004.12,20728606,21948772,-,chr4 +8159,8159,GeneID:23284,ENSG00000150471.17,HGNC:20974,ADGRL3,adhesion G protein-coupled receptor L3,NM_001387552.1,NP_001374481.1,ENST00000683033.1,ENSP00000507980.1,MANE Select,NC_000004.12,61200326,62078335,+,chr4 +16722,16722,GeneID:255119,ENSG00000197826.13,HGNC:28554,CFAP299,cilia and flagella associated protein 299,NM_152770.3,NP_689983.2,ENST00000358105.8,ENSP00000350818.3,MANE Select,NC_000004.12,80335730,80963750,+,chr4 +13132,13132,GeneID:83478,ENSG00000138639.18,HGNC:25361,ARHGAP24,Rho GTPase activating protein 24,NM_001025616.3,NP_001020787.2,ENST00000395184.6,ENSP00000378611.1,MANE Select,NC_000004.12,85475150,86002666,+,chr4 +18184,18184,GeneID:401145,ENSG00000184305.16,HGNC:29349,CCSER1,coiled-coil serine rich protein 1,NM_001145065.2,NP_001138537.1,ENST00000509176.6,ENSP00000425040.1,MANE Select,NC_000004.12,90127394,91605295,+,chr4 +1956,1956,GeneID:2895,ENSG00000152208.13,HGNC:4576,GRID2,glutamate ionotropic receptor delta type subunit 2,NM_001510.4,NP_001501.2,ENST00000282020.9,ENSP00000282020.4,MANE Select,NC_000004.12,92303966,93774566,+,chr4 +17131,17131,GeneID:285555,ENSG00000163116.10,HGNC:28712,STPG2,sperm tail PG-rich repeat containing 2,NM_174952.3,NP_777612.1,ENST00000295268.4,ENSP00000295268.3,MANE Select,NC_000004.12,97558872,98143476,-,chr4 +5837,5837,GeneID:8821,ENSG00000109452.13,HGNC:6075,INPP4B,inositol polyphosphate-4-phosphatase type II B,NM_001101669.3,NP_001095139.1,ENST00000262992.9,ENSP00000262992.4,MANE Select,NC_000004.12,142023160,142846301,-,chr4 
+705,705,GeneID:987,ENSG00000198589.15,HGNC:1742,LRBA,LPS responsive beige-like anchor protein,NM_001364905.1,NP_001351834.1,ENST00000651943.2,ENSP00000498582.2,MANE Select,NC_000004.12,150264435,151015284,-,chr4 +11190,11190,GeneID:56884,ENSG00000168843.14,HGNC:21386,FSTL5,follistatin like 5,NM_020116.5,NP_064501.2,ENST00000306100.10,ENSP00000305334.4,MANE Select,NC_000004.12,161383897,162164000,-,chr4 +10412,10412,GeneID:55016,ENSG00000145416.14,HGNC:26077,MARCHF1,membrane associated ring-CH-type finger 1,NM_001394959.1,NP_001381888.1,ENST00000514618.6,ENSP00000421322.1,MANE Select,NC_000004.12,163524718,164384019,-,chr4 +18404,18404,GeneID:442117,ENSG00000174473.17,HGNC:33844,GALNTL6,polypeptide N-acetylgalactosaminyltransferase like 6,NM_001034845.3,NP_001030017.2,ENST00000506823.6,ENSP00000423313.1,MANE Select,NC_000004.12,171813404,173041559,+,chr4 +10858,10858,GeneID:55714,ENSG00000218336.9,HGNC:29944,TENM3,teneurin transmembrane protein 3,NM_001080477.4,NP_001073946.1,ENST00000511685.6,ENSP00000424226.1,MANE Select,NC_000004.12,182243402,182803024,+,chr4 +1066,1066,GeneID:1501,ENSG00000169862.21,HGNC:2516,CTNND2,catenin delta 2,NM_001332.4,NP_001323.1,ENST00000304623.13,ENSP00000307134.8,MANE Select,NC_000005.10,10971836,11904446,-,chr5 +727,727,GeneID:1010,ENSG00000154162.15,HGNC:1751,CDH12,cadherin 12,NM_004061.5,NP_004052.2,ENST00000382254.6,ENSP00000371689.1,MANE Select,NC_000005.10,21750673,22853344,-,chr5 +3422,3422,GeneID:5144,ENSG00000113448.21,HGNC:8783,PDE4D,phosphodiesterase 4D,NM_001104631.2,NP_001098101.1,ENST00000340635.11,ENSP00000345502.6,MANE Select,NC_000005.10,58969038,59893726,-,chr5 +17704,17704,GeneID:375449,ENSG00000069020.19,HGNC:19037,MAST4,microtubule associated serine/threonine kinase family member 4,NM_001164664.2,NP_001158136.1,ENST00000403625.7,ENSP00000385727.1,MANE Select,NC_000005.10,66596393,67169593,+,chr5 +13291,13291,GeneID:84059,ENSG00000164199.18,HGNC:17416,ADGRV1,adhesion G protein-coupled receptor 
V1,NM_032119.4,NP_115495.3,ENST00000405460.9,ENSP00000384582.2,MANE Select,NC_000005.10,90558797,91164437,+,chr5 +12563,12563,GeneID:79772,ENSG00000175471.20,HGNC:26183,MCTP1,multiple C2 and transmembrane domain containing 1,NM_024717.7,NP_078993.4,ENST00000515393.6,ENSP00000424126.1,MANE Select,NC_000005.10,94703690,95285094,-,chr5 +12114,12114,GeneID:64839,ENSG00000145743.18,HGNC:13615,FBXL17,F-box and leucine rich repeat protein 17,NM_001163315.3,NP_001156787.2,ENST00000542267.7,ENSP00000437464.2,MANE Select,NC_000005.10,107859035,108382098,-,chr5 +4572,4572,GeneID:6695,ENSG00000152377.15,HGNC:11251,SPOCK1,"SPARC (osteonectin), cwcv and kazal like domains proteoglycan 1",NM_004598.4,NP_004589.1,ENST00000394945.6,ENSP00000378401.1,MANE Select,NC_000005.10,136975298,137499326,-,chr5 +11447,11447,GeneID:57451,ENSG00000145934.17,HGNC:29943,TENM2,teneurin transmembrane protein 2,NM_001395460.1,NP_001382389.1,ENST00000518659.6,ENSP00000429430.1,MANE Select,NC_000005.10,166979029,168264157,+,chr5 +4480,4480,GeneID:6586,ENSG00000184347.16,HGNC:11087,SLIT3,slit guidance ligand 3,NM_003062.4,NP_003053.2,ENST00000519560.6,ENSP00000430333.2,MANE Select,NC_000005.10,168661740,169301139,-,chr5 +1839,1839,GeneID:2762,ENSG00000112699.11,HGNC:4369,GMDS,"GDP-mannose 4,6-dehydratase",NM_001500.4,NP_001491.1,ENST00000380815.5,ENSP00000370194.4,MANE Select,NC_000006.12,1623806,2245605,-,chr6 +16547,16547,GeneID:221692,ENSG00000112137.19,HGNC:20990,PHACTR1,phosphatase and actin regulator 1,NM_030948.6,NP_112210.1,ENST00000332995.12,ENSP00000329880.8,MANE Select,NC_000006.12,12716767,13287837,+,chr6 +10312,10312,GeneID:54901,ENSG00000145996.11,HGNC:21050,CDKAL1,CDK5 regulatory subunit associated protein 1 like 1,NM_017774.3,NP_060244.2,ENST00000274695.8,ENSP00000274695.4,MANE Select,NC_000006.12,20534457,21232404,+,chr6 +5564,5564,GeneID:8464,ENSG00000196284.18,HGNC:11466,SUPT3H,"SPT3 homolog, SAGA and STAGA complex 
component",NM_003599.4,NP_003590.1,ENST00000371459.6,ENSP00000360514.1,MANE Select,NC_000006.12,44826726,45377934,-,chr6 +16340,16340,GeneID:202559,ENSG00000112232.10,HGNC:18114,KHDRBS2,"KH RNA binding domain containing, signal transduction associated 2",NM_152688.4,NP_689901.2,ENST00000281156.5,ENSP00000281156.3,MANE Select,NC_000006.12,61679961,62286225,-,chr6 +17510,17510,GeneID:346007,ENSG00000188107.15,HGNC:21555,EYS,eyes shut homolog,NM_001142800.2,NP_001136272.1,ENST00000503581.6,ENSP00000424243.1,MANE Select,NC_000006.12,63719980,65707226,-,chr6 +393,393,GeneID:577,ENSG00000135298.15,HGNC:945,ADGRB3,adhesion G protein-coupled receptor B3,NM_001704.3,NP_001695.2,ENST00000370598.6,ENSP00000359630.1,MANE Select,NC_000006.12,68635282,69389506,+,chr6 +11143,11143,GeneID:56479,ENSG00000185760.18,HGNC:6299,KCNQ5,potassium voltage-gated channel subfamily Q member 5,NM_019842.4,NP_062816.2,ENST00000370398.6,ENSP00000359425.1,MANE Select,NC_000006.12,72622064,73198853,+,chr6 +1959,1959,GeneID:2898,ENSG00000164418.22,HGNC:4580,GRIK2,glutamate ionotropic receptor kainate type subunit 2,NM_021956.5,NP_068775.1,ENST00000369134.9,ENSP00000358130.6,MANE Select,NC_000006.12,101393708,102070083,+,chr6 +15816,15816,GeneID:154215,ENSG00000188580.16,HGNC:16443,NKAIN2,sodium/potassium transporting ATPase interacting 2,NM_001040214.3,NP_001035304.1,ENST00000368417.6,ENSP00000357402.1,MANE Select,NC_000006.12,123803865,124825640,+,chr6 +3919,3919,GeneID:5796,ENSG00000152894.15,HGNC:9674,PTPRK,protein tyrosine phosphatase receptor type K,NM_002844.4,NP_002835.2,ENST00000368226.9,ENSP00000357209.4,MANE Select,NC_000006.12,127968785,128520599,-,chr6 +2653,2653,GeneID:3908,ENSG00000196569.14,HGNC:6482,LAMA2,laminin subunit alpha 2,NM_000426.4,NP_000417.3,ENST00000421865.3,ENSP00000400365.2,MANE Select,NC_000006.12,128883138,129516566,+,chr6 +5072,5072,GeneID:7402,ENSG00000152818.20,HGNC:12635,UTRN,utrophin,NM_007124.3,NP_009055.2,ENST00000367545.8,ENSP00000356515.3,MANE 
Select,NC_000006.12,144285335,144853034,+,chr6 +3368,3368,GeneID:5071,ENSG00000185345.25,HGNC:8607,PRKN,parkin RBR E3 ubiquitin protein ligase,NM_004562.3,NP_004553.2,ENST00000366898.6,ENSP00000355865.1,MANE Select,NC_000006.12,161347417,162727766,-,chr6 +15178,15178,GeneID:135138,ENSG00000112530.13,HGNC:19152,PACRG,parkin coregulated,NM_001080379.2,NP_001073848.1,ENST00000366888.7,ENSP00000355854.2,MANE Select,NC_000006.12,162727974,163315500,+,chr6 +16561,16561,GeneID:221935,ENSG00000146555.20,HGNC:19307,SDK1,sidekick cell adhesion molecule 1,NM_152744.4,NP_689957.3,ENST00000404826.7,ENSP00000385899.2,MANE Select,NC_000007.14,3301252,4269000,+,chr7 +1135,1135,GeneID:1607,ENSG00000136267.14,HGNC:2850,DGKB,diacylglycerol kinase beta,NM_001350709.2,NP_001337638.1,ENST00000402815.6,ENSP00000384909.1,MANE Select,NC_000007.14,14145049,14902751,-,chr7 +6610,6610,GeneID:9844,ENSG00000155849.16,HGNC:16286,ELMO1,engulfment and cell motility 1,NM_014800.11,NP_055615.8,ENST00000310758.9,ENSP00000312185.4,MANE Select,NC_000007.14,36852906,37448864,-,chr7 +12571,12571,GeneID:79783,ENSG00000175600.16,HGNC:16001,SUGCT,succinyl-CoA:glutarate-CoA transferase,NM_001193313.2,NP_001180242.2,ENST00000335693.9,ENSP00000338475.5,MANE Select,NC_000007.14,40135005,40860763,+,chr7 +8693,8693,GeneID:26053,ENSG00000158321.19,HGNC:14262,AUTS2,activator of transcription and developmental regulator AUTS2,NM_015570.4,NP_056385.1,ENST00000342771.10,ENSP00000344087.4,MANE Select,NC_000007.14,69598475,70793506,+,chr7 +12012,12012,GeneID:64409,ENSG00000185274.13,HGNC:16347,GALNT17,polypeptide N-acetylgalactosaminyltransferase 17,NM_022479.3,NP_071924.1,ENST00000333538.10,ENSP00000329654.5,MANE Select,NC_000007.14,71132144,71713599,+,chr7 +13188,13188,GeneID:83698,ENSG00000183166.12,HGNC:13248,CALN1,calneuron 1,NM_031468.4,NP_113656.2,ENST00000395275.7,ENSP00000378690.2,MANE Select,NC_000007.14,71779491,72412338,-,chr7 +6627,6627,GeneID:9863,ENSG00000187391.23,HGNC:18957,MAGI2,"membrane associated 
guanylate kinase, WW and PDZ domain containing 2",NM_012301.4,NP_036433.2,ENST00000354212.9,ENSP00000346151.4,MANE Select,NC_000007.14,78017055,79453667,-,chr7 +16419,16419,GeneID:219578,ENSG00000182348.7,HGNC:21958,ZNF804B,zinc finger protein 804B,NM_181646.5,NP_857597.1,ENST00000333190.5,ENSP00000329638.4,MANE Select,NC_000007.14,88759700,89338528,+,chr7 +3485,3485,GeneID:5218,ENSG00000058091.17,HGNC:8883,CDK14,cyclin dependent kinase 14,NM_001287135.2,NP_001274064.1,ENST00000380050.8,ENSP00000369390.3,MANE Select,NC_000007.14,90596321,91210590,+,chr7 +17711,17711,GeneID:375612,ENSG00000187416.13,HGNC:6589,LHFPL3,LHFPL tetraspan subfamily member 3,NM_199000.3,NP_945351.1,ENST00000424859.7,ENSP00000393128.2,MANE Select,NC_000007.14,104328603,104908561,+,chr7 +13270,13270,GeneID:83943,ENSG00000184903.10,HGNC:14598,IMMP2L,inner mitochondrial membrane peptidase subunit 2,NM_032549.4,NP_115938.1,ENST00000405709.7,ENSP00000384966.2,MANE Select,NC_000007.14,110662644,111562492,-,chr7 +14243,14243,GeneID:93664,ENSG00000081803.17,HGNC:16018,CADPS2,calcium dependent secretion activator 2,NM_017954.11,NP_060424.9,ENST00000449022.7,ENSP00000398481.2,MANE Select,NC_000007.14,122318411,122886460,-,chr7 +1978,1978,GeneID:2918,ENSG00000179603.19,HGNC:4600,GRM8,glutamate metabotropic receptor 8,NM_000845.3,NP_000836.2,ENST00000339582.7,ENSP00000344173.2,MANE Select,NC_000007.14,126438598,127252941,-,chr7 +11794,11794,GeneID:60412,ENSG00000131558.15,HGNC:30389,EXOC4,exocyst complex component 4,NM_021807.4,NP_068579.3,ENST00000253861.5,ENSP00000253861.4,MANE Select,NC_000007.14,133253078,134065761,+,chr7 +8687,8687,GeneID:26047,ENSG00000174469.23,HGNC:13830,CNTNAP2,contactin associated protein 2,NM_014141.6,NP_054860.1,ENST00000361727.8,ENSP00000354778.3,MANE Select,NC_000007.14,146116801,148420998,+,chr7 +1261,1261,GeneID:1804,ENSG00000130226.18,HGNC:3010,DPP6,dipeptidyl peptidase like 6,NM_130797.4,NP_570629.2,ENST00000377770.8,ENSP00000367001.3,MANE 
Select,NC_000007.14,154052398,154894285,+,chr7 +3922,3922,GeneID:5799,ENSG00000155093.20,HGNC:9677,PTPRN2,protein tyrosine phosphatase receptor type N2,NM_002847.5,NP_002838.2,ENST00000389418.9,ENSP00000374069.4,MANE Select,NC_000007.14,157539056,158587823,-,chr7 +6123,6123,GeneID:9228,ENSG00000198010.13,HGNC:2906,DLGAP2,DLG associated protein 2,NM_001346810.2,NP_001333739.1,ENST00000637795.2,ENSP00000489774.1,MANE Select,NC_000008.11,737628,1708476,+,chr8 +12033,12033,GeneID:64478,ENSG00000183117.20,HGNC:14026,CSMD1,CUB and Sushi multiple domains 1,NM_033225.6,NP_150094.5,ENST00000635120.2,ENSP00000489225.1,MANE Select,NC_000008.11,2935361,4994914,-,chr8 +15225,15225,GeneID:137868,ENSG00000185053.14,HGNC:14075,SGCZ,sarcoglycan zeta,NM_139167.4,NP_631906.2,ENST00000382080.6,ENSP00000371512.1,MANE Select,NC_000008.11,14084845,15238431,-,chr8 +15230,15230,GeneID:137970,ENSG00000156687.11,HGNC:18634,UNC5D,unc-5 netrin receptor D,NM_080872.4,NP_543148.2,ENST00000404895.7,ENSP00000385143.2,MANE Select,NC_000008.11,35235475,35796540,+,chr8 +10032,10032,GeneID:54212,ENSG00000147481.17,HGNC:13740,SNTG1,syntrophin gamma 1,NM_018967.5,NP_061840.1,ENST00000642720.2,ENSP00000493900.1,MANE Select,NC_000008.11,49911407,50796692,+,chr8 +17175,17175,GeneID:286183,ENSG00000185942.13,HGNC:26829,NKAIN3,sodium/potassium transporting ATPase interacting 3,NM_001304533.3,NP_001291462.1,ENST00000623646.3,ENSP00000501908.1,MANE Select,NC_000008.11,62248854,62984904,+,chr8 +15233,15233,GeneID:138046,ENSG00000184672.12,HGNC:27036,RALYL,RALY RNA binding protein like,NM_173848.7,NP_776247.3,ENST00000521268.6,ENSP00000430367.1,MANE Select,NC_000008.11,84183274,84921844,+,chr8 +15853,15853,GeneID:157680,ENSG00000132549.20,HGNC:2183,VPS13B,vacuolar protein sorting 13 homolog B,NM_017890.5,NP_060360.3,ENST00000358544.7,ENSP00000351346.2,MANE Plus Clinical,NC_000008.11,99013274,99877580,+,chr8 +15852,15852,GeneID:157680,ENSG00000132549.20,HGNC:2183,VPS13B,vacuolar protein sorting 13 homolog 
B,NM_152564.5,NP_689777.3,ENST00000357162.7,ENSP00000349685.2,MANE Select,NC_000008.11,99013274,99877580,+,chr8 +6483,6483,GeneID:9699,ENSG00000176406.25,HGNC:17283,RIMS2,regulating synaptic membrane exocytosis 2,NM_001348484.3,NP_001335413.1,ENST00000696799.1,ENSP00000512879.1,MANE Select,NC_000008.11,103500610,104254430,+,chr8 +14389,14389,GeneID:114788,ENSG00000164796.18,HGNC:19291,CSMD3,CUB and Sushi multiple domains 3,NM_198123.2,NP_937756.1,ENST00000297405.10,ENSP00000297405.5,MANE Select,NC_000008.11,112222928,113436939,-,chr8 +13186,13186,GeneID:83696,ENSG00000167632.18,HGNC:30832,TRAPPC9,trafficking protein particle complex subunit 9,NM_001160372.4,NP_001153844.1,ENST00000438773.4,ENSP00000405060.3,MANE Select,NC_000008.11,139727725,140457744,-,chr8 +3912,3912,GeneID:5789,ENSG00000153707.19,HGNC:9668,PTPRD,protein tyrosine phosphatase receptor type D,NM_002839.4,NP_002830.1,ENST00000381196.9,ENSP00000370593.3,MANE Select,NC_000009.12,8314246,10613002,-,chr9 +15866,15866,GeneID:158038,ENSG00000174482.11,HGNC:21207,LINGO2,leucine rich repeat and Ig domain containing 2,NM_001258282.3,NP_001245211.1,ENST00000698399.1,ENSP00000513694.1,MANE Select,NC_000009.12,27937617,29213601,-,chr9 +12753,12753,GeneID:80036,ENSG00000083067.27,HGNC:17992,TRPM3,transient receptor potential cation channel subfamily M member 3,NM_001366145.2,NP_001353074.1,ENST00000677713.2,ENSP00000503830.2,MANE Select,NC_000009.12,70529060,71121621,-,chr9 +8128,8128,GeneID:23245,ENSG00000148219.18,HGNC:17021,ASTN2,astrotactin 2,NM_001365068.1,NP_001351997.1,ENST00000313400.9,ENSP00000314038.4,MANE Select,NC_000009.12,116423112,117415057,-,chr9 +11650,11650,GeneID:57706,ENSG00000119522.18,HGNC:29324,DENND1A,DENN domain containing 1A,NM_001352964.2,NP_001339893.1,ENST00000394215.7,ENSP00000377763.4,MANE Select,NC_000009.12,123379658,123930126,-,chr9 +283,283,GeneID:395,ENSG00000047648.23,HGNC:676,ARHGAP6,Rho GTPase activating protein 
6,NM_013427.3,NP_038286.2,ENST00000337414.9,ENSP00000338967.4,MANE Select,NC_000023.11,11137544,11665920,-,chrX +6537,6537,GeneID:9758,ENSG00000169933.17,HGNC:29007,FRMPD4,FERM and PDZ domain containing 4,NM_001368397.1,NP_001355326.1,ENST00000675598.1,ENSP00000502607.1,MANE Select,NC_000023.11,12138473,12724523,+,chrX +7636,7636,GeneID:11141,ENSG00000169306.11,HGNC:5996,IL1RAPL1,interleukin 1 receptor accessory protein like 1,NM_014271.4,NP_055086.1,ENST00000378993.6,ENSP00000368278.1,MANE Select,NC_000023.11,28587446,29956718,+,chrX +1224,1224,GeneID:1756,ENSG00000198947.18,HGNC:2928,DMD,dystrophin,NM_004006.3,NP_003997.2,ENST00000357033.9,ENSP00000354923.3,MANE Select,NC_000023.11,31119222,33211549,-,chrX +14563,14563,GeneID:117154,ENSG00000126733.22,HGNC:16814,DACH2,dachshund family transcription factor 2,NM_053281.3,NP_444511.1,ENST00000373125.9,ENSP00000362217.4,MANE Select,NC_000023.11,86148451,86832602,+,chrX +9081,9081,GeneID:27328,ENSG00000102290.23,HGNC:8656,PCDH11X,protocadherin 11 X-linked,NM_032968.5,NP_116750.1,ENST00000682573.1,ENSP00000507225.1,MANE Select,NC_000023.11,91779375,92623230,+,chrX +1204,1204,GeneID:1730,ENSG00000147202.19,HGNC:2877,DIAPH2,diaphanous related formin 2,NM_006729.5,NP_006720.1,ENST00000324765.13,ENSP00000321348.8,MANE Select,NC_000023.11,96684842,97604997,+,chrX +8808,8808,GeneID:26280,ENSG00000189108.14,HGNC:5997,IL1RAPL2,interleukin 1 receptor accessory protein like 2,NM_017416.2,NP_059112.1,ENST00000372582.6,ENSP00000361663.1,MANE Select,NC_000023.11,104566199,105767829,+,chrX +6889,6889,GeneID:10178,ENSG00000009694.14,HGNC:8117,TENM1,teneurin transmembrane protein 1,NM_001163278.2,NP_001156750.1,ENST00000422452.4,ENSP00000403954.4,MANE Select,NC_000023.11,124375903,125204312,-,chrX +13107,13107,GeneID:83259,ENSG00000099715.15,HGNC:15813,PCDH11Y,protocadherin 11 Y-linked,NM_001395587.1,NP_001382516.1,ENST00000698851.1,ENSP00000513983.1,MANE Select,NC_000024.10,5000296,5742224,+,chrY diff --git a/rna2protexpression.py 
"""Predict tissue-specific protein expression from a Borzoi RNA expression (TPM) table."""
import argparse
import warnings

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from axial_attention import AxialAttention
from joblib import load
from sklearn.preprocessing import StandardScaler
from thefuzz import fuzz
from torch.utils.data import DataLoader, TensorDataset

# Load data baked into the docker container.
# Tissue and ENSG label encoders (used through .transform / .classes_ below).
tissue2number = load('/home/omic/rna2protexpresson/tissue2number.joblib')
ensg2number = load('/home/omic/rna2protexpresson/ensg2number.joblib')
# Model weights.
model_path = '/home/omic/rna2protexpresson/go_term_protein_expression_model.pth'
# Use the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# GO-term feature table; its index is joined against the TPM index in __main__.
go_df_work = pd.read_csv('/home/omic/rna2protexpresson/go_df_work.csv', index_col=0)


class TissueSpecificProteinExpressionModel(nn.Module):
    """Axial-attention model predicting protein expression per gene and tissue.

    The RNA expression matrix is concatenated channel-wise with three learned
    embeddings (gene identity, GO-term features, tissue identity), passed
    through axial attention layers and reduced to a single channel by a 1x1
    convolution.

    Args:
        emb_dim: Channel width of the tissue embedding. The gene and GO
            embeddings each use ``int(emb_dim / 2)`` channels and are
            concatenated, so ``emb_dim`` must be even for the ``expand`` in
            ``forward`` to succeed.
        dim_heads: Per-head dimension handed to every ``AxialAttention``.
        n_tissues: Number of rows in the tissue embedding table.
        n_ensg: Number of rows in the gene embedding table.
        go_size: Number of GO-term input features per gene. Defaults to the
            column count of ``go_df_work`` (evaluated once, at class
            definition time).
    """

    def __init__(self, emb_dim=64, dim_heads=512, n_tissues=29, n_ensg=14601, go_size=go_df_work.shape[1]):
        super().__init__()
        self.emb_dim = emb_dim
        # Attribute names are the state_dict keys of the saved checkpoint;
        # do not rename them.
        self.emb_ensg = nn.Embedding(n_ensg, int(emb_dim / 2))
        self.emb_tiss = nn.Embedding(n_tissues, emb_dim)
        self.attn1 = AxialAttention(
            dim=1,                # embedding dimension
            dim_index=1,          # where is the embedding dimension
            dim_heads=dim_heads,  # dimension of each head. defaults to dim // heads if not supplied
            heads=1,              # number of heads for multi-head attention
            num_dimensions=2,     # number of axial dimensions (images is 2, video is 3, or more)
            sum_axial_out=True,   # whether to sum the contributions of attention on each axis, or to run the input through them sequentially. defaults to true
        )
        # After concatenation there are 1 (RNA) + emb_dim (gene+GO) + emb_dim (tissue) channels.
        self.attn2 = AxialAttention(dim=emb_dim * 2 + 1, dim_index=1, dim_heads=dim_heads, heads=1, num_dimensions=2, sum_axial_out=True)
        self.attn3 = AxialAttention(dim=emb_dim * 2 + 1, dim_index=1, dim_heads=dim_heads, heads=1, num_dimensions=2, sum_axial_out=True)
        self.con = nn.Conv2d(emb_dim * 2 + 1, 1, 1, stride=1)
        self.batch_norm1 = nn.BatchNorm2d(emb_dim * 2 + 1)
        self.batch_norm2 = nn.BatchNorm2d(emb_dim * 2 + 1)
        self.emb_go = nn.Linear(go_size, int(emb_dim / 2))

    def forward(self, x, emb_pos_prot, emb_pos_tissu, go_term):
        """Predict protein expression.

        Args:
            x: RNA expression, laid out as (batch, genes, tissues).
            emb_pos_prot: Integer gene ids, (batch, genes).
            emb_pos_tissu: Integer tissue ids, (batch, tissues).
            go_term: GO-term features, (batch, genes, go_size).

        Returns:
            Tensor with the channel axis squeezed out again, i.e. one value
            per (batch, gene, tissue).
        """
        n_batch = x.shape[0]
        n_prot = emb_pos_prot.shape[1]
        n_tissue = emb_pos_tissu.shape[1]

        # GO-term embedding -> (batch, emb_dim/2, genes, 1)
        emb_in_go = self.emb_go(go_term)
        emb_in_go = torch.unsqueeze(torch.permute(emb_in_go, (0, 2, 1)), -1)
        # Gene embedding -> (batch, emb_dim/2, genes, 1)
        emb_in_p = self.emb_ensg(emb_pos_prot)
        emb_in_p = torch.unsqueeze(torch.permute(emb_in_p, (0, 2, 1)), -1)
        # Stack GO and gene channels, then broadcast across the tissue axis.
        emb_in_p = torch.cat([emb_in_go, emb_in_p], dim=1)
        emb_in_p = emb_in_p.expand(n_batch, self.emb_dim, n_prot, n_tissue)
        # Tissue embedding -> (batch, emb_dim, 1, tissues), broadcast across genes.
        emb_in_t = self.emb_tiss(emb_pos_tissu)
        emb_in_t = torch.unsqueeze(torch.permute(emb_in_t, (0, 2, 1)), -2)
        emb_in_t = emb_in_t.expand(n_batch, self.emb_dim, n_prot, n_tissue)
        # RNA expression gets a single channel of its own.
        x = torch.unsqueeze(x, 1)
        x = self.attn1(x)
        # Concatenate RNA expression, gene(+GO) embedding and tissue embedding.
        x = torch.cat([x, emb_in_p, emb_in_t], dim=1)
        x = self.batch_norm1(x)
        x = self.attn2(x)
        x = self.batch_norm2(x)
        x = self.attn3(x)
        x = self.con(x)
        x = torch.squeeze(x, 1)
        return x


def run_model(model, X_test, goterm, scaler):
    """Run ``model`` on a list of expression tables and return its predictions.

    Uses the module-level ``tissue2number`` / ``ensg2number`` encoders and
    ``device``.

    Args:
        model: Network called as ``model(x, gene_ids, tissue_ids, go_terms)``.
        X_test: Non-empty list of DataFrames (genes as index, tissues as
            columns), all of the same shape.
        goterm: One GO-term feature table per entry of ``X_test``.
        scaler: Object exposing ``fit_transform`` (the scaler stored in the
            checkpoint).

    Returns:
        numpy.ndarray: Predictions of every table, concatenated along axis 0.
        For a single table (the only way this module calls it) this is exactly
        the model output for that table.

    Raises:
        ValueError: If ``X_test`` is empty, or (raised by the encoders) if a
            gene or tissue label was not seen when the encoder was fitted.
    """
    if len(X_test) == 0:
        raise ValueError("X_test must contain at least one expression table")

    model.eval()
    # NOTE(review): the scaler is re-fitted on every table instead of reusing
    # the statistics stored in the checkpoint. Kept as-is because the feature
    # count seen here (number of tissue columns) may differ from the one the
    # stored scaler was fitted with -- confirm against the training code.
    scaled = [
        scaler.fit_transform(np.array(tbl).reshape(tbl.shape[0], -1)).reshape(tbl.shape)
        for tbl in X_test
    ]
    tissue_ids = [tissue2number.transform(tbl.columns) for tbl in X_test]
    # transform, not fit_transform: re-fitting the encoder here would renumber
    # the genes of each chunk from 0 and feed the wrong rows of the trained
    # gene embedding (and would overwrite the shared encoder).
    ensg_ids = [ensg2number.transform(tbl.index) for tbl in X_test]

    X_test_tensor = torch.as_tensor(np.asarray(scaled), dtype=torch.float32).to(device)
    X_test_tissues = torch.as_tensor(np.asarray(tissue_ids), dtype=torch.int32).to(device)
    X_test_ensg = torch.as_tensor(np.asarray(ensg_ids), dtype=torch.int32).to(device)
    X_go = torch.as_tensor(np.array(goterm), dtype=torch.float32).to(device)

    test_dataset = TensorDataset(X_test_tensor, X_test_ensg, X_test_tissues, X_go)
    test_loader = DataLoader(test_dataset, batch_size=1)

    batch_preds = []
    with torch.no_grad():
        for batch_X, ensg_numb, tissue_numb, batch_go in test_loader:
            batch_preds.append(model(batch_X, ensg_numb, tissue_numb, batch_go).cpu().numpy())

    return np.concatenate(batch_preds, axis=0)


def _take_rows(table, start, stop):
    """Return rows ``[start, stop)`` of a DataFrame or of any array-like."""
    if hasattr(table, 'iloc'):
        return table.iloc[start:stop]
    return np.asarray(table)[start:stop]


def run_save_protrin_esxpression_predicition(TPM, goterm, model, scaler, tissue2number, ensg2number, split_len, name):
    """Predict protein expression for ``TPM`` and write it to a CSV file.

    Args:
        TPM: DataFrame of RNA expression, genes as index; column names are
            expected to look like ``<prefix>:<tissue>``. Not modified.
        goterm: GO-term features, one row per row of ``TPM`` in the same order.
        model: Trained model, forwarded to ``run_model``.
        scaler: Scaler forwarded to ``run_model``.
        tissue2number: Tissue encoder; only columns found in its ``classes_``
            are kept.
        ensg2number: Accepted for interface compatibility; ``run_model`` uses
            the module-level gene encoder.
        split_len: Maximum number of genes per model call.
        name: Output prefix; the result goes to
            ``{name}_Protein_Expression_log2.csv``.

    Raises:
        ValueError: If ``split_len`` is smaller than 1, if no gene rows or no
            known tissue columns remain, or if ``goterm`` and ``TPM`` differ
            in length.
    """
    if split_len < 1:
        raise ValueError(f"split_len must be >= 1, got {split_len!r}")

    # Work on a copy so the caller's column labels are left untouched.
    TPM = TPM.copy()
    # Keep the tissue part of '<prefix>:<tissue>'; labels without ':' are kept whole.
    TPM.columns = [col.split(':')[1] if ':' in col else col for col in TPM.columns]
    # Drop tissues/columns not used in the model.
    TPM = TPM[TPM.columns.intersection(tissue2number.classes_)]

    n_rows = TPM.shape[0]
    if n_rows == 0 or TPM.shape[1] == 0:
        raise ValueError("no gene rows or no known tissue columns left to predict on")
    if len(goterm) != n_rows:
        raise ValueError(f"goterm has {len(goterm)} rows but TPM has {n_rows}")

    # Transform to log2 / used to train model.
    # NOTE(review): a TPM of 0 becomes -inf here and no pseudo-count is added;
    # confirm the upstream table never contains zeros.
    TPM = np.log2(TPM)

    # Split the genes into chunks of at most split_len rows. The boundaries
    # are those np.array_split would produce, but the tables are sliced with
    # .iloc rather than passing a DataFrame to np.array_split.
    n_chunks = int(np.ceil(n_rows / split_len))
    pred = []
    for rows in np.array_split(np.arange(n_rows), n_chunks):
        start, stop = int(rows[0]), int(rows[-1]) + 1
        pred.append(run_model(model, [_take_rows(TPM, start, stop)], [_take_rows(goterm, start, stop)], scaler))

    # Each chunk is (1, genes_in_chunk, tissues): join on the gene axis and
    # drop the batch axis explicitly (np.squeeze would also drop a gene or
    # tissue axis of length 1).
    values = np.concatenate(pred, 1)[0]
    pred = pd.DataFrame(values, index=TPM.index, columns=TPM.columns)

    # Save predictions (log2).
    pred.to_csv(f'{name}_Protein_Expression_log2.csv')


# NOTE(review): torch.load unpickles arbitrary objects (the checkpoint carries
# a scaler object besides the weights), so only load trusted files. Newer
# PyTorch releases may need weights_only=False for this to work -- confirm
# against the version pinned in the image.
checkpoint = torch.load(model_path, map_location=device)
model = TissueSpecificProteinExpressionModel().to(device)
model.load_state_dict(checkpoint['model_state_dict'])
scaler = checkpoint['scaler']

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Predict protein expression from RNA expression')
    parser.add_argument('--borzoi_ouput', help='Output from borzoi step', required=True)

    args = parser.parse_args()

    # Borzoi output TPM.
    TPM = pd.read_csv(args.borzoi_ouput, index_col=0)
    # Output prefix: the input path up to '_TPM.csv'.
    out_name = args.borzoi_ouput.split('_TPM.csv')[0]
    n_expr_cols = TPM.shape[1]
    # One inner join keeps only genes that have GO-term features; the first
    # n_expr_cols columns are the expression values, the rest the GO terms.
    merged = pd.merge(TPM, go_df_work, left_index=True, right_index=True)
    # Genes unknown to the gene encoder cannot be embedded; drop them up front
    # (a no-op when go_df_work only lists genes the model was trained on).
    known = merged.index.isin(ensg2number.classes_)
    if not known.all():
        warnings.warn(f"dropping {int((~known).sum())} gene(s) unknown to the gene encoder")
        merged = merged.loc[known]
    X_go = merged.iloc[:, n_expr_cols:]
    TPM = merged.iloc[:, :n_expr_cols]
    # Run and save output to <out_name>_Protein_Expression_log2.csv.
    run_save_protrin_esxpression_predicition(TPM, X_go, model, scaler, tissue2number, ensg2number, 2500, out_name)
// the population, expressed as age-age
//params.state = '"District of Columbia"' //'Hawaii' //'Texas' //'Georgia'
//params.city = '' //'Washington' //'' //'Honolulu' //'Houston' //'Atlanta'

// Disease to simulate; 'healthy' selects the healthy branch of the workflow
// below, any other value the disease branch. Earlier choices are kept in the
// trailing comments.
params.disease = 'schizophrenia' //'schizophrenia' //'healthy' //'leukaemia' //'leukaemia' //'Purpura and other haemorrhagic conditions' //['tongue cancer', 'dementia', 'arthritis'] //'tongue cancer' //'dementia'


//params.n_var = 100


params.project_name = 'test'

//CIBERSORT
// NOTE(security): these are account credentials committed in plain text.
// They should be supplied at run time (e.g. -params-file or environment /
// secrets) and the committed token rotated.
params.cibersortx_username = "gabriel.richman.2009@anderson.ucla.edu"
params.cibersortx_token = "b5b39e563fb947df4cfd4843d40fdb99"



// ====================== FILEPATHS HERE =========================================================================
// The trailing comments on each line record previously used locations.


params.imputed_store = '/mnt/Avatar/imputed/ukbb/imputed' //'/rosalind/ukbb/imputed'
params.pheno_store ='/mnt/dreamdock-data/digital-patient-data/data/ukbb_phenotypes_filtered.csv' // '/mnt/Avatar/dd/synthea/metadata/ukbb_phenotypes_filtered.csv'

//CORTO
params.regulon = '/mnt/dreamdock-data/digital-patient-data/data/regulon.rda' // '/Workspace/next/registry/pipelines/digital_patient/regulon.rda'

params.healthy_dir = '/mnt/dreamdock-data/digital-patient-data/healthy' // '/Workspace/next/registry/pipelines/digital_patient' //'/mnt/Avatar/digital_patient' // data copy is here

params.synthea_support_dir = '/mnt/Avatar/dd/synthea/supporting-data/'

//BORZOI
params.mane = '/mnt/dreamdock-data/digital-patient-data/data/MANE.GRCh38.v1.3.update.tsv' // '/Workspace/next/registry/pipelines/digital_patient/MANE.GRCh38.v1.3.update.tsv'

//CIBERSORT
params.signature_matrix = '/mnt/dreamdock-data/digital-patient-data/data/LM22_sourceGEP_ensg.txt' // "/Workspace/next/registry/pipelines/digital_patient/LM22_sourceGEP_ensg.txt"


params.outdir = '/mnt/dreamdock-data/digital-patient-data/out' // '/mnt/OmicNAS/dd/digital_patient/new'
// params.outdir = '/data/digital-patients-data' // '/mnt/OmicNAS/dd/digital_patient/new'
params.ecotyper_outdir = "${params.outdir}/ecotyper"

// ====================== FILTERING PARAMETERS ====================================================================

// Enable/disable filtering (gates the whole FILTERING STAGE of the workflow)
params.enable_filtering = true

// Filtering thresholds based on Gabe's requirements
params.top_n_variants = 500
params.top_n_genes = 1000
params.transcriptome_log2fc_threshold = 1.5
params.top_n_proteins = 500
params.top_n_immune_cells = 20
params.top_n_metabolites = 50
params.metabolome_pvalue_threshold = 0.05

// ===============================================================================================================

//SYNTHEA
include {get_disease_stats_no_patients} from './main_synthea.nf'
include {generate_m_variants_cudf} from './main_synthea.nf'
include {generate_f_variants_cudf} from './main_synthea.nf'
include {make_vcfs} from './main_synthea.nf'
include {generate_m_healthy_cudf} from './main_synthea.nf'
include {generate_f_healthy_cudf} from './main_synthea.nf'

//BORZOI
include {FILTER_VCF} from './main_borzoi.nf'
include {PREDICT_EXPRESSION} from './main_borzoi.nf'
include {CREATE_PROTEIN_CLUSTER} from './main_borzoi.nf'

//VCF2PROT
include {VCF2PROT} from './main_vcf2prot.nf'

//RNA2PROTEINEXPRESSION
include {RNA2PROTEXPRESSION} from './main_rna2proteinexpression'

//CORTO
include {CORTO} from './main_corto.nf'

//CIBERSORT
include {CONVERT_TO_TXT} from './main_cibersortx.nf'
include {CIBERSORTx_FRACTIONS} from './main_cibersortx.nf'
include {CIBERSORTx_HIRES} from './main_cibersortx.nf'
include {ADD_TISSUE_NAMES_TO_CIBERSORTX} from './main_cibersortx.nf'

//FILTERING PROCESSES
include {FILTER_VARIANTS} from './main_filter_outputs.nf'
include {FILTER_TRANSCRIPTOME} from './main_filter_outputs.nf'
include {FILTER_PROTEOME} from './main_filter_outputs.nf'
include {FILTER_IMMUNE_CELLS} from './main_filter_outputs.nf'
include {FILTER_METABOLOME} from './main_filter_outputs.nf'
include {FILTER_MUTATED_PROTEINS} from './main_filter_outputs.nf'
include {CREATE_SUMMARY_REPORT} from './main_filter_outputs.nf'


// Entry workflow: generate synthetic patients (variants -> VCFs), predict
// RNA expression, derive protein / CORTO / CIBERSORTx outputs from it, and
// optionally filter everything and build a summary report.
workflow {
    // Resolve the configured locations once with file().
    pheno_store_ch = file(params.pheno_store)
    imputed_store_ch = file(params.imputed_store)
    synthea_support_ch = file(params.synthea_support_dir)
    regulon_ch = file(params.regulon)
    signature_matrix_ch = file(params.signature_matrix)
    mane_ch = file(params.mane)
    health_dir_ch = file(params.healthy_dir)

    //SYNTHEA
    // Both branches end with txt_ch holding the female and male outputs,
    // mixed and flattened into individual items.
    switch (params.disease) {
        case 'healthy':
            //healthy
            generate_m_healthy_cudf(health_dir_ch)
            generate_f_healthy_cudf(health_dir_ch)
            m_healthy = generate_m_healthy_cudf.out
            f_healthy = generate_f_healthy_cudf.out
            txt_ch = f_healthy.mix(m_healthy).flatten()
            break
        default:
            //disease
            get_disease_stats_no_patients(pheno_store_ch, imputed_store_ch)
            generate_m_variants_cudf(get_disease_stats_no_patients.out)
            generate_f_variants_cudf(get_disease_stats_no_patients.out)
            f_var = generate_f_variants_cudf.out
            m_var = generate_m_variants_cudf.out
            txt_ch = f_var.mix(m_var).flatten()
    }
    make_vcfs(txt_ch, synthea_support_ch)

    // TODO: ADD STEP TO CREATE AVERAGED COHORT (HEALTHY)
    // TODO: ADD STEP TO CREATE AVERAGED DISEASED
    // TODO: ADD STEP TO CALCULATE STATISTICS LOG2FC and ADJ PVAL

    //BORZOI
    // Prepare the VCF file(s)
    //vcf_ch = Channel.fromPath(make_vcfs.out)
    // THIS IS USED TO CREATE PROTEIN CLUSTER ON A SAME BORZOI INPUT AND SPLIT ONES TOO BIG FOR BORZOI (LEN = 524288)
    // THIS STEP NEEDS TO BE RUN TO CREATE DATA FOR NEXT STEP ONLY IF NEW REFERENCE/RNAS ARE GOING TO BE USED
    //CREATE_PROTEIN_CLUSTER(params.mane)
    FILTER_VCF(mane_ch, make_vcfs.out)
    PREDICT_EXPRESSION(FILTER_VCF.out, mane_ch)

    //VCF2PROT
    VCF2PROT(make_vcfs.out, FILTER_VCF.out)

    //RNA2PROTEINEXPRESSION
    // collect() waits for every PREDICT_EXPRESSION output before anything is
    // emitted, and flatten() then re-emits the items one by one, so
    // RNA2PROTEXPRESSION only starts after all Borzoi tasks have finished
    // (intended to avoid CUDA out-of-memory errors).
    // NOTE(review): this alone does not limit RNA2PROTEXPRESSION to one task
    // at a time; confirm the process sets maxForks (or similar) in
    // main_rna2proteinexpression.nf if that is required.
    PREDICT_EXPRESSION.out
        .collect()
        .flatten()
        .set { rna_input }
    RNA2PROTEXPRESSION(rna_input)

    //CORTO
    CORTO(PREDICT_EXPRESSION.out, regulon_ch)

    //CIBERSORT
    // checkIfExists makes the run fail early when the signature matrix is missing.
    signature_file = Channel.fromPath(signature_matrix_ch, checkIfExists: true)
    CONVERT_TO_TXT(PREDICT_EXPRESSION.out)
    CIBERSORTx_FRACTIONS(CONVERT_TO_TXT.out, signature_file)
    CIBERSORTx_HIRES(CONVERT_TO_TXT.out, CIBERSORTx_FRACTIONS.out, signature_file)
    ADD_TISSUE_NAMES_TO_CIBERSORTX(CONVERT_TO_TXT.out, CIBERSORTx_HIRES.out)

    // ==================== FILTERING STAGE ====================
    // Runs only when params.enable_filtering is true.
    if (params.enable_filtering) {
        // Filter variants
        FILTER_VARIANTS(make_vcfs.out)

        // Filter transcriptome
        FILTER_TRANSCRIPTOME(PREDICT_EXPRESSION.out)

        // Filter proteome
        FILTER_PROTEOME(RNA2PROTEXPRESSION.out)

        // Filter immune cells
        FILTER_IMMUNE_CELLS(ADD_TISSUE_NAMES_TO_CIBERSORTX.out)

        // Filter metabolome
        FILTER_METABOLOME(CORTO.out)

        // Filter mutated proteins (uses the named output filtered_vcf of FILTER_VARIANTS)
        FILTER_MUTATED_PROTEINS(VCF2PROT.out, FILTER_VARIANTS.out.filtered_vcf)

        // Create summary report from all filtered outputs
        CREATE_SUMMARY_REPORT(
            FILTER_TRANSCRIPTOME.out,
            FILTER_PROTEOME.out,
            FILTER_IMMUNE_CELLS.out,
            FILTER_METABOLOME.out,
            FILTER_VARIANTS.out.filtered_vcf
        )
    }
}
'healthy' //'schizophrenia' //'healthy' //'leukaemia' //'leukaemia' //'Purpura and other haemorrhagic conditions' //['tongue cancer', 'dementia', 'arthritis'] //'tongue cancer' //'dementia' +params.healthy_dir = '/Workspace/next/registry/pipelines/digital_patient' //'/mnt/Avatar/digital_patient' // data copy is here +//params.n_var = 100 +params.outdir = '/mnt/OmicNAS/dd/digital_patient' +//BORZOI +params.container_borzoi = 'borzoi:latest' +params.containerOptions = '--gpus all --rm -v /mnt:/mnt' +params.project_name = 'test' +params.mane = '/Workspace/next/registry/pipelines/digital_patient/MANE.GRCh38.v1.3.update.tsv' +//VCF2PROT +params.container_vcf2prot = "vcf2prot:latest" +//RNA2PROTEINEXPRESSION +params.container_rna2protexpression = 'rna2protexpression:latest' +params.containerOptions_rna2protexpression = '--gpus all --rm -v /mnt:/mnt -v /dbs:/dbs' +//CORTO +params.container_corto = 'corto:latest' +params.regulon = '/Workspace/next/registry/pipelines/digital_patient/regulon.rda' +//CIBERSORT +params.cibersortx_username = "gabriel.richman.2009@anderson.ucla.edu" +params.cibersortx_token = "b5b39e563fb947df4cfd4843d40fdb99" +params.container_ecotyper = 'ecotyper:latest' +params.signature_matrix = "/Workspace/next/registry/pipelines/digital_patient/LM22_sourceGEP_ensg.txt" +params.ecotyper_outdir = "/mnt/OmicNAS/olamide/ecotyper/results/ecotyper" + + +//SYNTHEA +include {get_disease_stats_no_patients} from './main_synthea.nf' +include {generate_m_variants_cudf} from './main_synthea.nf' +include {generate_f_variants_cudf} from './main_synthea.nf' +include {make_vcfs} from './main_synthea.nf' +include {generate_m_healthy_cudf} from './main_synthea.nf' +include {generate_f_healthy_cudf} from './main_synthea.nf' +//BORZOI +include {FILTER_VCF} from './main_borzoi.nf' +include {PREDICT_EXPRESSION} from './main_borzoi.nf' +include {CREATE_PROTEIN_CLUSTER} from './main_borzoi.nf' +//VCF2PROT +include {VCF2PROT} from './main_vcf2prot.nf' +//RNA2PROTEINEXPRESSION 
+include {RNA2PROTEXPRESSION} from './main_rna2proteinexpression' +//CORTO +include {CORTO} from './main_corto.nf' +//CIBERSORT +include {CONVERT_TO_TXT} from './main_cibersortx.nf' +include {CIBERSORTx_FRACTIONS} from './main_cibersortx.nf' +include {CIBERSORTx_HIRES} from './main_cibersortx.nf' +include {ADD_TISSUE_NAMES_TO_CIBERSORTX} from './main_cibersortx.nf' + +workflow { + //SYNTHEA + switch (params.disease) { + case 'healthy': + //healthy + generate_m_healthy_cudf(params.healthy_dir) + generate_f_healthy_cudf(params.healthy_dir) + m_healthy = generate_m_healthy_cudf.out + f_healthy = generate_f_healthy_cudf.out + txt_ch = f_healthy.mix(m_healthy).flatten() + break + default: + //disease + get_disease_stats_no_patients() + generate_m_variants_cudf(get_disease_stats_no_patients.out) + generate_f_variants_cudf(get_disease_stats_no_patients.out) + f_var = generate_f_variants_cudf.out + m_var = generate_m_variants_cudf.out + txt_ch = f_var.mix(m_var).flatten() + } + make_vcfs(txt_ch) + //BORZOI + // Prepare the VCF file(s) + //vcf_ch = Channel.fromPath(make_vcfs.out) + // THIS IS USES TO CREATE PROTEIN CLUSTER ON A SAME BORZOI INPUT AND SPLI ONES TO BIG FOR BORZOI (LEN = 524288) + // THIS STEP NEEDS TO BR RUN TO CREATE DATA FOR NEXT STEP ONLY IF NEW REFERENCE/RNAS ARE GOING TO BE USE + //CREATE_PROTEIN_CLUSTER(params.mane) + FILTER_VCF(params.mane, make_vcfs.out) + PREDICT_EXPRESSION(FILTER_VCF.out, params.mane) + //VCF2PROT + VCF2PROT(make_vcfs.out, FILTER_VCF.out) + //RNA2PROTEINEXPRESSION + //collect is here onyl to stop RNA2PROTEXPRESSION run at the same time as BORZOI (not enough CUDA space) + RNA2PROTEXPRESSION(PREDICT_EXPRESSION.out.collect()) + //CORTO + CORTO(PREDICT_EXPRESSION.out, params.regulon) + //CIBERSORT + signature_file = Channel.fromPath(params.signature_matrix, checkIfExists: true) + CONVERT_TO_TXT(PREDICT_EXPRESSION.out) + CIBERSORTx_FRACTIONS(CONVERT_TO_TXT.out, signature_file) + CIBERSORTx_HIRES(CONVERT_TO_TXT.out, 
CIBERSORTx_FRACTIONS.out, signature_file) + ADD_TISSUE_NAMES_TO_CIBERSORTX(CONVERT_TO_TXT.out, CIBERSORTx_HIRES.out) +} + diff --git a/test_gen_patient.nf b/test_gen_patient.nf new file mode 100644 index 0000000..716ff59 --- /dev/null +++ b/test_gen_patient.nf @@ -0,0 +1,125 @@ +nextflow.enable.dsl=2 + +//SYNTHEA +params.n_pat = 2 //10 //number of patients to generate, must be >= 2 (one male, one female) +params.percent_male = 0.5 // between 0-1, fraction of male patients +params.age = '18-80' //age range of the population, expressed as age-age +// params.state = '"District of Columbia"' //'Hawaii' //'Texas' //'Georgia' +// params.city = '' //'Washington' //'' //'Honolulu' //'Houston' //'Atlanta' +params.imputed_store = '/mnt/Avatar/imputed/ukbb/imputed' //'/rosalind/ukbb/imputed' +params.pheno_store = '/mnt/Avatar/dd/synthea/metadata/ukbb_phenotypes_filtered.csv' + +// Keep both versions of disease params +params.disease = 'schizophrenia' //'schizophrenia' //'healthy' //'leukaemia' //'leukaemia' //'Purpura and other haemorrhagic conditions' //['tongue cancer', 'dementia', 'arthritis'] //'tongue cancer' //'dementia' +params.healthy_dir = '/Workspace/next/registry/pipelines/digital_patient' //'/mnt/Avatar/digital_patient' // data copy is here + +//params.n_var = 100 +params.outdir = '/mnt/OmicNAS/dd/digital_patient' + +//BORZOI +params.container_borzoi = 'borzoi:latest' +params.containerOptions = '--gpus all --rm -v /mnt:/mnt' +params.project_name = 'test' +params.mane = '/Workspace/next/registry/pipelines/digital_patient/MANE.GRCh38.v1.3.update.tsv' + +//VCF2PROT +params.container_vcf2prot = "vcf2prot:latest" + +//RNA2PROTEINEXPRESSION +params.container_rna2protexpression = 'rna2protexpression:latest' +params.containerOptions_rna2protexpression = '--gpus all --rm -v /mnt:/mnt -v /dbs:/dbs' + +//CORTO +params.container_corto = 'corto:latest' +params.regulon = '/Workspace/next/registry/pipelines/digital_patient/regulon.rda' + +//CIBERSORT +params.cibersortx_username = 
"gabriel.richman.2009@anderson.ucla.edu" +params.cibersortx_token = "b5b39e563fb947df4cfd4843d40fdb99" +params.container_ecotyper = 'ecotyper:latest' +params.signature_matrix = "/Workspace/next/registry/pipelines/digital_patient/LM22_sourceGEP_ensg.txt" +params.ecotyper_outdir = "/mnt/OmicNAS/olamide/ecotyper/results/ecotyper" + +//SYNTHEA +include {generate_patients} from './main_synthea.nf' +// include {get_disease_stats_no_patients} from './main_synthea.nf' +// include {generate_m_variants_cudf} from './main_synthea.nf' +// include {generate_f_variants_cudf} from './main_synthea.nf' +// include {make_vcfs} from './main_synthea.nf' +// include {generate_m_healthy_cudf} from './main_synthea.nf' +// include {generate_f_healthy_cudf} from './main_synthea.nf' + +// //BORZOI +// include {FILTER_VCF} from './main_borzoi.nf' +// include {PREDICT_EXPRESSION} from './main_borzoi.nf' +// include {CREATE_PROTEIN_CLUSTER} from './main_borzoi.nf' + +// //VCF2PROT +// include {VCF2PROT} from './main_vcf2prot.nf' + +// //RNA2PROTEINEXPRESSION +// include {RNA2PROTEXPRESSION} from './main_rna2proteinexpression' + +// //CORTO +// include {CORTO} from './main_corto.nf' + +// //CIBERSORT +// include {CONVERT_TO_TXT} from './main_cibersortx.nf' +// include {CIBERSORTx_FRACTIONS} from './main_cibersortx.nf' +// include {CIBERSORTx_HIRES} from './main_cibersortx.nf' +// include {ADD_TISSUE_NAMES_TO_CIBERSORTX} from './main_cibersortx.nf' + +workflow { + generate_patients() + + // //SYNTHEA + // switch (params.disease) { + // case 'healthy': + // //healthy + // generate_m_healthy_cudf(params.healthy_dir) + // generate_f_healthy_cudf(params.healthy_dir) + // m_healthy = generate_m_healthy_cudf.out + // f_healthy = generate_f_healthy_cudf.out + // txt_ch = f_healthy.mix(m_healthy).flatten() + // break + // default: + // //disease + // get_disease_stats_no_patients() + // generate_m_variants_cudf(get_disease_stats_no_patients.out) + // 
generate_f_variants_cudf(get_disease_stats_no_patients.out) + // f_var = generate_f_variants_cudf.out + // m_var = generate_m_variants_cudf.out + // txt_ch = f_var.mix(m_var).flatten() + // } + // make_vcfs(txt_ch) + + // //BORZOI + // // Prepare the VCF file(s) + // //vcf_ch = Channel.fromPath(make_vcfs.out) + // // THIS IS USED TO CREATE PROTEIN CLUSTER ON A SAME BORZOI INPUT AND SPLIT ONES TOO BIG FOR BORZOI (LEN = 524288) + // // THIS STEP NEEDS TO BE RUN TO CREATE DATA FOR NEXT STEP ONLY IF NEW REFERENCE/RNAS ARE GOING TO BE USED + // //CREATE_PROTEIN_CLUSTER(params.mane) + // FILTER_VCF(params.mane, make_vcfs.out) + // PREDICT_EXPRESSION(FILTER_VCF.out, params.mane) + + // //VCF2PROT + // VCF2PROT(make_vcfs.out, FILTER_VCF.out) + + // //RNA2PROTEINEXPRESSION + // // Updated RNA to protein expression step to run only a single process at a time, which avoids CUDA out-of-memory errors. + // PREDICT_EXPRESSION.out + // .collect() + // .flatten() + // .set { rna_input } + // RNA2PROTEXPRESSION(rna_input) + + // //CORTO + // CORTO(PREDICT_EXPRESSION.out, params.regulon) + + // //CIBERSORT + // signature_file = Channel.fromPath(params.signature_matrix, checkIfExists: true) + // CONVERT_TO_TXT(PREDICT_EXPRESSION.out) + // CIBERSORTx_FRACTIONS(CONVERT_TO_TXT.out, signature_file) + // CIBERSORTx_HIRES(CONVERT_TO_TXT.out, CIBERSORTx_FRACTIONS.out, signature_file) + // ADD_TISSUE_NAMES_TO_CIBERSORTX(CONVERT_TO_TXT.out, CIBERSORTx_HIRES.out) +} diff --git a/test_no_mutations.nf b/test_no_mutations.nf new file mode 100644 index 0000000..d56de00 --- /dev/null +++ b/test_no_mutations.nf @@ -0,0 +1,14 @@ +nextflow.enable.dsl=2 + +params.container_borzoi = 'borzoi:latest' +params.containerOptions = '--gpus all --rm -v /mnt:/mnt' +params.outdir = '/mnt/OmicNAS/private/old/gabe/borzoi/outputs' +params.project_name = 'test' + +params.mane = '/Workspace/next/registry/pipelines/digital_patient/MANE.GRCh38.v1.3.update.tsv' + +include {PREDICT_EXPRESSION_NO_MUTATIONS} from 
'./main_no_mutations.nf' + +workflow { + PREDICT_EXPRESSION_NO_MUTATIONS(params.mane) +} diff --git a/tissue2number.joblib b/tissue2number.joblib new file mode 100644 index 0000000000000000000000000000000000000000..e9a11f18f495802cca335ceb06ec226426c4f6ed GIT binary patch literal 952 zcmZ8gNpBM|6mHU{%QW5DyRbvbGN?#My&;6S5R^)}Fc)NbJSH*q*z;^pN~9u{Ln)Q! zhWmTrjT;W&!}z`D?|b{4%v`KDroySV?4jh;sJJk^Fq&~|g&M>}(v(YIwLjGNJH>Ry zjsN9$zx!{#Q_V9;t>u>ZYU7JerAT8{=Eay4f}uZOt@Ud4jWKlm(a@s!hiuC<@O{+^ zEMLv3jMi9;P_^uk7TkBYPf^yUj$1X}qMW%g+K27G5@MSN`~LjrneY2fce-jE`(Cg2 z{x_cR>ArS8ts*L=J|cNe3x8g1UgjC15h}$%Kc z)|MRBlciuZTv15|lBx_gk~y@cl4jUlxRSJkrgcU}IDanTYO+|HwBjVk>h%#^OIo$X zP@&*@s3kR=S_d}*n{hJ|UrFh>5pXkLnU-3?tz-dt-0EUT2c?D0kmKiiCWWAixH9Fi z6$nGzLkHWz`Ju=ZAHz;~kYa>s-cDxFlPuK$?gUI;S|&N%4H$(9aJUy_J)&j|_mg?V z7gVu2QFw5v8f_hQ93F-|EN(yzp7iBW2w@Mg#>dGDvTcru5}|N8aBv(tA+RkZ=L&X1 z7p>DdWkYxpo&wOq)9~bmW1~d|&%#sP8*ISypp581C<-xm@FI;bb}&|bs#P-iA8``K Q3axPO_u^OaOJAn(A0&)DLI3~& literal 0 HcmV?d00001