Update CORTO pipeline with params.json and workflow improvements

2025-03-28 14:45:49 -07:00
parent bf380f2768
commit aaa43a4d54
6 changed files with 127 additions and 92 deletions
--- a/0
+++ b/0
--- a/README.md
+++ b/README.md
--- a/main.nf
+++ b/main.nf
@@ -1,7 +1,7 @@
 nextflow.enable.dsl=2
 process CORTO {
-  container "${params.container}"
+  container "${params.container_corto}"
  containerOptions "${params.containerOptions}"
  publishDir "${params.outdir}/${params.project_name}", mode: 'copy'
  debug true
@@ -9,97 +9,36 @@ process CORTO {
  stageInMode 'copy'
  input:
-    path gene_expression_matrix
+    path TPM
-    path centroid_list
+    path regulon
-    path cnv_data // This could be optional
+    // path TPM_ref
  // Define output channels
  output:
-    path "regulon.rda", emit: regulon
+    path "*_metabolome.csv", emit: csv_metabol
    path "*.csv", emit: csv_regulon
    path "*.log", optional: true, emit: logs // if you have log files
-    script:
+  script:
    """
-    #!/bin/bash
+    #!/usr/bin/Rscript
    # Create an R script
    cat <<EOF > corto_analysis.R
    # Print the R version
    print(R.version.string)
    # Load the corto library
    library(corto)
    library(data.table)
-    # Function to load data
+    TPM <- as.matrix(fread("$TPM"),rownames=1)
-    loadData <- function(file_name, expected_var) {
+    #TPM_ref <- as.matrix(fread("TPM_ref"),rownames=1) #dollar sign is missing
        load(file_name)
        if (exists(expected_var)) {
            data <- get(expected_var)
        } else {
            stop(paste("Object", expected_var, "not found in", file_name))
        }
        return(data)
    }
-    # Load the input matrix
+    #try normalization; don't use
-    inmat <- loadData("${gene_expression_matrix}", "inmat")
+    #TPM <- scale(TPM)
-    print("Dimensions of inmat before any operation:")
+    #TPM_log2 = log2(TPM + 1)
    print(dim(inmat))
-    # Load the centroids
+    load("$regulon")
    centroids <- loadData("${centroid_list}", "centroids")
    print("Length of centroids:")
    print(length(centroids))
-    # Run corto with specified parameters
+    predicted<-mra(TPM, regulon=regulon)
    regulon <- corto(inmat, centroids=centroids, nbootstraps=10, p=1e-30, nthreads=2)
-    # Save the regulon object for later use
+    #use this if we want to compare patient and reference TPM; the output data format is not csv -> work on data saving if needed
-    save(regulon, file="regulon.rda")
+    #predicted<-mra(TPM, TPM_ref, regulon=regulon)
    # Flatten a regulon into a data frame with one row per TF-target pair.
    #
    # regulon: named list; each element is expected to carry a "tfmode" entry
    #   (values named by target) and a "likelihood" entry at least as long.
    # Returns a data frame with columns TF, Target, TFMode and Likelihood.
    # Entries lacking either field, or with no targets, contribute no rows.
    # Stops with an explicit message when a tfmode has no target names or a
    # likelihood is shorter than its tfmode.
    #
    # Fields are read with [["name"]] rather than the dollar operator so this
    # text holds no dollar signs that Nextflow or the enclosing unquoted shell
    # heredoc could interpolate.
    regulon_to_df <- function(regulon) {
        # Typed zero-row frame: the result when no entry qualifies, and the
        # column template for everything bound below.
        empty_df <- data.frame(TF = character(),
                               Target = character(),
                               TFMode = numeric(),
                               Likelihood = numeric(),
                               stringsAsFactors = FALSE)
        # Build each TF table in one vectorised step and bind once at the end,
        # instead of growing the result row by row with rbind().
        per_tf <- lapply(names(regulon), function(tf) {
            tf_data <- regulon[[tf]]
            modes <- tf_data[["tfmode"]]
            likelihoods <- tf_data[["likelihood"]]
            n_targets <- length(modes)
            if (is.null(modes) || is.null(likelihoods) || n_targets == 0L) {
                return(NULL)
            }
            targets <- names(modes)
            if (is.null(targets)) {
                stop("tfmode of ", tf, " has no target names", call. = FALSE)
            }
            if (length(likelihoods) < n_targets) {
                stop("likelihood of ", tf, " is shorter than tfmode",
                     call. = FALSE)
            }
            # Only the first n_targets likelihoods are paired with targets.
            tf_likelihoods <- unlist(likelihoods, use.names = FALSE)
            data.frame(TF = rep(tf, n_targets),
                       Target = targets,
                       TFMode = unlist(modes, use.names = FALSE),
                       Likelihood = tf_likelihoods[seq_len(n_targets)],
                       stringsAsFactors = FALSE)
        })
        kept <- Filter(Negate(is.null), per_tf)
        result_df <- do.call(rbind, c(list(empty_df), kept))
        rownames(result_df) <- NULL
        result_df
    }
    # Check if regulon is a list and transform it
    if (is.list(regulon)) {
        regulon_df <- regulon_to_df(regulon)
        write.csv(regulon_df, file="regulon.csv", row.names=FALSE)
    } else {
        warning("Regulon object is not a list. Custom transformation needed.")
    }
    EOF
    # Execute the R script
    Rscript corto_analysis.R
    # Name the output after the input: take the file name of the TPM path, keep
    # the part before the literal "_TPM.csv" suffix, and append "_metabolome.csv"
    # so the file matches the "*_metabolome.csv" output glob.
    name <- strsplit(basename("$TPM"), split = "_TPM.csv", fixed = TRUE)[[1]][1]
    name <- paste0(name, "_metabolome.csv")
    write.csv(predicted, name)
    """
 }
--- a/nextflow.config
+++ b/nextflow.config
--- a/params.json
+++ b/params.json
@@ -0,0 +1,101 @@
 {
    "params": {
        "TPM": {
            "type": "file",
            "description": "Path to TPM (Transcripts Per Million) CSV file",
            "default": "/data/olamide/corto/corto_metabolite_prediction/20002_1289_female_patient_0_TPM.csv",
            "required": true,
            "pipeline_io": "input",
            "var_name": "params.TPM",
            "examples": [
                "/data/olamide/corto/corto_metabolite_prediction/20002_1289_female_patient_0_TPM.csv",
                "/path/to/your/sample_TPM.csv"
            ],
            "pattern": ".*\\.csv$",
            "enum": [],
            "validation": {},
            "notes": "A CSV file containing TPM values with ENSG IDs as rows and samples as columns"
        },
        "regulon": {
            "type": "file",
            "description": "Path to regulon RDA file",
            "default": "/data/olamide/corto/corto_metabolite_prediction/regulon.rda",
            "required": true,
            "pipeline_io": "input",
            "var_name": "params.regulon",
            "examples": [
                "/data/olamide/corto/corto_metabolite_prediction/regulon.rda",
                "/path/to/your/custom_regulon.rda"
            ],
            "pattern": ".*\\.rda$",
            "enum": [],
            "validation": {},
            "notes": "An R data file containing regulon information for the CORTO analysis"
        },
        "outdir": {
            "type": "folder",
            "description": "Directory for CORTO analysis results",
            "default": "/mnt/OmicNAS/private/old/gabe/corto/outputs",
            "required": true,
            "pipeline_io": "output",
            "var_name": "params.outdir",
            "examples": [
                "/mnt/OmicNAS/private/old/gabe/corto/outputs",
                "/path/to/custom/output"
            ],
            "pattern": ".*",
            "enum": [],
            "validation": {},
            "notes": "Directory where metabolome prediction results will be stored"
        },
        "project_name": {
            "type": "string",
            "description": "Project name for output directory organization",
            "default": "test",
            "required": false,
            "pipeline_io": "parameter",
            "var_name": "params.project_name",
            "examples": [
                "test",
                "patient_analysis",
                "metabolite_prediction_run1"
            ],
            "pattern": "^[A-Za-z0-9_-]+$",
            "enum": [],
            "validation": {},
            "notes": "Name used to create a subdirectory within the output directory"
        },
        "container_corto": {
            "type": "string",
            "description": "Docker container for CORTO",
            "default": "corto:latest",
            "required": false,
            "pipeline_io": "parameter",
            "var_name": "params.container_corto",
            "examples": [
                "corto:latest",
                "yourregistry/corto:v1.0"
            ],
            "pattern": ".*",
            "enum": [],
            "validation": {},
            "notes": "Docker container image to use for the CORTO analysis"
        },
        "containerOptions": {
            "type": "string",
            "description": "Docker container options",
            "default": "--gpus all --rm -v /mnt:/mnt",
            "required": false,
            "pipeline_io": "parameter",
            "var_name": "params.containerOptions",
            "examples": [
                "--gpus all --rm -v /mnt:/mnt",
                "--rm -v /custom/path:/mnt"
            ],
            "pattern": ".*",
            "enum": [],
            "validation": {},
            "notes": "Options to pass to the Docker container runtime"
        }
    }
 }
--- a/test.nf
+++ b/test.nf
@@ -1,22 +1,17 @@
 nextflow.enable.dsl=2
 // GLOBAL CORTO PARAMS
-params.container = 'corto:latest'
+params.container_corto = 'corto:latest'
 params.containerOptions = '--gpus all --rm -v /mnt:/mnt'
 params.outdir = '/mnt/OmicNAS/private/old/gabe/corto/outputs'
 params.project_name = 'test'
 // // INPUTS
-params.gene_expression_matrix = '/mnt/OmicNAS/private/old/gabe/corto/inputs/inmat.rda'
+params.TPM = '/data/olamide/corto/corto_metabolite_prediction/20002_1289_female_patient_0_TPM.csv'
-params.centroid_list = '/mnt/OmicNAS/private/old/gabe/corto/inputs/centroids.rda'
+params.regulon = '/data/olamide/corto/corto_metabolite_prediction/regulon.rda'
-params.cnv_data = '/mnt/OmicNAS/private/old/gabe/corto/inputs/cnvmat.rda'
+//params.TPM_REFERENCE = '/data/olamide/corto/corto_metabolite_prediction/TPM_ENSG_NO_MUTATIONS.csv'
 include {CORTO} from './main.nf'
 workflow {
-  gene_expression_matrix = Channel.fromPath(params.gene_expression_matrix)
+  CORTO(params.TPM, params.regulon)// , params.TPM_REFERENCE)
  centroid_list = Channel.fromPath(params.centroid_list)
  cnv_data = Channel.fromPath(params.cnv_data)
  CORTO(gene_expression_matrix, centroid_list, cnv_data)
 }