Added custom data-preparation and matrix-combination steps intended to perform the same functions as the R code, with improvements. Added a README detailing the code.
This commit is contained in:
410
corto-matrix-combination.py
Normal file
410
corto-matrix-combination.py
Normal file
@@ -0,0 +1,410 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from scipy import stats
|
||||
from sklearn.linear_model import LinearRegression
|
||||
from typing import Dict, List, Optional, Tuple, Literal
|
||||
import logging
|
||||
from concurrent.futures import ProcessPoolExecutor
|
||||
import argparse
|
||||
import warnings

# Globally silence all warnings (e.g. numpy RuntimeWarnings from corrcoef on
# constant/NaN rows during bootstrapping). NOTE(review): blanket suppression
# can hide real problems — consider narrowing to specific warning categories.
warnings.filterwarnings('ignore')
|
||||
|
||||
def setup_logger(verbose: bool = False) -> logging.Logger:
    """Configure and return the shared 'CortoNetwork' logger.

    Args:
        verbose: If True, log at INFO level; otherwise only WARNING and above.

    Returns:
        The configured ``logging.Logger`` instance (a process-wide singleton,
        since ``logging.getLogger`` returns the same object per name).
    """
    logger = logging.getLogger('CortoNetwork')
    logger.setLevel(logging.INFO if verbose else logging.WARNING)

    # Attach the console handler only once: the original added a new handler
    # on every call, so repeated setup duplicated every log line.
    if not logger.handlers:
        handler = logging.StreamHandler()
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        logger.addHandler(handler)

    return logger
|
||||
|
||||
def load_data(expression_file: str, metabolite_file: str, logger: logging.Logger) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load expression and metabolomics CSVs and align them on shared samples.

    Args:
        expression_file: CSV with 'gene_id' and 'transcript_id' columns plus
            one column per sample.
        metabolite_file: CSV with a 'CCLE_ID' sample column plus one numeric
            column per metabolite.
        logger: Logger for progress messages.

    Returns:
        (expression_df, metabolite_df): features-by-samples matrices whose
        columns are the common samples, in identical (sorted) order.

    Raises:
        ValueError: If the two files share no sample identifiers.
    """
    logger.info("Loading expression data...")
    exp_df = pd.read_csv(expression_file)

    # Collapse (gene_id, transcript_id) into a single "gene_transcript" label
    logger.info("Processing expression data...")
    exp_df.set_index(['gene_id', 'transcript_id'], inplace=True)
    exp_df = exp_df.apply(pd.to_numeric, errors='coerce')
    exp_df.index = [f"{idx[0]}_{idx[1]}" for idx in exp_df.index]

    logger.info(f"Expression matrix shape: {exp_df.shape}")

    logger.info("Loading metabolomics data...")
    met_df = pd.read_csv(metabolite_file)

    logger.info("Processing metabolomics data...")
    met_df.set_index('CCLE_ID', inplace=True)
    met_df = met_df.select_dtypes(include=[np.number])
    met_df = met_df.T  # rows = metabolites, columns = samples

    logger.info(f"Metabolomics matrix shape: {met_df.shape}")

    # Align on shared samples. sorted() makes the column order deterministic:
    # the original list(set & set) varied between runs (hash randomization),
    # which made downstream bootstraps irreproducible.
    common_samples = sorted(set(exp_df.columns) & set(met_df.columns))
    if not common_samples:
        raise ValueError("No common samples between matrices")

    logger.info(f"Found {len(common_samples)} common samples")
    exp_df = exp_df[common_samples]
    met_df = met_df[common_samples]

    return exp_df, met_df
|
||||
|
||||
def remove_zero_variance(df: pd.DataFrame, logger: logging.Logger) -> pd.DataFrame:
    """Drop rows (features) whose values are constant across all samples.

    Zero-variance features produce undefined (NaN) Pearson correlations, so
    they are removed before network construction.

    Args:
        df: Features-by-samples matrix.
        logger: Logger for progress messages.

    Returns:
        The subset of ``df`` whose rows have strictly positive variance.
    """
    logger.info(f"Checking variance in matrix of shape {df.shape}")
    variances = df.var(axis=1)  # renamed: `vars` shadowed the builtin
    keep = variances[variances > 0].index
    logger.info(f"Keeping {len(keep)} features with non-zero variance")
    return df.loc[keep]
|
||||
|
||||
def p2r(p: float, n: int) -> float:
    """Translate a two-tailed p-value into the matching |r| cutoff.

    For a Pearson correlation over ``n`` samples, significance uses
    t = r * sqrt((n - 2) / (1 - r**2)); inverting that relation yields the
    smallest |r| whose two-tailed p-value equals ``p``.

    Args:
        p: Two-tailed p-value threshold (0 < p < 1).
        n: Number of samples (must exceed 2 for the t distribution).

    Returns:
        The correlation-magnitude threshold in [0, 1].
    """
    t_crit = stats.t.ppf(p / 2, df=n - 2, loc=0, scale=1)
    return np.sqrt(t_crit ** 2 / (n - 2 + t_crit ** 2))
|
||||
|
||||
def calculate_correlations_corto(expression_df: pd.DataFrame,
                                 metabolite_df: pd.DataFrame,
                                 r_threshold: float,
                                 logger: logging.Logger) -> pd.DataFrame:
    """Build gene-metabolite edges by thresholded Pearson correlation.

    Keeps the two matrices separate (the "corto" approach): only cross
    correlations between gene rows and metabolite rows are computed.

    Args:
        expression_df: Genes-by-samples matrix.
        metabolite_df: Metabolites-by-samples matrix (same sample columns).
        r_threshold: Minimum absolute correlation for an edge.
        logger: Logger for progress messages.

    Returns:
        DataFrame with columns 'source', 'target', 'correlation', 'type'
        (one row per edge; empty if nothing passes the threshold).
    """
    logger.info("Calculating correlations...")

    # Process genes in chunks so the full correlation matrix never has to
    # fit in memory at once.
    chunk_size = 1000  # Adjust based on available memory
    n_chunks = int(np.ceil(len(expression_df) / chunk_size))

    edges = []
    for i in range(n_chunks):
        start_idx = i * chunk_size
        end_idx = min((i + 1) * chunk_size, len(expression_df))

        logger.info(f"Processing chunk {i+1}/{n_chunks}")
        exp_chunk = expression_df.iloc[start_idx:end_idx]

        # Cross-correlation block: rows = chunk genes, cols = metabolites.
        corr_block = np.corrcoef(exp_chunk, metabolite_df)[
            :exp_chunk.shape[0],
            exp_chunk.shape[0]:
        ]

        # Vectorized thresholding replaces the original O(genes*metabolites)
        # Python double loop. np.argwhere iterates row-major, so edge order
        # (gene-major, metabolite-minor) matches the original; NaN compares
        # False against the threshold and is skipped, as before.
        genes = exp_chunk.index
        mets = metabolite_df.index
        for gi, mi in np.argwhere(np.abs(corr_block) >= r_threshold):
            edges.append({
                'source': genes[gi],
                'target': mets[mi],
                'correlation': corr_block[gi, mi],
                'type': 'gene_metabolite'
            })

        # Free the chunk's correlation block promptly
        del corr_block

    logger.info(f"Found {len(edges)} significant correlations")
    return pd.DataFrame(edges)
|
||||
|
||||
def calculate_correlations_combined(expression_df: pd.DataFrame,
                                    metabolite_df: pd.DataFrame,
                                    r_threshold: float,
                                    logger: logging.Logger) -> pd.DataFrame:
    """Build edges from the stacked gene+metabolite matrix (all-vs-all).

    Rows are prefixed 'GENE_' / 'MET_' and concatenated, so gene-gene and
    metabolite-metabolite edges are found in addition to cross edges.

    Args:
        expression_df: Genes-by-samples matrix.
        metabolite_df: Metabolites-by-samples matrix (same sample columns).
        r_threshold: Minimum absolute correlation for an edge.
        logger: Logger for progress messages.

    Returns:
        DataFrame with columns 'source', 'target', 'correlation', 'type',
        where 'type' is one of 'gene_gene', 'metabolite_metabolite',
        'gene_metabolite'. Each unordered pair appears at most once.
    """
    logger.info("Combining matrices...")

    # Prefix row labels so gene and metabolite features stay distinguishable
    exp_prefixed = expression_df.copy()
    exp_prefixed.index = 'GENE_' + exp_prefixed.index

    met_prefixed = metabolite_df.copy()
    met_prefixed.index = 'MET_' + met_prefixed.index

    combined_df = pd.concat([exp_prefixed, met_prefixed])

    logger.info("Calculating correlations...")

    edges = []
    chunk_size = 1000
    n_chunks = int(np.ceil(len(combined_df) / chunk_size))
    all_features = combined_df.index

    for i in range(n_chunks):
        start_idx = i * chunk_size
        end_idx = min((i + 1) * chunk_size, len(combined_df))

        logger.info(f"Processing chunk {i+1}/{n_chunks}")
        chunk = combined_df.iloc[start_idx:end_idx]

        # Cross-correlation block: rows = chunk features, cols = all features
        corr_block = np.corrcoef(chunk, combined_df)[
            :chunk.shape[0],
            chunk.shape[0]:
        ]

        # Vectorized thresholding replaces the original O(n^2) Python double
        # loop; row-major argwhere preserves the original edge order.
        chunk_features = chunk.index
        for ri, ci in np.argwhere(np.abs(corr_block) >= r_threshold):
            source = chunk_features[ri]
            target = all_features[ci]
            # Lexicographic ordering keeps exactly one direction of each pair
            # and drops self-correlations (the original upper-triangle rule)
            if source < target:
                # startswith() instead of substring `in`: a raw feature name
                # that merely contains 'GENE_' can no longer be misclassified.
                # Also renamed `type` -> `edge_type` (shadowed the builtin).
                if source.startswith('GENE_') and target.startswith('GENE_'):
                    edge_type = 'gene_gene'
                elif source.startswith('MET_') and target.startswith('MET_'):
                    edge_type = 'metabolite_metabolite'
                else:
                    edge_type = 'gene_metabolite'
                edges.append({
                    'source': source,
                    'target': target,
                    'correlation': corr_block[ri, ci],
                    'type': edge_type
                })

        del corr_block

    logger.info(f"Found {len(edges)} significant correlations")
    return pd.DataFrame(edges)
|
||||
|
||||
def bootstrap_network(expression_df: pd.DataFrame,
                      metabolite_df: pd.DataFrame,
                      r_threshold: float,
                      seed: int,
                      logger: logging.Logger) -> List[str]:
    """Run one bootstrap iteration of the gene-metabolite network.

    Samples the columns (samples) with replacement, recomputes cross
    correlations, and for each metabolite keeps only the single gene with
    the strongest absolute correlation ("winner").

    Args:
        expression_df: Genes-by-samples matrix.
        metabolite_df: Metabolites-by-samples matrix (same column order).
        r_threshold: Minimum absolute correlation for an edge.
        seed: RNG seed, making each iteration reproducible.
        logger: Logger (unused here; kept for signature compatibility with
            the executor submission in main).

    Returns:
        List of "source_target" keys, one per metabolite that had at least
        one passing edge.
    """
    np.random.seed(seed)

    # Resample the SAME column indices in both matrices so samples stay paired
    sample_idx = np.random.choice(
        expression_df.shape[1],
        size=expression_df.shape[1],
        replace=True
    )
    boot_expression = expression_df.iloc[:, sample_idx]
    boot_metabolite = metabolite_df.iloc[:, sample_idx]

    edges = []
    chunk_size = 1000  # Process in chunks to save memory
    n_chunks = int(np.ceil(len(boot_expression) / chunk_size))

    for i in range(n_chunks):
        start_idx = i * chunk_size
        end_idx = min((i + 1) * chunk_size, len(boot_expression))

        exp_chunk = boot_expression.iloc[start_idx:end_idx]

        # Cross-correlation block for this chunk of genes
        corr_block = np.corrcoef(exp_chunk, boot_metabolite)[
            :exp_chunk.shape[0],
            exp_chunk.shape[0]:
        ]

        # Vectorized threshold scan (replaces the O(cells) Python loop);
        # NaNs from zero-variance resamples compare False and are skipped
        genes = exp_chunk.index
        mets = boot_metabolite.index
        for gi, mi in np.argwhere(np.abs(corr_block) >= r_threshold):
            edges.append({
                'source': genes[gi],
                'target': mets[mi],
                'correlation': corr_block[gi, mi]
            })

        del corr_block

    # For each metabolite keep only its strongest (|r|) gene edge
    winners = []
    edge_df = pd.DataFrame(edges)
    if not edge_df.empty:
        for target in edge_df['target'].unique():
            target_edges = edge_df[edge_df['target'] == target]
            # unique() guarantees target_edges is non-empty here
            winner = target_edges.loc[target_edges['correlation'].abs().idxmax()]
            winners.append(f"{winner['source']}_{winner['target']}")

    return winners
|
||||
|
||||
def main(args):
    """Run the full corto/combined network analysis pipeline.

    Loads and aligns the matrices, builds the correlation network for the
    selected mode, estimates edge support by bootstrapping, and writes a
    network CSV plus a formatted regulon text file.

    Args:
        args: Parsed CLI namespace (mode, expression_file, metabolomics_file,
            p_threshold, nbootstraps, nthreads, verbose).

    Raises:
        Re-raises any exception after logging it, so the CLI exits non-zero.
    """
    logger = setup_logger(args.verbose)

    logger.info(f"Starting corto network analysis in {args.mode} mode...")

    try:
        # Load and align the two matrices on their shared samples
        expression_df, metabolite_df = load_data(args.expression_file, args.metabolomics_file, logger)

        # Drop constant features (they yield undefined correlations)
        expression_df = remove_zero_variance(expression_df, logger)
        metabolite_df = remove_zero_variance(metabolite_df, logger)

        # Convert the p-value cutoff into an |r| threshold for this sample size
        r_threshold = p2r(args.p_threshold, len(metabolite_df.columns))
        logger.info(f"Using correlation threshold: {r_threshold}")

        # Build the initial network in the requested mode
        if args.mode == 'corto':
            edge_df = calculate_correlations_corto(
                expression_df,
                metabolite_df,
                r_threshold,
                logger
            )
        else:
            edge_df = calculate_correlations_combined(
                expression_df,
                metabolite_df,
                r_threshold,
                logger
            )

        # Fail with a clear message instead of a downstream KeyError when
        # nothing passes the threshold
        if edge_df.empty:
            raise ValueError("No edges passed the correlation threshold")

        # Pairs present in the initial network; bootstrap winners outside
        # this set are ignored (vectorized form of the original iterrows loop)
        valid_pairs = set(edge_df['source'] + '_' + edge_df['target'])

        # Track bootstrap occurrence counts per edge, keyed by "source_target"
        occurrences = pd.DataFrame({
            'source': edge_df['source'],
            'target': edge_df['target'],
            'correlation': edge_df['correlation'],
            'type': edge_df['type'],
            'occurrences': 0
        })
        occurrences.index = occurrences['source'] + '_' + occurrences['target']

        logger.info(f"Running {args.nbootstraps} bootstraps...")

        # BUG FIX: the original submitted `bootstrap_network_combined`, which
        # is defined nowhere, so combined mode crashed with NameError. Both
        # modes now use bootstrap_network; for combined mode the matrices are
        # given the same GENE_/MET_ prefixes as the network so winner keys
        # match valid_pairs. NOTE(review): this bootstraps only the
        # gene-metabolite edges — gene-gene and metabolite-metabolite edges
        # keep zero bootstrap support until a combined bootstrap is written.
        if args.mode == 'combined':
            boot_exp = expression_df.copy()
            boot_exp.index = 'GENE_' + boot_exp.index
            boot_met = metabolite_df.copy()
            boot_met.index = 'MET_' + boot_met.index
        else:
            boot_exp, boot_met = expression_df, metabolite_df

        with ProcessPoolExecutor(max_workers=args.nthreads) as executor:
            futures = [
                executor.submit(
                    bootstrap_network,
                    boot_exp,
                    boot_met,
                    r_threshold,
                    i,
                    logger
                )
                for i in range(args.nbootstraps)
            ]

            bootstrap_winners = []
            for future in futures:
                # Only keep winners that were in original valid pairs
                winners = future.result()
                bootstrap_winners.extend(w for w in winners if w in valid_pairs)

        # Count how often each edge won across bootstraps
        winner_counts = pd.Series(bootstrap_winners).value_counts()
        occurrences.loc[winner_counts.index, 'occurrences'] += winner_counts

        # Fraction of bootstraps in which each edge was the winner
        occurrences['likelihood'] = occurrences['occurrences'] / args.nbootstraps

        # Build the regulon: per-source dicts of target weights/likelihoods
        regulon = {}
        for source in occurrences['source'].unique():
            source_edges = occurrences[occurrences['source'] == source]
            entry = {
                'tfmode': dict(zip(source_edges['target'], source_edges['correlation'])),
                'likelihood': dict(zip(source_edges['target'], source_edges['likelihood']))
            }
            if args.mode != 'corto':
                # Combined mode also records each edge's type
                entry['edge_types'] = dict(zip(source_edges['target'], source_edges['type']))
            regulon[source] = entry

        # Save results
        logger.info("Saving results...")

        network_file = f'corto_network_{args.mode}.csv'
        regulon_file = f'corto_regulon_{args.mode}.txt'

        occurrences['support'] = occurrences['occurrences'] / args.nbootstraps
        occurrences['abs_correlation'] = abs(occurrences['correlation'])

        # Remove prefixes if in combined mode
        if args.mode == 'combined':
            occurrences['source'] = occurrences['source'].str.replace('GENE_', '').str.replace('MET_', '')
            occurrences['target'] = occurrences['target'].str.replace('GENE_', '').str.replace('MET_', '')

        occurrences.sort_values('abs_correlation', ascending=False).to_csv(network_file)

        # Save regulon with pretty formatting
        with open(regulon_file, 'w') as f:
            f.write(f"# Corto Regulon Analysis\n")
            f.write(f"# Mode: {args.mode}\n")
            f.write(f"# Parameters:\n")
            f.write(f"# p-threshold: {args.p_threshold}\n")
            f.write(f"# bootstraps: {args.nbootstraps}\n")
            f.write(f"# edges found: {len(occurrences)}\n\n")

            for source, data in regulon.items():
                source_name = source.replace('GENE_', '').replace('MET_', '') if args.mode == 'combined' else source
                f.write(f"\n{source_name}:\n")
                for key, values in data.items():
                    f.write(f"  {key}:\n")
                    if key == 'edge_types':
                        for target, value in values.items():
                            target_name = target.replace('GENE_', '').replace('MET_', '')
                            f.write(f"    {target_name}: {value}\n")
                    else:
                        # Strongest associations first
                        sorted_items = sorted(values.items(), key=lambda x: abs(x[1]), reverse=True)
                        for target, value in sorted_items:
                            target_name = target.replace('GENE_', '').replace('MET_', '') if args.mode == 'combined' else target
                            f.write(f"    {target_name}: {value:.4f}\n")

        logger.info("Analysis complete!")
        if args.mode == 'corto':
            logger.info(f"Found {len(occurrences)} significant gene-metabolite relationships")
        else:
            relationship_counts = occurrences['type'].value_counts()
            for rel_type, count in relationship_counts.items():
                logger.info(f"Found {count} significant {rel_type} relationships")

        logger.info(f"Results saved to {network_file} and {regulon_file}")

    except Exception as e:
        # Log, then re-raise so the process exits non-zero
        logger.error(f"Error during analysis: {str(e)}")
        raise
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: declare the CLI, parse, and hand off to main()
    arg_parser = argparse.ArgumentParser(description='Run corto network analysis')

    arg_parser.add_argument('--mode', choices=['corto', 'combined'], default='corto',
                            help='Analysis mode - either corto or combined (default: corto)')
    arg_parser.add_argument('--expression_file', required=True,
                            help='Path to expression data file')
    arg_parser.add_argument('--metabolomics_file', required=True,
                            help='Path to metabolomics data file')
    arg_parser.add_argument('--p_threshold', type=float, default=1e-30,
                            help='P-value threshold')
    arg_parser.add_argument('--nbootstraps', type=int, default=100,
                            help='Number of bootstrap iterations')
    arg_parser.add_argument('--nthreads', type=int, default=4,
                            help='Number of parallel threads')
    arg_parser.add_argument('--verbose', action='store_true',
                            help='Print verbose output')

    main(arg_parser.parse_args())
|
||||
Reference in New Issue
Block a user