Added custom data-preparation and matrix-combination steps intended to perform the same functions as the R code, with improvements. Added a README detailing the code.
This commit is contained in:
410
corto-matrix-combination.py
Normal file
410
corto-matrix-combination.py
Normal file
@@ -0,0 +1,410 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from scipy import stats
|
||||
from sklearn.linear_model import LinearRegression
|
||||
from typing import Dict, List, Optional, Tuple, Literal
|
||||
import logging
|
||||
from concurrent.futures import ProcessPoolExecutor
|
||||
import argparse
|
||||
import warnings

# Globally silence all warnings (e.g. numpy RuntimeWarnings from corrcoef on
# constant/NaN rows during bootstrapping). NOTE(review): blanket suppression
# can hide real problems — consider narrowing to specific warning categories.
warnings.filterwarnings('ignore')
|
||||
|
||||
def setup_logger(verbose: bool = False) -> logging.Logger:
    """Configure and return the shared 'CortoNetwork' logger.

    Args:
        verbose: If True, log at INFO level; otherwise only WARNING and above.

    Returns:
        The configured ``logging.Logger`` instance (a process-wide singleton,
        since ``logging.getLogger`` returns the same object per name).
    """
    logger = logging.getLogger('CortoNetwork')
    logger.setLevel(logging.INFO if verbose else logging.WARNING)

    # Attach the console handler only once: the original added a new handler
    # on every call, so repeated setup duplicated every log line.
    if not logger.handlers:
        handler = logging.StreamHandler()
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        logger.addHandler(handler)

    return logger
|
||||
|
||||
def load_data(expression_file: str, metabolite_file: str, logger: logging.Logger) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load expression and metabolomics CSVs and align them on shared samples.

    Args:
        expression_file: CSV with 'gene_id' and 'transcript_id' columns plus
            one column per sample.
        metabolite_file: CSV with a 'CCLE_ID' sample column plus one numeric
            column per metabolite.
        logger: Logger for progress messages.

    Returns:
        (expression_df, metabolite_df): features-by-samples matrices whose
        columns are the common samples, in identical (sorted) order.

    Raises:
        ValueError: If the two files share no sample identifiers.
    """
    logger.info("Loading expression data...")
    exp_df = pd.read_csv(expression_file)

    # Collapse (gene_id, transcript_id) into a single "gene_transcript" label
    logger.info("Processing expression data...")
    exp_df.set_index(['gene_id', 'transcript_id'], inplace=True)
    exp_df = exp_df.apply(pd.to_numeric, errors='coerce')
    exp_df.index = [f"{idx[0]}_{idx[1]}" for idx in exp_df.index]

    logger.info(f"Expression matrix shape: {exp_df.shape}")

    logger.info("Loading metabolomics data...")
    met_df = pd.read_csv(metabolite_file)

    logger.info("Processing metabolomics data...")
    met_df.set_index('CCLE_ID', inplace=True)
    met_df = met_df.select_dtypes(include=[np.number])
    met_df = met_df.T  # rows = metabolites, columns = samples

    logger.info(f"Metabolomics matrix shape: {met_df.shape}")

    # Align on shared samples. sorted() makes the column order deterministic:
    # the original list(set & set) varied between runs (hash randomization),
    # which made downstream bootstraps irreproducible.
    common_samples = sorted(set(exp_df.columns) & set(met_df.columns))
    if not common_samples:
        raise ValueError("No common samples between matrices")

    logger.info(f"Found {len(common_samples)} common samples")
    exp_df = exp_df[common_samples]
    met_df = met_df[common_samples]

    return exp_df, met_df
|
||||
|
||||
def remove_zero_variance(df: pd.DataFrame, logger: logging.Logger) -> pd.DataFrame:
    """Drop rows (features) whose values are constant across all samples.

    Zero-variance features produce undefined (NaN) Pearson correlations, so
    they are removed before network construction.

    Args:
        df: Features-by-samples matrix.
        logger: Logger for progress messages.

    Returns:
        The subset of ``df`` whose rows have strictly positive variance.
    """
    logger.info(f"Checking variance in matrix of shape {df.shape}")
    variances = df.var(axis=1)  # renamed: `vars` shadowed the builtin
    keep = variances[variances > 0].index
    logger.info(f"Keeping {len(keep)} features with non-zero variance")
    return df.loc[keep]
|
||||
|
||||
def p2r(p: float, n: int) -> float:
    """Translate a two-tailed p-value into the matching |r| cutoff.

    For a Pearson correlation over ``n`` samples, significance uses
    t = r * sqrt((n - 2) / (1 - r**2)); inverting that relation yields the
    smallest |r| whose two-tailed p-value equals ``p``.

    Args:
        p: Two-tailed p-value threshold (0 < p < 1).
        n: Number of samples (must exceed 2 for the t distribution).

    Returns:
        The correlation-magnitude threshold in [0, 1].
    """
    t_crit = stats.t.ppf(p / 2, df=n - 2, loc=0, scale=1)
    return np.sqrt(t_crit ** 2 / (n - 2 + t_crit ** 2))
|
||||
|
||||
def calculate_correlations_corto(expression_df: pd.DataFrame,
                                 metabolite_df: pd.DataFrame,
                                 r_threshold: float,
                                 logger: logging.Logger) -> pd.DataFrame:
    """Build gene-metabolite edges by thresholded Pearson correlation.

    Keeps the two matrices separate (the "corto" approach): only cross
    correlations between gene rows and metabolite rows are computed.

    Args:
        expression_df: Genes-by-samples matrix.
        metabolite_df: Metabolites-by-samples matrix (same sample columns).
        r_threshold: Minimum absolute correlation for an edge.
        logger: Logger for progress messages.

    Returns:
        DataFrame with columns 'source', 'target', 'correlation', 'type'
        (one row per edge; empty if nothing passes the threshold).
    """
    logger.info("Calculating correlations...")

    # Process genes in chunks so the full correlation matrix never has to
    # fit in memory at once.
    chunk_size = 1000  # Adjust based on available memory
    n_chunks = int(np.ceil(len(expression_df) / chunk_size))

    edges = []
    for i in range(n_chunks):
        start_idx = i * chunk_size
        end_idx = min((i + 1) * chunk_size, len(expression_df))

        logger.info(f"Processing chunk {i+1}/{n_chunks}")
        exp_chunk = expression_df.iloc[start_idx:end_idx]

        # Cross-correlation block: rows = chunk genes, cols = metabolites.
        corr_block = np.corrcoef(exp_chunk, metabolite_df)[
            :exp_chunk.shape[0],
            exp_chunk.shape[0]:
        ]

        # Vectorized thresholding replaces the original O(genes*metabolites)
        # Python double loop. np.argwhere iterates row-major, so edge order
        # (gene-major, metabolite-minor) matches the original; NaN compares
        # False against the threshold and is skipped, as before.
        genes = exp_chunk.index
        mets = metabolite_df.index
        for gi, mi in np.argwhere(np.abs(corr_block) >= r_threshold):
            edges.append({
                'source': genes[gi],
                'target': mets[mi],
                'correlation': corr_block[gi, mi],
                'type': 'gene_metabolite'
            })

        # Free the chunk's correlation block promptly
        del corr_block

    logger.info(f"Found {len(edges)} significant correlations")
    return pd.DataFrame(edges)
|
||||
|
||||
def calculate_correlations_combined(expression_df: pd.DataFrame,
                                    metabolite_df: pd.DataFrame,
                                    r_threshold: float,
                                    logger: logging.Logger) -> pd.DataFrame:
    """Build edges from the stacked gene+metabolite matrix (all-vs-all).

    Rows are prefixed 'GENE_' / 'MET_' and concatenated, so gene-gene and
    metabolite-metabolite edges are found in addition to cross edges.

    Args:
        expression_df: Genes-by-samples matrix.
        metabolite_df: Metabolites-by-samples matrix (same sample columns).
        r_threshold: Minimum absolute correlation for an edge.
        logger: Logger for progress messages.

    Returns:
        DataFrame with columns 'source', 'target', 'correlation', 'type',
        where 'type' is one of 'gene_gene', 'metabolite_metabolite',
        'gene_metabolite'. Each unordered pair appears at most once.
    """
    logger.info("Combining matrices...")

    # Prefix row labels so gene and metabolite features stay distinguishable
    exp_prefixed = expression_df.copy()
    exp_prefixed.index = 'GENE_' + exp_prefixed.index

    met_prefixed = metabolite_df.copy()
    met_prefixed.index = 'MET_' + met_prefixed.index

    combined_df = pd.concat([exp_prefixed, met_prefixed])

    logger.info("Calculating correlations...")

    edges = []
    chunk_size = 1000
    n_chunks = int(np.ceil(len(combined_df) / chunk_size))
    all_features = combined_df.index

    for i in range(n_chunks):
        start_idx = i * chunk_size
        end_idx = min((i + 1) * chunk_size, len(combined_df))

        logger.info(f"Processing chunk {i+1}/{n_chunks}")
        chunk = combined_df.iloc[start_idx:end_idx]

        # Cross-correlation block: rows = chunk features, cols = all features
        corr_block = np.corrcoef(chunk, combined_df)[
            :chunk.shape[0],
            chunk.shape[0]:
        ]

        # Vectorized thresholding replaces the original O(n^2) Python double
        # loop; row-major argwhere preserves the original edge order.
        chunk_features = chunk.index
        for ri, ci in np.argwhere(np.abs(corr_block) >= r_threshold):
            source = chunk_features[ri]
            target = all_features[ci]
            # Lexicographic ordering keeps exactly one direction of each pair
            # and drops self-correlations (the original upper-triangle rule)
            if source < target:
                # startswith() instead of substring `in`: a raw feature name
                # that merely contains 'GENE_' can no longer be misclassified.
                # Also renamed `type` -> `edge_type` (shadowed the builtin).
                if source.startswith('GENE_') and target.startswith('GENE_'):
                    edge_type = 'gene_gene'
                elif source.startswith('MET_') and target.startswith('MET_'):
                    edge_type = 'metabolite_metabolite'
                else:
                    edge_type = 'gene_metabolite'
                edges.append({
                    'source': source,
                    'target': target,
                    'correlation': corr_block[ri, ci],
                    'type': edge_type
                })

        del corr_block

    logger.info(f"Found {len(edges)} significant correlations")
    return pd.DataFrame(edges)
|
||||
|
||||
def bootstrap_network(expression_df: pd.DataFrame,
                      metabolite_df: pd.DataFrame,
                      r_threshold: float,
                      seed: int,
                      logger: logging.Logger) -> List[str]:
    """Run one bootstrap iteration of the gene-metabolite network.

    Samples the columns (samples) with replacement, recomputes cross
    correlations, and for each metabolite keeps only the single gene with
    the strongest absolute correlation ("winner").

    Args:
        expression_df: Genes-by-samples matrix.
        metabolite_df: Metabolites-by-samples matrix (same column order).
        r_threshold: Minimum absolute correlation for an edge.
        seed: RNG seed, making each iteration reproducible.
        logger: Logger (unused here; kept for signature compatibility with
            the executor submission in main).

    Returns:
        List of "source_target" keys, one per metabolite that had at least
        one passing edge.
    """
    np.random.seed(seed)

    # Resample the SAME column indices in both matrices so samples stay paired
    sample_idx = np.random.choice(
        expression_df.shape[1],
        size=expression_df.shape[1],
        replace=True
    )
    boot_expression = expression_df.iloc[:, sample_idx]
    boot_metabolite = metabolite_df.iloc[:, sample_idx]

    edges = []
    chunk_size = 1000  # Process in chunks to save memory
    n_chunks = int(np.ceil(len(boot_expression) / chunk_size))

    for i in range(n_chunks):
        start_idx = i * chunk_size
        end_idx = min((i + 1) * chunk_size, len(boot_expression))

        exp_chunk = boot_expression.iloc[start_idx:end_idx]

        # Cross-correlation block for this chunk of genes
        corr_block = np.corrcoef(exp_chunk, boot_metabolite)[
            :exp_chunk.shape[0],
            exp_chunk.shape[0]:
        ]

        # Vectorized threshold scan (replaces the O(cells) Python loop);
        # NaNs from zero-variance resamples compare False and are skipped
        genes = exp_chunk.index
        mets = boot_metabolite.index
        for gi, mi in np.argwhere(np.abs(corr_block) >= r_threshold):
            edges.append({
                'source': genes[gi],
                'target': mets[mi],
                'correlation': corr_block[gi, mi]
            })

        del corr_block

    # For each metabolite keep only its strongest (|r|) gene edge
    winners = []
    edge_df = pd.DataFrame(edges)
    if not edge_df.empty:
        for target in edge_df['target'].unique():
            target_edges = edge_df[edge_df['target'] == target]
            # unique() guarantees target_edges is non-empty here
            winner = target_edges.loc[target_edges['correlation'].abs().idxmax()]
            winners.append(f"{winner['source']}_{winner['target']}")

    return winners
|
||||
|
||||
def main(args):
    """Run the full corto/combined network analysis pipeline.

    Loads and aligns the matrices, builds the correlation network for the
    selected mode, estimates edge support by bootstrapping, and writes a
    network CSV plus a formatted regulon text file.

    Args:
        args: Parsed CLI namespace (mode, expression_file, metabolomics_file,
            p_threshold, nbootstraps, nthreads, verbose).

    Raises:
        Re-raises any exception after logging it, so the CLI exits non-zero.
    """
    logger = setup_logger(args.verbose)

    logger.info(f"Starting corto network analysis in {args.mode} mode...")

    try:
        # Load and align the two matrices on their shared samples
        expression_df, metabolite_df = load_data(args.expression_file, args.metabolomics_file, logger)

        # Drop constant features (they yield undefined correlations)
        expression_df = remove_zero_variance(expression_df, logger)
        metabolite_df = remove_zero_variance(metabolite_df, logger)

        # Convert the p-value cutoff into an |r| threshold for this sample size
        r_threshold = p2r(args.p_threshold, len(metabolite_df.columns))
        logger.info(f"Using correlation threshold: {r_threshold}")

        # Build the initial network in the requested mode
        if args.mode == 'corto':
            edge_df = calculate_correlations_corto(
                expression_df,
                metabolite_df,
                r_threshold,
                logger
            )
        else:
            edge_df = calculate_correlations_combined(
                expression_df,
                metabolite_df,
                r_threshold,
                logger
            )

        # Fail with a clear message instead of a downstream KeyError when
        # nothing passes the threshold
        if edge_df.empty:
            raise ValueError("No edges passed the correlation threshold")

        # Pairs present in the initial network; bootstrap winners outside
        # this set are ignored (vectorized form of the original iterrows loop)
        valid_pairs = set(edge_df['source'] + '_' + edge_df['target'])

        # Track bootstrap occurrence counts per edge, keyed by "source_target"
        occurrences = pd.DataFrame({
            'source': edge_df['source'],
            'target': edge_df['target'],
            'correlation': edge_df['correlation'],
            'type': edge_df['type'],
            'occurrences': 0
        })
        occurrences.index = occurrences['source'] + '_' + occurrences['target']

        logger.info(f"Running {args.nbootstraps} bootstraps...")

        # BUG FIX: the original submitted `bootstrap_network_combined`, which
        # is defined nowhere, so combined mode crashed with NameError. Both
        # modes now use bootstrap_network; for combined mode the matrices are
        # given the same GENE_/MET_ prefixes as the network so winner keys
        # match valid_pairs. NOTE(review): this bootstraps only the
        # gene-metabolite edges — gene-gene and metabolite-metabolite edges
        # keep zero bootstrap support until a combined bootstrap is written.
        if args.mode == 'combined':
            boot_exp = expression_df.copy()
            boot_exp.index = 'GENE_' + boot_exp.index
            boot_met = metabolite_df.copy()
            boot_met.index = 'MET_' + boot_met.index
        else:
            boot_exp, boot_met = expression_df, metabolite_df

        with ProcessPoolExecutor(max_workers=args.nthreads) as executor:
            futures = [
                executor.submit(
                    bootstrap_network,
                    boot_exp,
                    boot_met,
                    r_threshold,
                    i,
                    logger
                )
                for i in range(args.nbootstraps)
            ]

            bootstrap_winners = []
            for future in futures:
                # Only keep winners that were in original valid pairs
                winners = future.result()
                bootstrap_winners.extend(w for w in winners if w in valid_pairs)

        # Count how often each edge won across bootstraps
        winner_counts = pd.Series(bootstrap_winners).value_counts()
        occurrences.loc[winner_counts.index, 'occurrences'] += winner_counts

        # Fraction of bootstraps in which each edge was the winner
        occurrences['likelihood'] = occurrences['occurrences'] / args.nbootstraps

        # Build the regulon: per-source dicts of target weights/likelihoods
        regulon = {}
        for source in occurrences['source'].unique():
            source_edges = occurrences[occurrences['source'] == source]
            entry = {
                'tfmode': dict(zip(source_edges['target'], source_edges['correlation'])),
                'likelihood': dict(zip(source_edges['target'], source_edges['likelihood']))
            }
            if args.mode != 'corto':
                # Combined mode also records each edge's type
                entry['edge_types'] = dict(zip(source_edges['target'], source_edges['type']))
            regulon[source] = entry

        # Save results
        logger.info("Saving results...")

        network_file = f'corto_network_{args.mode}.csv'
        regulon_file = f'corto_regulon_{args.mode}.txt'

        occurrences['support'] = occurrences['occurrences'] / args.nbootstraps
        occurrences['abs_correlation'] = abs(occurrences['correlation'])

        # Remove prefixes if in combined mode
        if args.mode == 'combined':
            occurrences['source'] = occurrences['source'].str.replace('GENE_', '').str.replace('MET_', '')
            occurrences['target'] = occurrences['target'].str.replace('GENE_', '').str.replace('MET_', '')

        occurrences.sort_values('abs_correlation', ascending=False).to_csv(network_file)

        # Save regulon with pretty formatting
        with open(regulon_file, 'w') as f:
            f.write(f"# Corto Regulon Analysis\n")
            f.write(f"# Mode: {args.mode}\n")
            f.write(f"# Parameters:\n")
            f.write(f"# p-threshold: {args.p_threshold}\n")
            f.write(f"# bootstraps: {args.nbootstraps}\n")
            f.write(f"# edges found: {len(occurrences)}\n\n")

            for source, data in regulon.items():
                source_name = source.replace('GENE_', '').replace('MET_', '') if args.mode == 'combined' else source
                f.write(f"\n{source_name}:\n")
                for key, values in data.items():
                    f.write(f"  {key}:\n")
                    if key == 'edge_types':
                        for target, value in values.items():
                            target_name = target.replace('GENE_', '').replace('MET_', '')
                            f.write(f"    {target_name}: {value}\n")
                    else:
                        # Strongest associations first
                        sorted_items = sorted(values.items(), key=lambda x: abs(x[1]), reverse=True)
                        for target, value in sorted_items:
                            target_name = target.replace('GENE_', '').replace('MET_', '') if args.mode == 'combined' else target
                            f.write(f"    {target_name}: {value:.4f}\n")

        logger.info("Analysis complete!")
        if args.mode == 'corto':
            logger.info(f"Found {len(occurrences)} significant gene-metabolite relationships")
        else:
            relationship_counts = occurrences['type'].value_counts()
            for rel_type, count in relationship_counts.items():
                logger.info(f"Found {count} significant {rel_type} relationships")

        logger.info(f"Results saved to {network_file} and {regulon_file}")

    except Exception as e:
        # Log, then re-raise so the process exits non-zero
        logger.error(f"Error during analysis: {str(e)}")
        raise
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: declare the CLI, parse, and hand off to main()
    arg_parser = argparse.ArgumentParser(description='Run corto network analysis')

    arg_parser.add_argument('--mode', choices=['corto', 'combined'], default='corto',
                            help='Analysis mode - either corto or combined (default: corto)')
    arg_parser.add_argument('--expression_file', required=True,
                            help='Path to expression data file')
    arg_parser.add_argument('--metabolomics_file', required=True,
                            help='Path to metabolomics data file')
    arg_parser.add_argument('--p_threshold', type=float, default=1e-30,
                            help='P-value threshold')
    arg_parser.add_argument('--nbootstraps', type=int, default=100,
                            help='Number of bootstrap iterations')
    arg_parser.add_argument('--nthreads', type=int, default=4,
                            help='Number of parallel threads')
    arg_parser.add_argument('--verbose', action='store_true',
                            help='Print verbose output')

    main(arg_parser.parse_args())
|
||||
Reference in New Issue
Block a user