diff --git a/R/ParallelEvolCCM.R b/R/ParallelEvolCCM.R index 7918c7e..fca9d82 100644 --- a/R/ParallelEvolCCM.R +++ b/R/ParallelEvolCCM.R @@ -1,631 +1,630 @@ -#!/usr/bin/env Rscript - -### ParallelEvolCCM.R -### Hosted at: https://github.com/beiko-lab/arete/blob/master/bin/ParallelEvolCCM.R -### Version 1.0, released *** June 2024 -### Released under MIT License (https://opensource.org/license/mit) -### Note: igraph requires the OpenBlas library, which can be installed with: -### sudo apt-get install libopenblas-dev -### devtools: sudo apt-get install libssl-dev libfontconfig1-dev libharfbuzz-dev libfribidi-dev libfreetype6-dev libpng-dev libtiff5-dev libjpeg-dev - -# List of CRAN packages to check and install -cran_packages <- - c("ape", - "dplyr", - "remotes", - "phytools", - "foreach", - "doParallel", - "gplots") - -for (package in cran_packages) { - if (!suppressPackageStartupMessages(require(package, character.only = TRUE))) { - install.packages(package) - } - suppressPackageStartupMessages(library(package, character.only = TRUE)) -} - -if (!require(phytools)) { - remotes::install_github("liamrevell/phytools") -} - -if (!require(evolCCM)) { - remotes::install_github("beiko-lab/evolCCM") -} - -suppressPackageStartupMessages(library(evolCCM)) - - -##################################################################### -### FUNCTION PARSE_ARGS -### Parse the command-line arguments. -### args: the arguments -### RETURN a list that contains values for all the defined arguments. -##################################################################### - -parse_args <- function(args) { - # Check if no arguments are provided or if '-h' flag is given - if (length(args) == 0 || any(args == "-h")) { - cat( - "Usage: RunEvolCCM.R --intree --intable [--compare_from value1,value2,...] [--compare_to valueA,valueB,...] [--cores ] [--min_abundance <0.0-1.0>] [--max_abundance <0.0-1.0>]\n" - ) - cat("Options:\n") - cat(" --intree : Path to the tree file (required).\n") - cat(" --intable : Path to the table file (required).\n") - cat( - " --compare_from : Comma-separated list of values for comparison (optional).\n" - ) - cat( - " --compare_to : Comma-separated list of values for comparison (optional).\n" - ) - cat( - " --cores : Specify number of cores for parallel processing or '-1' for all cores (optional, default is 1).\n" - ) - cat( - " --min_abundance <0.0-1.0> : Minimum abundance proportion threshold (optional, default is 0.0).\n" - ) - cat( - " --max_abundance <0.0-1.0> : Maximum abundance proportion threshold (optional, default is 1.0).\n" - ) - cat( - " --show_nans : Show NaNs in matrix (default is to convert non-converging X^2 values to 0 and p-values to 1.\n" - ) - cat(" -h : Show this help message.\n") - quit(save = "no") - } - - cat("\n\n*********************** PARALLELEVOLCCM ***********************\n\n") - - cat("Parsing arguments...\n") - - # Function to parse command line arguments without '=' - parseArg <- function(argname, options) { - index <- which(options == argname) - if (length(index) > 0 && index < length(options)) { - return(options[index + 1]) - } - NULL - } - - # Parse the CL arguments - inputTree <- parseArg("--intree", args) - inputProfile <- parseArg("--intable", args) - compare_to_str <- parseArg("--compare_to", args) - min_abundance_str <- parseArg("--min_abundance", args) - max_abundance_str <- parseArg("--max_abundance", args) - cores_str <- parseArg("--cores", args) - show_nans <- "--show_nans" %in% args - - # Parse the optional --compare_from and --compare_to arguments - compare_from_str <- parseArg("--compare_from", args) - if (!is.null(compare_from_str)) { - compare_from_vector <- strsplit(compare_from_str, ",")[[1]] - } else { - compare_from_vector <- NULL - } - - if (!is.null(compare_to_str)) { - compare_to_vector <- strsplit(compare_to_str, ",")[[1]] - } else { - compare_to_vector <- NULL - } - - # Set default values - min_abund <- 0.0 - max_abund <- 1.0 - - # Validate and assign min_abundance if specified - if (!is.null(min_abundance_str)) { - temp_val <- as.numeric(min_abundance_str) - if (!is.na(temp_val) && temp_val >= 0.0 && temp_val <= 1.0) { - min_abund <- temp_val - } else { - stop("--min_abundance must be a proportional value between 0.0 and 1.0", - call. = FALSE) - } - } - - # Validate and assign max_abundance if specified - if (!is.null(max_abundance_str)) { - temp_val <- as.numeric(max_abundance_str) - if (!is.na(temp_val) && temp_val >= 0.0 && temp_val <= 1.0) { - max_abund <- temp_val - } else { - stop("--max_abundance must be a proportional value between 0.0 and 1.0", - call. = FALSE) - } - } - - # Check the relationship between min_abund and max_abund - if (min_abund > max_abund) { - stop("--min_abundance value must be less than or equal to --max_abundance value", - call. = FALSE) - } - - # Determine the parallel processing parameters - if (is.null(cores_str)) { - num_cores <- 1 - } else if (cores_str == "-1") { - num_cores <- detectCores() - } else { - num_cores <- as.integer(cores_str) - if (is.na(num_cores) || num_cores <= 0) { - stop("--cores argument must be a positive integer or '-1' for all cores", - call. = FALSE) - } - } - - # Check if both --intree and --intable are provided - if (is.null(inputTree) || is.null(inputProfile)) { - stop( - "Usage: RunEvolCCM.R --intree --intable [--compare_from value1,value2,...] [--compare_to valueA,valueB,...]", - call. = FALSE - ) - } - - parsed_list <- list( - "inputTree" = inputTree, - "inputProfile" = inputProfile, - "compare_from_vector" = compare_from_vector, - "compare_to_vector" = compare_to_vector, - "min_abund" = min_abund, - "max_abund" = max_abund, - "show_nans" = show_nans, - "num_cores" = num_cores - ) - - return(parsed_list) -} - -########################################################################## -### FUNCTION MODIFY_TREE -### Apply fixes, if necessary, to make the tree suitable for CCM analysis. -### tree: the tree -### fix_multifurcations, coerce_branches: self-explanatory Booleans -### RETURNS the corrected tree -########################################################################## - -modify_tree <- function(tree, - fix_multifurcations, - coerce_branches) { - if (!is.rooted(tree)) { - cat(noquote( - "*** Input tree is not rooted! Applying midpoint rooting algorithm. ***\n" - )) - tree <- midpoint.root(tree) - } - - ### Arbitrarily resolve multifurcations and zero-length branches ### - if (!is.binary(tree)) { - if (fix_multifurcations) { - cat(noquote("*** Input tree is not binary! Forcing the issue. ***\n")) - tree <- multi2di(tree) - } else { - stop( - "Tree contains multifurcations and \"fix_multifurcations\" is set to FALSE. Exiting..." - ) - } - } - - ### Coerce branches if necessary ### - if (coerce_branches) { - short_branch_count <- sum(tree$edge.length < min_branch_length) - if (short_branch_count > 0) { - notification_string = paste( - short_branch_count, - " of ", - nrow(tree$edge), - " edges are less than the specified threshold of ", - min_branch_length, - ".", - sep = "" - ) - warning( - paste( - notification_string, - "Setting these branches to equal the threshold; this will affect the results!" - ) - ) - tree$edge.length[tree$edge.length < min_branch_length] <- - min_branch_length - } - } - return(tree) -} - -######################################################################################## -### FUNCTION CHECK_AND_FILTER_PROFILE -### Check for consistency between the profile and the tree, and apply filtering criteria -### aprofile: the phylogenetic profile -### tree: the phylogenetic tree -### min_abund: minimum abundance threshold -### max_abund: maximum abundance threshold -### RETURN a filtered profile, as long as nothing is broken -######################################################################################## - -check_and_filter_profile <- - function(aprofile, tree, min_abund, max_abund) { - # Get the leaf labels from the phylogenetic tree - tree_labels <- tree$tip.label - - # Get the row labels from the dataframe - df_labels <- rownames(aprofile) - - mismatch <- 0 - # Find labels that are in the tree but not in the dataframe - missing_in_df <- setdiff(tree_labels, df_labels) - if (length(missing_in_df) > 0) { - cat("Labels in the tree but not in the dataframe:", quote = FALSE) - cat(missing_in_df) - mismatch = 2 - } - - # Find labels that are in the dataframe but not in the tree - missing_in_tree <- setdiff(df_labels, tree_labels) - if (length(missing_in_tree) > 0) { - cat("Labels in the dataframe but not in the tree:", quote = FALSE) - cat(missing_in_tree) - mismatch = 1 - } - - if (mismatch == 1) { - stop("Exiting...") - } - - cat("All names match between tree and matrix.\n") - - ### ABUNDANCE FILTER ### - - aprofile <- aprofile[tree$tip.label,] - aprofile[aprofile > 1] <- 1 - - numFeatures <- dim(aprofile)[2] - cat(paste("Total number of features: ", numFeatures, "\n")) - - ### Abundance filter: subset the dataframe to include only those columns that satisfy the proportion criteria ### - proportion_ones <- - apply(aprofile, 2, function(col) - sum(col == 1) / length(col)) - aprofile <- - aprofile[, proportion_ones >= min_abund & - proportion_ones <= max_abund] - numFeatures <- dim(aprofile)[2] - cat(paste( - "Total number of features post abundance filtering: ", - numFeatures, - "\n" - )) - - ### CLADE FILTERING: Remove any feature that maps perfectly onto a clade in the rooted phylogenetic tree ### - # This function will recursively explore the tree and return the clades - # Function to get all descendant tips of a node - #if(clade_filter) { - if (FALSE) { - get_descendant_tips <- function(tree, node) { - if (node <= length(tree$tip.label)) { - return(tree$tip.label[node]) - } else { - descend <- which(tree$edge[, 1] == node) - tips <- c() - for (i in descend) { - tips <- c(tips, get_descendant_tips(tree, tree$edge[i, 2])) - } - return(tips) - } - } - - # Get the clades for all internal nodes - clades <- - lapply((length(tree$tip.label) + 1):(length(tree$tip.label) + tree$Nnode), - function(node) - get_descendant_tips(tree, node)) - - # Function to check if a feature is unique to a clade - check_feature <- function(feature) { - for (clade in clades) { - clade_data <- aprofile[unlist(clade), feature] - outside_clade_data <- - aprofile[setdiff(rownames(aprofile), unlist(clade)), feature] - - if (all(clade_data == 1, na.rm = TRUE) & - all(outside_clade_data == 0, na.rm = TRUE)) { - return(TRUE) - } - } - return(FALSE) - } - - for (feature in colnames(aprofile)) { - if (check_feature(feature)) { - filtered_data <- filtered_data %>% select(-feature) - } - } - } - return(aprofile) - } - -#################################################################### -### FUNCTION PROCESS_PAIR -### Compute EvolCCM distance for a pair of profiles -### i,j: indices -### filtered_data: dataframe with phylogenetic profiles -### tree: the phylogenetic tree -### RETURNS a row containing CCM statistics -#################################################################### - -process_pair <- function(i, j, filtered_data, tree) { - res <- tryCatch({ - #if ( - if (j %% 50 == 0) { - print(paste(i, "vs", j)) - } - - # Run EstimateCCM for the current pair - aE <- - EstimateCCM(filtered_data[, c(i, j)], phytree = tree, trace = FALSE) - estimatedRates <- aE$nlm.par - - # Retrieve the score and pvalue - paE <- ProcessAE(aE) - - interact_score <- estimatedRates[5] / paE$hessianSE[5] - interact_pval <- 2 * (1 - pnorm(abs(interact_score))) - - res <- c(aE$nlm.par, interact_score, interact_pval) - - # Return the result along with i, j, and column names - list( - i = i, - j = j, - colnames = colnames(filtered_data[, c(i, j)]), - res = res - ) - #} - }, error = function(e) { - # Return NULL in case of an error - NULL - }) - - return(res) -} - -########################################################################### -### FUNCTION CONSTRUCTMATRIX -### Build square matrices that represent similarities/distances of profiles -### lines: the output lines generated by CCM -### outputFile: the output file -### column: the column in 'lines' to use for matrix construction -### RETURNS nothing, WRITES the resulting matrix to a file -########################################################################## - -constructMatrix <- function(lines, outputFile, column) { - # Initialize an empty list to store the entities - entities <- list() - - for (line in lines) { - # Parse the line into fields - fields <- strsplit(line, "\t")[[1]] - # Extract the entities - entity1 <- fields[1] - entity2 <- fields[2] - - # If the entities are new, add them to the list - if (!entity1 %in% entities) { - entities <- c(entities, entity1) - } - if (!entity2 %in% entities) { - entities <- c(entities, entity2) - } - } - - # Initialize an empty matrix to store the distances - distances <- - matrix(nrow = length(entities), ncol = length(entities)) - - for (line in lines) { - # Parse the line into fields - fields <- strsplit(line, "\t")[[1]] - # Extract the entities and the distance - entity1 <- fields[1] - entity2 <- fields[2] - distance <- as.numeric(fields[column]) - - # Add the distance to the matrix - distances[entities == entity1, entities == entity2] <- distance - distances[entities == entity2, entities == entity1] <- distance - } - - # Convert the matrix to a data frame and set the row and column names - distances <- as.data.frame(distances) - rownames(distances) <- entities - colnames(distances) <- entities - - # Write the matrix to a file - write.table(distances, - file = outputFile, - sep = "\t", - quote = FALSE) - -} - -#################### -### MAIN PROGRAM ### -#################### - - -########################################## FILTERS ########################################## - -### Remove any profile that maps perfectly onto a clade in the tree. In practice this doesn't do much ### -clade_filter = FALSE - -### Apply pairwise heuristic based on some criterion: do not perform full comparison if heuristic doesn't satisfy the threshold ### -### (CURRENTLY NOT IMPLEMENTED: mutual information and Euclidean were bad, UniFrac might be worth a try) ### -heuristic_filter = FALSE -heuristic_threshold = 0.001 - -### Coerce branches to a minimum threshold length (ALTERNATIVE: multiply all branch lengths by the minimum amount needed to get to threshold) -coerce_branches = FALSE -#min_branch_length = 0.001 - -### Coerce multifurcations to bifurcations (alternative is to stop execution) -fix_multifurcations = TRUE - -######################################## PARSE CL ARGUMENTS ######################################## - -args <- commandArgs(trailingOnly = TRUE) -parsed <- parse_args(args) - -### Output file name (could be changed to a command-line argument) ### -outputTree <- - paste("EvolCCM_", basename(parsed$inputTree), sep = "") -outputFile <- - paste("EvolCCM_", basename(parsed$inputProfile), sep = "") - -###################### READ THE TREE FILE AND FIX IF NECESSARY ###################### - -cat("\nReading tree...\n") -tree <- read.tree(parsed$inputTree) -tree <- modify_tree(tree, fix_multifurcations, coerce_branches) -write.tree(tree, outputTree) - -###################### READ THE PROFILE AND APPLY FILTERS ###################### - -### Read the profile file ### - -cat("\nReading profile...\n") -aprofile <- read.csv(parsed$inputProfile, row.names = 1, sep = "\t") -filtered_data <- - check_and_filter_profile(aprofile, tree, parsed$min_abund, parsed$max_abund) -numFeatures <- dim(filtered_data)[2] - -######################## PARALLELIZED NESTED LOOP FOR PAIRWISE PROFILE COMPARISONS ######################## - -cat("\nInitiating comparisons...\n") -registerDoParallel(cores = parsed$num_cores) - -compare_from_vector <- parsed$compare_from_vector -compare_to_vector <- parsed$compare_to_vector - -results <- - foreach( - i = 1:(numFeatures - 1), - .combine = 'c', - .multicombine = TRUE - ) %dopar% { - i_name <- colnames(filtered_data)[i] - - # If compare_from_str is defined, then check the condition. If not, then set to TRUE by default. - i_condition <- ifelse(!is.null(compare_from_vector), - any(sapply(compare_from_vector, function(x) { - startsWith(i_name, x) - })), - TRUE) - - if (i_condition) { - print(paste("Processing feature #", i, " (", i_name, ")", sep = ""), - quote = FALSE) - - sapply((i + 1):numFeatures, function(j) { - j_name <- colnames(filtered_data)[j] - - # If compare_to_str is defined, then check the condition. If not, then set to TRUE by default. - j_condition <- ifelse(!is.null(compare_to_vector), - any(sapply(compare_to_vector, function(x) { - startsWith(j_name, x) - })), - TRUE) - - if (j_condition) { - return(process_pair(i, j, filtered_data, tree)) - } else { - return(NULL) - } - }, simplify = FALSE) - } else { - return(NULL) - } - } - -cat("\nProfile comparisons complete!\n") - -################################# OUTPUT ##################################### - -cat("\nWriting output...\n") - -### Process the results and write them to chunks of output files ### -chunk_size <- 100 -output_file_pattern <- "output_temp_chunk_%d.txt" -chunk_index <- 1 -result_counter <- 1 - -for (result in results) { - if (!is.null(result)) { - if ((result_counter - 1) %% chunk_size == 0) { - temp_output_file <- sprintf(output_file_pattern, chunk_index) - chunk_index <- chunk_index + 1 - } - ### Fix NaNs unless user has specified otherwise - if (parsed$show_nans != 1) { - if (is.nan(result$res[6])) { - # print(result$res) - # print("Fixing NaN") - result$res[6] = 0 - result$res[7] = 1 - # print(result$res) - } - } - cat(paste(result$colnames, collapse = "\t"), - file = temp_output_file, - append = TRUE) - cat("\t", file = temp_output_file, append = TRUE) - cat(paste(result$res, collapse = "\t"), - file = temp_output_file, - append = TRUE) - cat("\n", file = temp_output_file, append = TRUE) - - result_counter <- result_counter + 1 - } -} - -# Merge the chunks of output files into the main output file -temp_files <- list.files(pattern = "output_temp_chunk_[0-9]+\\.txt") - -outputHeadings <- - paste( - "feature1", - "feature2", - "intrisic1", - "intrisic2", - "gainloss1", - "gainloss2", - "interaction", - "interact_score", - "interact_pval", - sep = "\t" - ) -outputFilegz = gzfile(outputFile, "w") -cat(outputHeadings, file = outputFilegz, sep = "\n") -for (temp_file in temp_files) { - # Read the contents of the temporary file - temp_contents <- readLines(temp_file) - - # Write the contents to the main output file - cat(temp_contents, - file = outputFilegz, - append = TRUE, - sep = "\n") - - # Remove the temporary file - file.remove(temp_file) -} - -# Read the lines from the input file -close(outputFilegz) -lines <- readLines(outputFile)[-1] -X2file = paste(outputFile, "X2", sep = ".") -Pfile = paste(outputFile, "pvals", sep = ".") - -constructMatrix(lines, X2file, 8) -constructMatrix(lines, Pfile, 9) - -cat("ParallelEvolCCM run done.\n\n") +# #!/usr/bin/env Rscript +# +# ### ParallelEvolCCM.R +# ### Hosted at: https://github.com/beiko-lab/arete/blob/master/bin/ParallelEvolCCM.R +# ### Version 1.0, released *** June 2024 +# ### Released under MIT License (https://opensource.org/license/mit) +# ### Note: igraph requires the OpenBlas library, which can be installed with: +# ### sudo apt-get install libopenblas-dev +# ### devtools: sudo apt-get install libssl-dev libfontconfig1-dev libharfbuzz-dev libfribidi-dev libfreetype6-dev libpng-dev libtiff5-dev libjpeg-dev +# +# # List of CRAN packages to check and install +# cran_packages <- +# c("ape", +# "dplyr", +# "remotes", +# "phytools", +# "foreach", +# "doParallel", +# "gplots") +# +# for (package in cran_packages) { +# if (!suppressPackageStartupMessages(require(package, character.only = TRUE))) { +# install.packages(package) +# } +# suppressPackageStartupMessages(library(package, character.only = TRUE)) +# } +# +# if (!require(phytools)) { +# remotes::install_github("liamrevell/phytools") +# } +# +# #if (!require(evolCCM)) { +# # remotes::install_github("beiko-lab/evolCCM") +# #} +# #suppressPackageStartupMessages(library(evolCCM)) +# +# +# ##################################################################### +# ### FUNCTION PARSE_ARGS +# ### Parse the command-line arguments. +# ### args: the arguments +# ### RETURN a list that contains values for all the defined arguments. +# ##################################################################### +# +# parse_args <- function(args) { +# # Check if no arguments are provided or if '-h' flag is given +# if (length(args) == 0 || any(args == "-h")) { +# cat( +# "Usage: RunEvolCCM.R --intree --intable [--compare_from value1,value2,...] [--compare_to valueA,valueB,...] [--cores ] [--min_abundance <0.0-1.0>] [--max_abundance <0.0-1.0>]\n" +# ) +# cat("Options:\n") +# cat(" --intree : Path to the tree file (required).\n") +# cat(" --intable : Path to the table file (required).\n") +# cat( +# " --compare_from : Comma-separated list of values for comparison (optional).\n" +# ) +# cat( +# " --compare_to : Comma-separated list of values for comparison (optional).\n" +# ) +# cat( +# " --cores : Specify number of cores for parallel processing or '-1' for all cores (optional, default is 1).\n" +# ) +# cat( +# " --min_abundance <0.0-1.0> : Minimum abundance proportion threshold (optional, default is 0.0).\n" +# ) +# cat( +# " --max_abundance <0.0-1.0> : Maximum abundance proportion threshold (optional, default is 1.0).\n" +# ) +# cat( +# " --show_nans : Show NaNs in matrix (default is to convert non-converging X^2 values to 0 and p-values to 1.\n" +# ) +# cat(" -h : Show this help message.\n") +# quit(save = "no") +# } +# +# cat("\n\n*********************** PARALLELEVOLCCM ***********************\n\n") +# +# cat("Parsing arguments...\n") +# +# # Function to parse command line arguments without '=' +# parseArg <- function(argname, options) { +# index <- which(options == argname) +# if (length(index) > 0 && index < length(options)) { +# return(options[index + 1]) +# } +# NULL +# } +# +# # Parse the CL arguments +# inputTree <- parseArg("--intree", args) +# inputProfile <- parseArg("--intable", args) +# compare_to_str <- parseArg("--compare_to", args) +# min_abundance_str <- parseArg("--min_abundance", args) +# max_abundance_str <- parseArg("--max_abundance", args) +# cores_str <- parseArg("--cores", args) +# show_nans <- "--show_nans" %in% args +# +# # Parse the optional --compare_from and --compare_to arguments +# compare_from_str <- parseArg("--compare_from", args) +# if (!is.null(compare_from_str)) { +# compare_from_vector <- strsplit(compare_from_str, ",")[[1]] +# } else { +# compare_from_vector <- NULL +# } +# +# if (!is.null(compare_to_str)) { +# compare_to_vector <- strsplit(compare_to_str, ",")[[1]] +# } else { +# compare_to_vector <- NULL +# } +# +# # Set default values +# min_abund <- 0.0 +# max_abund <- 1.0 +# +# # Validate and assign min_abundance if specified +# if (!is.null(min_abundance_str)) { +# temp_val <- as.numeric(min_abundance_str) +# if (!is.na(temp_val) && temp_val >= 0.0 && temp_val <= 1.0) { +# min_abund <- temp_val +# } else { +# stop("--min_abundance must be a proportional value between 0.0 and 1.0", +# call. = FALSE) +# } +# } +# +# # Validate and assign max_abundance if specified +# if (!is.null(max_abundance_str)) { +# temp_val <- as.numeric(max_abundance_str) +# if (!is.na(temp_val) && temp_val >= 0.0 && temp_val <= 1.0) { +# max_abund <- temp_val +# } else { +# stop("--max_abundance must be a proportional value between 0.0 and 1.0", +# call. = FALSE) +# } +# } +# +# # Check the relationship between min_abund and max_abund +# if (min_abund > max_abund) { +# stop("--min_abundance value must be less than or equal to --max_abundance value", +# call. = FALSE) +# } +# +# # Determine the parallel processing parameters +# if (is.null(cores_str)) { +# num_cores <- 1 +# } else if (cores_str == "-1") { +# num_cores <- detectCores() +# } else { +# num_cores <- as.integer(cores_str) +# if (is.na(num_cores) || num_cores <= 0) { +# stop("--cores argument must be a positive integer or '-1' for all cores", +# call. = FALSE) +# } +# } +# +# # Check if both --intree and --intable are provided +# if (is.null(inputTree) || is.null(inputProfile)) { +# stop( +# "Usage: RunEvolCCM.R --intree --intable [--compare_from value1,value2,...] [--compare_to valueA,valueB,...]", +# call. = FALSE +# ) +# } +# +# parsed_list <- list( +# "inputTree" = inputTree, +# "inputProfile" = inputProfile, +# "compare_from_vector" = compare_from_vector, +# "compare_to_vector" = compare_to_vector, +# "min_abund" = min_abund, +# "max_abund" = max_abund, +# "show_nans" = show_nans, +# "num_cores" = num_cores +# ) +# +# return(parsed_list) +# } +# +# ########################################################################## +# ### FUNCTION MODIFY_TREE +# ### Apply fixes, if necessary, to make the tree suitable for CCM analysis. +# ### tree: the tree +# ### fix_multifurcations, coerce_branches: self-explanatory Booleans +# ### RETURNS the corrected tree +# ########################################################################## +# +# modify_tree <- function(tree, +# fix_multifurcations, +# coerce_branches) { +# if (!is.rooted(tree)) { +# cat(noquote( +# "*** Input tree is not rooted! Applying midpoint rooting algorithm. ***\n" +# )) +# tree <- midpoint.root(tree) +# } +# +# ### Arbitrarily resolve multifurcations and zero-length branches ### +# if (!is.binary(tree)) { +# if (fix_multifurcations) { +# cat(noquote("*** Input tree is not binary! Forcing the issue. ***\n")) +# tree <- multi2di(tree) +# } else { +# stop( +# "Tree contains multifurcations and \"fix_multifurcations\" is set to FALSE. Exiting..." +# ) +# } +# } +# +# ### Coerce branches if necessary ### +# if (coerce_branches) { +# short_branch_count <- sum(tree$edge.length < min_branch_length) +# if (short_branch_count > 0) { +# notification_string = paste( +# short_branch_count, +# " of ", +# nrow(tree$edge), +# " edges are less than the specified threshold of ", +# min_branch_length, +# ".", +# sep = "" +# ) +# warning( +# paste( +# notification_string, +# "Setting these branches to equal the threshold; this will affect the results!" +# ) +# ) +# tree$edge.length[tree$edge.length < min_branch_length] <- +# min_branch_length +# } +# } +# return(tree) +# } +# +# ######################################################################################## +# ### FUNCTION CHECK_AND_FILTER_PROFILE +# ### Check for consistency between the profile and the tree, and apply filtering criteria +# ### aprofile: the phylogenetic profile +# ### tree: the phylogenetic tree +# ### min_abund: minimum abundance threshold +# ### max_abund: maximum abundance threshold +# ### RETURN a filtered profile, as long as nothing is broken +# ######################################################################################## +# +# check_and_filter_profile <- +# function(aprofile, tree, min_abund, max_abund) { +# # Get the leaf labels from the phylogenetic tree +# tree_labels <- tree$tip.label +# +# # Get the row labels from the dataframe +# df_labels <- rownames(aprofile) +# +# mismatch <- 0 +# # Find labels that are in the tree but not in the dataframe +# missing_in_df <- setdiff(tree_labels, df_labels) +# if (length(missing_in_df) > 0) { +# cat("Labels in the tree but not in the dataframe:", quote = FALSE) +# cat(missing_in_df) +# mismatch = 2 +# } +# +# # Find labels that are in the dataframe but not in the tree +# missing_in_tree <- setdiff(df_labels, tree_labels) +# if (length(missing_in_tree) > 0) { +# cat("Labels in the dataframe but not in the tree:", quote = FALSE) +# cat(missing_in_tree) +# mismatch = 1 +# } +# +# if (mismatch == 1) { +# stop("Exiting...") +# } +# +# cat("All names match between tree and matrix.\n") +# +# ### ABUNDANCE FILTER ### +# +# aprofile <- aprofile[tree$tip.label,] +# aprofile[aprofile > 1] <- 1 +# +# numFeatures <- dim(aprofile)[2] +# cat(paste("Total number of features: ", numFeatures, "\n")) +# +# ### Abundance filter: subset the dataframe to include only those columns that satisfy the proportion criteria ### +# proportion_ones <- +# apply(aprofile, 2, function(col) +# sum(col == 1) / length(col)) +# aprofile <- +# aprofile[, proportion_ones >= min_abund & +# proportion_ones <= max_abund] +# numFeatures <- dim(aprofile)[2] +# cat(paste( +# "Total number of features post abundance filtering: ", +# numFeatures, +# "\n" +# )) +# +# ### CLADE FILTERING: Remove any feature that maps perfectly onto a clade in the rooted phylogenetic tree ### +# # This function will recursively explore the tree and return the clades +# # Function to get all descendant tips of a node +# #if(clade_filter) { +# if (FALSE) { +# get_descendant_tips <- function(tree, node) { +# if (node <= length(tree$tip.label)) { +# return(tree$tip.label[node]) +# } else { +# descend <- which(tree$edge[, 1] == node) +# tips <- c() +# for (i in descend) { +# tips <- c(tips, get_descendant_tips(tree, tree$edge[i, 2])) +# } +# return(tips) +# } +# } +# +# # Get the clades for all internal nodes +# clades <- +# lapply((length(tree$tip.label) + 1):(length(tree$tip.label) + tree$Nnode), +# function(node) +# get_descendant_tips(tree, node)) +# +# # Function to check if a feature is unique to a clade +# check_feature <- function(feature) { +# for (clade in clades) { +# clade_data <- aprofile[unlist(clade), feature] +# outside_clade_data <- +# aprofile[setdiff(rownames(aprofile), unlist(clade)), feature] +# +# if (all(clade_data == 1, na.rm = TRUE) & +# all(outside_clade_data == 0, na.rm = TRUE)) { +# return(TRUE) +# } +# } +# return(FALSE) +# } +# +# for (feature in colnames(aprofile)) { +# if (check_feature(feature)) { +# filtered_data <- filtered_data %>% select(-feature) +# } +# } +# } +# return(aprofile) +# } +# +# #################################################################### +# ### FUNCTION PROCESS_PAIR +# ### Compute EvolCCM distance for a pair of profiles +# ### i,j: indices +# ### filtered_data: dataframe with phylogenetic profiles +# ### tree: the phylogenetic tree +# ### RETURNS a row containing CCM statistics +# #################################################################### +# +# process_pair <- function(i, j, filtered_data, tree) { +# res <- tryCatch({ +# #if ( +# if (j %% 50 == 0) { +# print(paste(i, "vs", j)) +# } +# +# # Run EstimateCCM for the current pair +# aE <- +# EstimateCCM(filtered_data[, c(i, j)], phytree = tree, trace = FALSE) +# estimatedRates <- aE$nlm.par +# +# # Retrieve the score and pvalue +# paE <- ProcessAE(aE) +# +# interact_score <- estimatedRates[5] / paE$hessianSE[5] +# interact_pval <- 2 * (1 - pnorm(abs(interact_score))) +# +# res <- c(aE$nlm.par, interact_score, interact_pval) +# +# # Return the result along with i, j, and column names +# list( +# i = i, +# j = j, +# colnames = colnames(filtered_data[, c(i, j)]), +# res = res +# ) +# #} +# }, error = function(e) { +# # Return NULL in case of an error +# NULL +# }) +# +# return(res) +# } +# +# ########################################################################### +# ### FUNCTION CONSTRUCTMATRIX +# ### Build square matrices that represent similarities/distances of profiles +# ### lines: the output lines generated by CCM +# ### outputFile: the output file +# ### column: the column in 'lines' to use for matrix construction +# ### RETURNS nothing, WRITES the resulting matrix to a file +# ########################################################################## +# +# constructMatrix <- function(lines, outputFile, column) { +# # Initialize an empty list to store the entities +# entities <- list() +# +# for (line in lines) { +# # Parse the line into fields +# fields <- strsplit(line, "\t")[[1]] +# # Extract the entities +# entity1 <- fields[1] +# entity2 <- fields[2] +# +# # If the entities are new, add them to the list +# if (!entity1 %in% entities) { +# entities <- c(entities, entity1) +# } +# if (!entity2 %in% entities) { +# entities <- c(entities, entity2) +# } +# } +# +# # Initialize an empty matrix to store the distances +# distances <- +# matrix(nrow = length(entities), ncol = length(entities)) +# +# for (line in lines) { +# # Parse the line into fields +# fields <- strsplit(line, "\t")[[1]] +# # Extract the entities and the distance +# entity1 <- fields[1] +# entity2 <- fields[2] +# distance <- as.numeric(fields[column]) +# +# # Add the distance to the matrix +# distances[entities == entity1, entities == entity2] <- distance +# distances[entities == entity2, entities == entity1] <- distance +# } +# +# # Convert the matrix to a data frame and set the row and column names +# distances <- as.data.frame(distances) +# rownames(distances) <- entities +# colnames(distances) <- entities +# +# # Write the matrix to a file +# write.table(distances, +# file = outputFile, +# sep = "\t", +# quote = FALSE) +# +# } +# +# #################### +# ### MAIN PROGRAM ### +# #################### +# +# +# ########################################## FILTERS ########################################## +# +# ### Remove any profile that maps perfectly onto a clade in the tree. In practice this doesn't do much ### +# clade_filter = FALSE +# +# ### Apply pairwise heuristic based on some criterion: do not perform full comparison if heuristic doesn't satisfy the threshold ### +# ### (CURRENTLY NOT IMPLEMENTED: mutual information and Euclidean were bad, UniFrac might be worth a try) ### +# heuristic_filter = FALSE +# heuristic_threshold = 0.001 +# +# ### Coerce branches to a minimum threshold length (ALTERNATIVE: multiply all branch lengths by the minimum amount needed to get to threshold) +# coerce_branches = FALSE +# #min_branch_length = 0.001 +# +# ### Coerce multifurcations to bifurcations (alternative is to stop execution) +# fix_multifurcations = TRUE +# +# ######################################## PARSE CL ARGUMENTS ######################################## +# +# args <- commandArgs(trailingOnly = TRUE) +# parsed <- parse_args(args) +# +# ### Output file name (could be changed to a command-line argument) ### +# outputTree <- +# paste("EvolCCM_", basename(parsed$inputTree), sep = "") +# outputFile <- +# paste("EvolCCM_", basename(parsed$inputProfile), sep = "") +# +# ###################### READ THE TREE FILE AND FIX IF NECESSARY ###################### +# +# cat("\nReading tree...\n") +# tree <- read.tree(parsed$inputTree) +# tree <- modify_tree(tree, fix_multifurcations, coerce_branches) +# write.tree(tree, outputTree) +# +# ###################### READ THE PROFILE AND APPLY FILTERS ###################### +# +# ### Read the profile file ### +# +# cat("\nReading profile...\n") +# aprofile <- read.csv(parsed$inputProfile, row.names = 1, sep = "\t") +# filtered_data <- +# check_and_filter_profile(aprofile, tree, parsed$min_abund, parsed$max_abund) +# numFeatures <- dim(filtered_data)[2] +# +# ######################## PARALLELIZED NESTED LOOP FOR PAIRWISE PROFILE COMPARISONS ######################## +# +# cat("\nInitiating comparisons...\n") +# registerDoParallel(cores = parsed$num_cores) +# +# compare_from_vector <- parsed$compare_from_vector +# compare_to_vector <- parsed$compare_to_vector +# +# results <- +# foreach( +# i = 1:(numFeatures - 1), +# .combine = 'c', +# .multicombine = TRUE +# ) %dopar% { +# i_name <- colnames(filtered_data)[i] +# +# # If compare_from_str is defined, then check the condition. If not, then set to TRUE by default. +# i_condition <- ifelse(!is.null(compare_from_vector), +# any(sapply(compare_from_vector, function(x) { +# startsWith(i_name, x) +# })), +# TRUE) +# +# if (i_condition) { +# print(paste("Processing feature #", i, " (", i_name, ")", sep = ""), +# quote = FALSE) +# +# sapply((i + 1):numFeatures, function(j) { +# j_name <- colnames(filtered_data)[j] +# +# # If compare_to_str is defined, then check the condition. If not, then set to TRUE by default. +# j_condition <- ifelse(!is.null(compare_to_vector), +# any(sapply(compare_to_vector, function(x) { +# startsWith(j_name, x) +# })), +# TRUE) +# +# if (j_condition) { +# return(process_pair(i, j, filtered_data, tree)) +# } else { +# return(NULL) +# } +# }, simplify = FALSE) +# } else { +# return(NULL) +# } +# } +# +# cat("\nProfile comparisons complete!\n") +# +# ################################# OUTPUT ##################################### +# +# cat("\nWriting output...\n") +# +# ### Process the results and write them to chunks of output files ### +# chunk_size <- 100 +# output_file_pattern <- "output_temp_chunk_%d.txt" +# chunk_index <- 1 +# result_counter <- 1 +# +# for (result in results) { +# if (!is.null(result)) { +# if ((result_counter - 1) %% chunk_size == 0) { +# temp_output_file <- sprintf(output_file_pattern, chunk_index) +# chunk_index <- chunk_index + 1 +# } +# ### Fix NaNs unless user has specified otherwise +# if (parsed$show_nans != 1) { +# if (is.nan(result$res[6])) { +# # print(result$res) +# # print("Fixing NaN") +# result$res[6] = 0 +# result$res[7] = 1 +# # print(result$res) +# } +# } +# cat(paste(result$colnames, collapse = "\t"), +# file = temp_output_file, +# append = TRUE) +# cat("\t", file = temp_output_file, append = TRUE) +# cat(paste(result$res, collapse = "\t"), +# file = temp_output_file, +# append = TRUE) +# cat("\n", file = temp_output_file, append = TRUE) +# +# result_counter <- result_counter + 1 +# } +# } +# +# # Merge the chunks of output files into the main output file +# temp_files <- list.files(pattern = "output_temp_chunk_[0-9]+\\.txt") +# +# outputHeadings <- +# paste( +# "feature1", +# "feature2", +# "intrisic1", +# "intrisic2", +# "gainloss1", +# "gainloss2", +# "interaction", +# "interact_score", +# "interact_pval", +# sep = "\t" +# ) +# outputFilegz = gzfile(outputFile, "w") +# cat(outputHeadings, file = outputFilegz, sep = "\n") +# for (temp_file in temp_files) { +# # Read the contents of the temporary file +# temp_contents <- readLines(temp_file) +# +# # Write the contents to the main output file +# cat(temp_contents, +# file = outputFilegz, +# append = TRUE, +# sep = "\n") +# +# # Remove the temporary file +# file.remove(temp_file) +# } +# +# # Read the lines from the input file +# close(outputFilegz) +# lines <- readLines(outputFile)[-1] +# X2file = paste(outputFile, "X2", sep = ".") +# Pfile = paste(outputFile, "pvals", sep = ".") +# +# constructMatrix(lines, X2file, 8) +# constructMatrix(lines, Pfile, 9) +# +# cat("ParallelEvolCCM run done.\n\n")