diff --git a/.Rbuildignore b/.Rbuildignore index d7259c99..61669d35 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -6,3 +6,6 @@ ^README\.Rmd$ ^\.github$ ^LICENSE\.md$ +^\.semquery$ +^\.active_plans$ +^\.agents_context$ diff --git a/NAMESPACE b/NAMESPACE index b30ff979..7d9fc850 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -107,6 +107,10 @@ export(detectBatchEffects) export(detectOutlierSamples) export(detect_probetype) export(double_center_scale_fast) +export(drugs.analysisInfo) +export(drugs.enrichmentTable) +export(drugs.moaEnrichment) +export(drugs.supportBucket) export(eset.getPhenoData) export(eset.parsePhenoFromTitle) export(estimateBatchCorrectionVectors) @@ -388,7 +392,6 @@ export(pgx.createSignatureDatabaseH5) export(pgx.createSignatureDatabaseH5.fromMatrix) export(pgx.createSingleCellPGX) export(pgx.createVipGeneLayer) -export(pgx.create_reports) export(pgx.crosscheckINPUT) export(pgx.deconvolution) export(pgx.dimPlot) @@ -434,11 +437,14 @@ export(pgx.getPCSFcentrality) export(pgx.getSigGO) export(pgx.getSymbolFromFeatureData) export(pgx.getTopDrugs) +export(pgx.getTopGS) export(pgx.getTopGeneSets) export(pgx.getTopGenes) +export(pgx.getTopMOA) export(pgx.inferCellCyclePhase) export(pgx.inferCellType) export(pgx.inferGender) +export(pgx.info) export(pgx.initialize) export(pgx.justSeuratObject) export(pgx.load) @@ -489,6 +495,8 @@ export(pgx.testPhenoCorrelation) export(pgx.testPhenoCorrelation2) export(pgx.testTCGAsurvival) export(pgx.update_drugs_results) +export(pgx.update_infographics) +export(pgx.update_reports) export(pgx.variableImportance) export(pgx.violinPlot) export(pgx.wgcna) @@ -558,9 +566,9 @@ export(rownames.data.table) export(rowscale) export(rowsduplicated.dgCMatrix) export(rpt.compile_cmap_report) -export(rpt.compile_sections) export(rpt.compile_wgcna_report) export(rpt.create_full_report) +export(rpt.create_summary_report) export(run.GSEA) export(run.GSEA.LeadingEdge) export(run.GSEA.preranked) @@ -633,6 +641,7 @@ export(wgcna.create_infographic) export(wgcna.create_module_infographic) export(wgcna.create_report) export(wgcna.describeModules) +export(wgcna.ensureStats) export(wgcna.filterColors) export(wgcna.getConsensusGeneStats) export(wgcna.getConsensusTopGenesAndSets) @@ -641,6 +650,7 @@ export(wgcna.getModuleCrossGenes) export(wgcna.getMultiTopGenesAndSets) export(wgcna.getTopGenesAndSets) export(wgcna.getTopModules) +export(wgcna.get_modTraits) export(wgcna.init) export(wgcna.matchColors) export(wgcna.pickSoftThreshold) diff --git a/R/compute2-extra.R b/R/compute2-extra.R index d7f26e4c..30c12882 100644 --- a/R/compute2-extra.R +++ b/R/compute2-extra.R @@ -17,7 +17,7 @@ #' @export compute_extra <- function(pgx, extra = c( "meta.go", "infer", "deconv", "drugs", ## "graph", - "connectivity", "wordcloud", "wgcna", "mofa" + "connectivity", "wordcloud", "wgcna", "wgcna_mox", "mofa" ), sigdb = NULL, pgx.dir = "./data", libx.dir = "./libx", user_input_dir = getwd(), llm_model = NULL, img_model = NULL @@ -253,6 +253,41 @@ compute_extra <- function(pgx, extra = c( timings <- rbind(timings, c("wgcna", tt)) } + if ("wgcna_mox" %in% extra && isTRUE(pgx$datatype %in% c("multi-omics", "multiomics"))) { + info("[compute_extra] Computing multi-omics WGCNA...") + tt <- system.time({ + tryCatch( + { + pgx$wgcna_mox <- wgcna.compute_multiomics( + dataX = mofa.split_data(pgx$X), + samples = pgx$samples, + contrasts = pgx$contrasts, + annot = pgx$genes, + GMT = pgx$GMT, + experiment = pgx$description, + power = NULL, + ngenes = 2000, + deepsplit = 2, + minmodsize = 10, + minKME = 0.3, + compute.enrichment = TRUE, + add.gsets = FALSE, + add.pheno = FALSE, + do.consensus = FALSE, + report = FALSE + ) + }, + error = function(e) { + message("[ERROR_WGCNA_MOX] FATAL: ", as.character(e)) + write(as.character(e), file = paste0(user_input_dir, "/ERROR_WGCNA_MOX")) + return(NULL) + } + ) + }) + timings <- rbind(timings, c("wgcna_mox", tt)) + info("[compute_extra] multi-omics WGCNA done") + } + if ("mofa" %in% extra) { message(">>> Computing MOFA...") tt <- system.time({ diff --git a/R/pgx-annot.R b/R/pgx-annot.R index e64d03aa..dca9409f 100644 --- a/R/pgx-annot.R +++ b/R/pgx-annot.R @@ -1321,6 +1321,10 @@ getHumanOrtholog.biomart <- function(organism, symbols, verbose = 1) { #' @export probe2symbol <- function(probes, annot_table, query = "symbol", key = NULL, fill_na = FALSE, add_datatype = FALSE) { + + # NULL annot_table: no mapping possible — return probes as-is (same behaviour as fill_na=TRUE) + if (is.null(annot_table)) return(probes) + # Prepare inputs. add extra matching columns. annot_table <- cbind(rownames = rownames(annot_table), annot_table) id.cols <- intersect(c("feature", "gene_name", "symbol"), colnames(annot_table)) diff --git a/R/pgx-drugs-report.R b/R/pgx-drugs-report.R new file mode 100644 index 00000000..e0b728ba --- /dev/null +++ b/R/pgx-drugs-report.R @@ -0,0 +1,263 @@ +## +## This file is part of the Omics Playground project. +## Copyright (c) 2018-2026 BigOmics Analytics SA. All rights reserved. +## + +## ============================================================================ +## Drug Connectivity Report Utilities +## ---------------------------------------------------------------------------- +## Lightweight extractors and tier classifiers consumed by the drug +## connectivity AI report (omicsplayground board.drugconnectivity) and +## by any future board or pipeline that needs to summarise drug +## enrichment results. +## +## The functions in this file are intentionally small and pure: they +## operate on pre-computed pgx$drugs[[method]] matrices and return +## data frames or lists, with no Shiny or LLM dependencies. +## ============================================================================ + +## Internal: split pipe / semicolon / comma separated annotation tokens. +## L1000 / CMap annotations sometimes ship mixed-encoding bytes; iconv with +## sub = "" silently drops invalid sequences so strsplit's regex engine +## can run without "input string is invalid" / "unable to translate" warnings. +.drugs_parseTokens <- function(x) { + x <- as.character(x %||% "") + x[is.na(x)] <- "" + x <- iconv(x, from = "UTF-8", to = "UTF-8", sub = "") + x[is.na(x)] <- "" + lapply(x, function(s) trimws(strsplit(s, split = "[\\|;,]")[[1]])) +} + + +#' @title NES direction semantics for a drug connectivity analysis type +#' +#' @param analysis_type Character. Name of the analysis backing the drug +#' enrichment result, e.g. \code{"L1000/activity"}, \code{"L1000/gene"}, +#' \code{"CTRPv2/sensitivity"}, \code{"GDSC/sensitivity"}. +#' +#' @return A list with two character scalars: +#' \describe{ +#' \item{\code{analysis_type}}{the input value (verbatim, or \code{"unknown"}).} +#' \item{\code{analysis_type_description}}{a paragraph describing the +#' data source and the direction semantics of NES for that backend.} +#' } +#' +#' @description Returns a deterministic, human-readable description of a +#' drug connectivity analysis type, including the meaning of positive vs. +#' negative NES. L1000-based analyses interpret negative NES as opposition +#' (reversal candidate); pharmacological sensitivity analyses (CTRPv2, +#' GDSC) interpret positive NES as predicted drug vulnerability. +#' +#' @details The returned description is intended for inclusion in AI report +#' prompts so the language model interprets NES sign correctly for the +#' selected backend. Matching is case-insensitive. +#' +#' @export +drugs.analysisInfo <- function(analysis_type) { + key <- tolower(analysis_type %||% "") + + desc <- if (key %in% c("l1000/activity", "l1000_activitys_n20d1011")) { + paste( + "L1000 Transcriptional Activity Scores (TAS) connectivity.", + "Each drug signature is weighted by strength and reproducibility across cell lines.", + "Negative NES = drug opposes the experimental state (reversal candidate);", + "positive NES = drug mimics it (shared transcriptional programme)." + ) + } else if (key == "l1000/gene") { + paste( + "L1000 gene-level log-fold-change connectivity (978 landmark genes, per-cell-line).", + "More granular than TAS but noisier; best used to corroborate TAS findings.", + "Negative NES = drug opposes the experimental state;", + "positive NES = drug mimics it." + ) + } else if (key == "ctrpv2/sensitivity") { + paste( + "CTRPv2 pharmacological sensitivity connectivity (AUC dose-response,", + "~481 compounds, ~860 cancer cell lines).", + "NES reflects co-variation of drug sensitivity with the experimental signature.", + "Positive NES = the experimental state predicts drug vulnerability (sensitivity);", + "negative NES = resistance or insensitivity." + ) + } else if (key == "gdsc/sensitivity") { + paste( + "GDSC pharmacogenomic sensitivity connectivity (IC50-based,", + "~367 clinical anti-cancer drugs, ~987 cell lines).", + "Strongest clinical relevance due to approved and late-stage compounds.", + "Positive NES = the experimental state predicts drug sensitivity;", + "negative NES = predicted resistance." + ) + } else { + paste( + "Drug connectivity analysis based on the selected pre-computed signature resource.", + "Interpret results according to the selected analysis type metadata." + ) + } + + list( + analysis_type = analysis_type %||% "unknown", + analysis_type_description = desc + ) +} + + +#' @title Build a drug enrichment (DSEA) table for one contrast/method +#' +#' @param pgx A pgx object holding pre-computed drug enrichment matrices in +#' \code{pgx$drugs[[method]]} (X = NES, P = pvalue, Q = qvalue, annot). +#' @param method Character. Name of the entry in \code{pgx$drugs} to read. +#' @param contrast Character. Contrast name (column of \code{pgx$drugs[[method]]$X}). +#' @param annot Optional data.frame of drug annotations with at least the +#' columns \code{moa} and \code{target} and row names matching drug names. +#' If \code{NULL}, the function falls back to +#' \code{pgx$drugs[[method]]$annot}. +#' @param only_annotated Logical. If \code{TRUE}, drop drugs that have empty +#' MOA and target after the join. +#' +#' @return A data.frame ordered by descending \code{|NES|}, with columns +#' \code{drug}, \code{NES}, \code{pval}, \code{padj}, \code{moa}, \code{target}. +#' Returns \code{NULL} if the inputs are missing or the contrast does not +#' exist. +#' +#' @description Extracts a single-contrast drug enrichment table from +#' pre-computed \code{pgx$drugs[[method]]} matrices, joining drug-level +#' MOA and target annotations when available. This is the input +#' expected by \code{\link{drugs.moaEnrichment}}. +#' +#' @details Compared to \code{\link{pgx.getDrugConnectivityTable}}, this +#' function accepts an explicit \code{annot} argument so callers can +#' supply annotations from an external source when +#' \code{pgx$drugs[[method]]$annot} is missing. NES/pval/padj are +#' rounded to four digits; missing NES are coerced to 0 and missing +#' p/q-values to 1. +#' +#' @export +drugs.enrichmentTable <- function(pgx, method, contrast, + annot = NULL, only_annotated = FALSE) { + if (is.null(pgx$drugs) || is.null(method) || !nzchar(method)) return(NULL) + dr <- pgx$drugs[[method]] + if (is.null(dr) || is.null(contrast) || !contrast %in% colnames(dr$X)) return(NULL) + + nes <- round(dr$X[, contrast], 4) + pv <- round(dr$P[, contrast], 4) + qv <- round(dr$Q[, contrast], 4) + drug <- rownames(dr$X) + + if (is.null(annot)) annot <- dr$annot + + nes[is.na(nes)] <- 0 + qv[is.na(qv)] <- 1 + pv[is.na(pv)] <- 1 + + if (is.null(annot)) { + dt <- data.frame( + drug = drug, NES = nes, pval = pv, padj = qv, + moa = "", target = "", + stringsAsFactors = FALSE + ) + } else { + jj <- match(toupper(drug), toupper(rownames(annot))) + moa_col <- if ("moa" %in% colnames(annot)) "moa" else NA_character_ + target_col <- if ("target" %in% colnames(annot)) "target" else NA_character_ + aa <- data.frame( + moa = if (!is.na(moa_col)) annot[jj, moa_col] else "", + target = if (!is.na(target_col)) annot[jj, target_col] else "", + stringsAsFactors = FALSE + ) + dt <- data.frame( + drug = drug, NES = nes, pval = pv, padj = qv, aa, + stringsAsFactors = FALSE + ) + } + + dt <- dt[order(-abs(dt$NES)), , drop = FALSE] + rownames(dt) <- dt$drug + + if (isTRUE(only_annotated)) { + sel <- which((dt$moa %||% "") != "" | (dt$target %||% "") != "") + dt <- dt[sel, , drop = FALSE] + } + + dt +} + + +#' @title On-the-fly MOA / target enrichment via fgsea +#' +#' @param dsea_table A data.frame produced by \code{\link{drugs.enrichmentTable}}, +#' with at least the columns \code{NES}, \code{moa}, \code{target} and +#' drug names as row names. +#' @param field One of \code{"moa"} or \code{"target"}. Selects the +#' annotation column to tokenise into pseudo gene sets. +#' @param nperm Integer. Number of fgsea permutations. +#' +#' @return A data.frame of fgsea results (\code{pathway}, \code{NES}, +#' \code{pval}, \code{padj}, \code{size}, ...) ordered by descending +#' \code{|NES|}. Returns \code{NULL} if the input is empty or no +#' tokens are found. +#' +#' @description Computes MOA-level or target-level enrichment by treating +#' each annotation token as a pseudo gene set over the drug-ranked NES +#' vector. Unlike \code{\link{pgx.getDrugMOATable}}, which requires +#' \code{pgx$drugs[[method]]$moa} to be pre-populated, this function +#' recomputes the enrichment on demand from the drug-level table. +#' +#' @details Tokens are extracted from the selected annotation column by +#' splitting on \code{|}, \code{;} or \code{,}. Empty / \code{NA} / +#' \code{"N/A"} tokens are dropped. Each unique token defines a gene set +#' (the set of drugs annotated with that token), and fgsea ranks them +#' against the NES vector. Useful for AI report generation where the +#' board only has the drug-level table and needs MOA-level signal on the +#' fly. +#' +#' @export +drugs.moaEnrichment <- function(dsea_table, field = c("moa", "target"), + nperm = 5000) { + field <- match.arg(field) + if (is.null(dsea_table) || !is.data.frame(dsea_table) || nrow(dsea_table) == 0) { + return(NULL) + } + + terms.list <- .drugs_parseTokens(dsea_table[[field]]) + names(terms.list) <- rownames(dsea_table) + terms <- setdiff(unique(unlist(terms.list)), c(NA, "", " ", "NA", "N/A")) + if (length(terms) == 0) return(NULL) + + gmt <- lapply(terms, function(g) { + names(which(sapply(terms.list, function(t) g %in% t))) + }) + names(gmt) <- terms + + rnk <- dsea_table$NES + names(rnk) <- rownames(dsea_table) + + res <- suppressWarnings(tryCatch( + fgsea::fgsea(gmt, rnk, nperm = nperm), + error = function(e) NULL + )) + if (is.null(res) || !is.data.frame(res) || nrow(res) == 0) return(NULL) + + res[order(-abs(res$NES)), , drop = FALSE] +} + + +#' @title Classify a result by significance support bucket +#' +#' @param padj Numeric. Adjusted p-value (q-value) of one result. +#' @param pval Numeric. Raw p-value of the same result. Used as a +#' fallback when \code{padj} is \code{NA} or above 0.05. +#' +#' @return A character scalar: \code{"significant"} when \code{padj < 0.05}, +#' \code{"nominal"} when only \code{pval < 0.05}, otherwise +#' \code{"unsupported"}. +#' +#' @description Tier classifier used to gate evidence buckets in AI report +#' generation: significant results drive headline claims, nominal +#' results are mentioned with hedging, unsupported results are +#' suppressed. +#' +#' @export +drugs.supportBucket <- function(padj, pval = NA_real_) { + if (!is.na(padj) && padj < 0.05) return("significant") + if (!is.na(pval) && pval < 0.05) return("nominal") + "unsupported" +} diff --git a/R/pgx-wgcna-report.R b/R/pgx-wgcna-report.R new file mode 100644 index 00000000..d225b7b4 --- /dev/null +++ b/R/pgx-wgcna-report.R @@ -0,0 +1,566 @@ +## Copyright (c) 2018-2026 BigOmics Analytics SA. All rights reserved. + +# ============================================================================= +# WGCNA AI-report extraction module +# ============================================================================= +# Helpers consumed by the omicsplayground board.wgcna AI-report layer plus +# multi-omics / consensus variants. This module owns all "describe the +# computed WGCNA result" logic; pgx-wgcna.R retains only the compute side +# (wgcna.compute, wgcna.computeGeneStats, plot helpers, etc). +# +# Migration history (epic playbase-fad): +# - Original symbols lived in pgx-wgcna.R lines 5599-5989 alongside the +# compute code. Moved here verbatim in the consolidation step. +# - `wgcna.selectTopModules` from the initial duplication step has been +# absorbed into the new `wgcna.getTopModules` (with `min_modules` floor). + +# ----------------------------------------------------------------------------- +# Module-trait correlation +# ----------------------------------------------------------------------------- + +#' Module-trait correlation matrix +#' +#' Returns the cached `wgcna$modTraits` matrix when present, otherwise computes +#' the Pearson correlation between module eigengenes (`wgcna$net$MEs`) and the +#' trait matrix (`wgcna$datTraits`). NA cells are filled with zero. +#' +#' @param wgcna WGCNA result object. +#' @return Numeric matrix (modules × traits). +#' @export +wgcna.get_modTraits <- function(wgcna) { + if(!is.null(wgcna$modTraits)) { + M <- wgcna$modTraits + } else { + M <- cor( wgcna$net$MEs, wgcna$datTraits, use="pairwise") + } + M[is.na(M)] <- 0 + return(M) +} + + +# ----------------------------------------------------------------------------- +# Top genes / sets / phenotypes (single-omics, multi-omics, consensus) +# ----------------------------------------------------------------------------- + +#' Top genes, sets, and phenotypes per module +#' +#' Dispatch is shape-based: consensus objects (with `$layers` and a list-valued +#' `$datExpr`) go to `wgcna.getConsensusTopGenesAndSets()`; multi-omics objects +#' (with `$layers` but no `$datExpr`) go to `wgcna.getMultiTopGenesAndSets()`; +#' everything else is treated as single-omics. +#' +#' @param wgcna WGCNA result object. +#' @param annot Annotation table; optional but recommended. +#' @param module Optional module subset. +#' @param ntop Integer; top N per module. +#' @param psig Numeric; p-value cutoff for moduleMembership significance. +#' @param level "gene" or "geneset". +#' @param rename Annotation column to use when renaming features. +#' @return list(sets, genes, pheno, neg.pheno). +#' @export +wgcna.getTopGenesAndSets <- function(wgcna, annot=NULL, module=NULL, ntop=40, + psig = 0.05, level="gene", rename="symbol") { + + is.consensus <- "layers" %in% names(wgcna) && class(wgcna$datExpr) == "list" + is.multi <- "layers" %in% names(wgcna) && is.null(wgcna$datExpr) + + if(is.consensus) { + top <- wgcna.getConsensusTopGenesAndSets(wgcna, annot=annot, + module=module, ntop=ntop, rename=rename) + return(top) + } + + if(is.multi) { + top <- wgcna.getMultiTopGenesAndSets( + wgcna$layers, annot=annot, module=module, psig=psig, ntop=ntop, + level=level, rename=rename) + return(top) + } + + stats <- NULL + if(!"stats" %in% names(wgcna)) { + stats <- wgcna.computeGeneStats(wgcna$net, wgcna$datExpr, wgcna$datTraits, + wgcna$svTOM) + } else { + stats <- wgcna$stats + } + if(!any(c("gse","gsea") %in% names(wgcna))) { + warning("object has no enrichment results (gsea)") + } + + ## get top genes by centrality-weighted-meanFC2 + mm <- stats$moduleMembership + mm.sig <- 1*(stats$MMPvalue <= psig) + ff <- sqrt(rowMeans(stats$foldChange**2, na.rm=TRUE)) + mm <- mm * mm.sig * ff + if(!is.null(annot)) { + annot$gene_title <- paste0(annot$gene_title," (",annot$symbol,")") + mm <- rename_by2(mm, annot, new_id=rename) + } + gg <- rownames(mm) + mm <- as.list(data.frame(mm)) + if(!is.null(module)) mm <- mm[which(names(mm) %in% module)] + for(i in 1:length(mm)) names(mm[[i]]) <- gg + mm <- lapply(mm, function(x) x[x!=0]) + topgenes <- lapply(mm, function(x) names(head(sort(-x),ntop))) + + ## top genesets + topsets <- NULL + if(any(c("gse","gsea") %in% names(wgcna))) { + if(!is.null(wgcna$gsea)) ee <- wgcna$gsea + if(!is.null(wgcna$gse)) ee <- wgcna$gse + if(!is.null(module)) ee <- ee[which(names(ee) %in% module)] + topsets <- lapply(ee,function(x) head(rownames(x),ntop)) + } + + ## top correlated phenotypes + M <- wgcna.get_modTraits(wgcna) + top.pheno <- apply(M, 1, function(x) names(which(x > 0.8*max(x, na.rm=TRUE)))) + top.negpheno <- apply(M, 1, function(x) names(which(x < 0.8*min(x, na.rm=TRUE)))) + + if(level=="geneset") { + topsets <- topgenes + topgenes <- NULL + } + + list(sets = topsets, genes = topgenes, pheno = top.pheno, neg.pheno = top.negpheno) +} + +#' Top genes/sets/phenotypes for a multi-omics WGCNA object (layered). +#' @export +wgcna.getMultiTopGenesAndSets <- function(multi_wgcna, annot=NULL, module=NULL, + psig=0.05, ntop=40, level=NULL, + rename="symbol") { + + if("layers" %in% names(multi_wgcna)) { + multi_wgcna <- multi_wgcna$layers + } + + ## set level + nw <- length(multi_wgcna) + if(!is.null(level)) { + level <- head(rep(level, nw),nw) + } else { + level <- c("gene","geneset")[1 + 1*grepl("^gs|^gset|geneset",names(multi_wgcna))] + } + names(level) <- names(multi_wgcna) + + toplist <- list() + k <- names(multi_wgcna)[1] + for (k in names(multi_wgcna)) { + topk <- wgcna.getTopGenesAndSets( + multi_wgcna[[k]], module=module, annot=annot, + ntop=ntop, psig=psig, level=level[[k]], rename=rename) + if(!is.null(module)) { + topk <- lapply( topk, function(s) s[which(names(s) %in% module)] ) + } + toplist[[k]] <- topk + } + + top <- list() + top$genes <- lapply(toplist, function(t) t[['genes']]) + names(top$genes) <- NULL + top$genes <- unlist(top$genes, recursive = FALSE) + + top$sets <- lapply(toplist, function(t) t[["sets"]]) + names(top$sets) <- NULL + top$sets <- unlist(top$sets, recursive = FALSE) + + top$pheno <- lapply(toplist, function(t) t[["pheno"]]) + names(top$pheno) <- NULL + top$pheno <- unlist(top$pheno, recursive = FALSE) + + top$neg.pheno <- lapply(toplist, function(t) t[["neg.pheno"]]) + names(top$neg.pheno) <- NULL + top$neg.pheno <- unlist(top$neg.pheno, recursive = FALSE) + + return(top) +} + + +#' Top genes/sets/phenotypes for a consensus WGCNA object. +#' @export +wgcna.getConsensusTopGenesAndSets <- function(cons, annot=NULL, module=NULL, ntop=40, + level=c("gene","geneset")[1], + rename="symbol" ) { + if(!"stats" %in% names(cons)) stop("object has no stats") + if(!any(c("gse","gsea") %in% names(cons))) { + warning("object has no enrichment results (gsea)") + } + + if(!is.null(annot)) { + annot$gene_title <- paste0(annot$gene_title," (",annot$symbol,")") + } + + ## get top genes (highest kME) + topgenesx <- list() + for(i in 1:length(cons$stats)) { + mm <- cons$stats[[i]]$moduleMembership + if(!is.null(annot)) { + mm <- rename_by2(mm, annot, rename) + } + gg <- rownames(mm) + mm <- as.list(data.frame(mm)) + if (!is.null(module)) mm <- mm[module] + sel.topgenes <- lapply(mm, function(x) head(order(-x), 3 * ntop)) + topgenesx[[i]] <- lapply(sel.topgenes, function(i) gg[i]) + } + + ## intersect topgenes across all datatypes + topgenes <- topgenesx[[1]] + k <- 2 + for (k in 2:length(topgenesx)) { + topgenes <- mapply(intersect, topgenes, topgenesx[[k]], SIMPLIFY = FALSE) + } + topgenes <- lapply(topgenes, head, ntop) + + if(!is.null(module)) { + sel <- intersect(names(topgenes),module) + topgenes <- topgenes[sel] + } + + ## top genesets (as symbol!) + topsets <- NULL + if(any(c("gse","gsea") %in% names(cons))) { + if(!is.null(cons$gsea)) ee <- cons$gsea + if(!is.null(cons$gse)) ee <- cons$gse + ee <- ee[match(names(topgenes),names(ee))] + names(ee) <- names(topgenes) + topsets <- lapply(ee,function(x) head(rownames(x),ntop)) + } + + ## module traits + M <- lapply(cons$net$multiMEs, function(x) as.matrix(x$data)) + Y <- lapply(M, function(m) cons$datTraits[rownames(m),]) + R <- mapply( function(x,y) abs(cor(x,y,use="pairwise")), M, Y, SIMPLIFY=FALSE) + R <- Reduce('+', R) + top.pheno <- apply(R, 1, function(x) names(which(x > 0.8*max(x,na.rm=TRUE))), + simplify = FALSE) + top.negpheno <- apply(R, 1, function(x) names(which(x < 0.8*min(x,na.rm=TRUE))), + simplify = FALSE) + + if (level == "geneset") { + topsets <- topgenes + topgenes <- NULL + } + + list(sets = topsets, genes = topgenes, pheno = top.pheno, neg.pheno = top.negpheno) +} + + +# ----------------------------------------------------------------------------- +# Module selection (with minimum-count floor) +# ----------------------------------------------------------------------------- + +#' Select top modules with a minimum-count floor +#' +#' A module qualifies if (a) it has at least one trait p-value below `psig`, or +#' (b) its row in the module-trait correlation matrix has at least one column +#' meeting `topratio * max(R)` (clipped at `minrho`). +#' +#' If the resulting selection has fewer than `min_modules` non-grey modules, +#' the selection is padded with the next-strongest non-grey modules ranked by +#' `rowMeans(abs(R))`. If the input has `<= min_modules` non-grey modules in +#' total, all non-grey modules are returned unfiltered. +#' +#' Replaces the older `wgcna.getTopModules()` variant whose `kx` argument was +#' deprecated; the new signature drops `kx` entirely and adds `min_modules`. +#' +#' @param wgcna WGCNA result object (single-omics or layered). +#' @param min_modules Integer; minimum number of modules to return. +#' @param topratio Numeric. +#' @param psig Numeric. +#' @param minrho Numeric. +#' @param rm.grey Logical; drop grey from the selection. +#' @param multi Logical or NULL; force multi-layer dispatch. +#' @return Character vector of module names. +#' @export +wgcna.getTopModules <- function(wgcna, min_modules = 5L, + topratio = 0.85, psig = 0.05, + minrho = 0.1, rm.grey = TRUE, + multi = NULL) { + + ## --- Resolve the candidate non-grey module pool from me.genes ------------- + ## Multi-layer objects may not carry me.genes at the top level; skip the + ## bypass/floor in that case and let the per-layer selection below drive. + raw <- names(wgcna$me.genes) + non_grey <- character(0) + if (length(raw) > 0) { + suf <- suppressWarnings(as.integer(sub("^ME", "", raw))) + display <- raw + if (length(suf) > 0 && all(!is.na(suf))) { + display <- paste0("ME", WGCNA::labels2colors(suf)) + } + is_grey <- raw %in% c("MEgrey", "grey") | display %in% c("MEgrey", "grey") + non_grey <- raw[!is_grey] + + ## Bypass: too few non-grey modules to filter meaningfully — return all. + if (length(non_grey) <= min_modules) { + return(non_grey) + } + } + + ## --- Primary selection (significance + topratio gating, per-layer) -------- + if (is.null(topratio)) topratio <- 0.85 + if (is.null(multi) && !is.null(wgcna$layers)) multi <- TRUE + if (is.null(multi)) multi <- FALSE + if (!multi) { + ww <- list(gx = wgcna) + } else if (!is.null(wgcna$layers)) { + ww <- wgcna$layers + } else { + ww <- wgcna + } + + R <- list() + P <- list() + for (i in seq_along(ww)) { + me <- ww[[i]]$net$MEs + dt <- ww[[i]]$datTraits + R1 <- cor(me, dt, use = "pairwise") + ndim <- colSums(!is.na(dt)) + P1 <- sapply(seq_len(ncol(dt)), function(j) cor.pvalue(R1[, j], ndim[j])) + colnames(P1) <- colnames(dt) + R[[i]] <- R1 + P[[i]] <- P1 + } + + selected <- c() + for (i in seq_along(R)) { + idx1 <- which(rowSums(P[[i]] <= psig) > 0) + rmax <- topratio * pmax(apply(R[[i]], 2, max, na.rm = TRUE), 0) + rmax <- pmax(rmax, minrho) + idx2 <- which(colSums(t(R[[i]]) >= rmax) > 0) + idx <- setdiff(unique(c(idx1, idx2)), 0) + selected <- c(selected, rownames(R[[i]])[idx]) + } + + if (rm.grey) { + sel.grey <- grepl("[A-Z]{2}grey$", selected) + selected <- selected[!sel.grey] + } + selected <- as.character(selected) + + ## --- Floor padding -------------------------------------------------------- + if (length(selected) >= min_modules) { + return(selected) + } + + me <- wgcna$net$MEs + dt <- wgcna$datTraits + if (!is.data.frame(me)) me <- tryCatch(as.data.frame(me), error = function(e) NULL) + if (!is.data.frame(dt)) dt <- tryCatch(as.data.frame(dt), error = function(e) NULL) + if (is.null(me) || is.null(dt) || ncol(me) == 0 || ncol(dt) == 0) { + message("[wgcna.getTopModules] cannot pad to floor — module-trait ", + "matrix unavailable. Returning ", length(selected), " modules.") + return(selected) + } + + Rpad <- cor(me, dt, use = "pairwise") + mx <- rowMeans(abs(Rpad), na.rm = TRUE) + mx <- mx[intersect(names(mx), non_grey)] + mx <- sort(mx, decreasing = TRUE) + + need <- min_modules - length(selected) + candidates <- setdiff(names(mx), selected) + c(selected, head(candidates, need)) +} + + +# ----------------------------------------------------------------------------- +# Ensure stats are populated (lazy fill for older PGX files) +# ----------------------------------------------------------------------------- + +#' Ensure a WGCNA object has the fields needed for downstream extraction +#' +#' Older PGX files sometimes ship without `wgcna$stats` or with empty +#' `wgcna$net$labels`, which breaks `wgcna.getGeneStats()`. This helper +#' lazily populates both. Idempotent. +#' +#' @param wgcna WGCNA result object. +#' @return The (possibly mutated) `wgcna` object. +#' @export +wgcna.ensureStats <- function(wgcna) { + needs_labels <- is.null(wgcna$net$labels) || length(wgcna$net$labels) == 0 + needs_stats <- is.null(wgcna$stats) || length(wgcna$stats) == 0 + + if (needs_labels && !is.null(wgcna$net$colors)) { + ## wgcna.getGeneStats() expects ME-prefixed labels regardless of whether + ## the colors are integer-shape (example-data) or color-name shape. + raw <- as.character(wgcna$net$colors) + needs_prefix <- !grepl("^ME", raw) + raw[needs_prefix] <- paste0("ME", raw[needs_prefix]) + wgcna$net$labels <- raw + } + + if (needs_stats) { + if (is.null(wgcna$net) || is.null(wgcna$datExpr) || is.null(wgcna$datTraits)) { + message("[wgcna.ensureStats] cannot compute stats — net/datExpr/datTraits missing") + return(wgcna) + } + tom <- if (!is.null(wgcna$TOM)) wgcna$TOM + else if (!is.null(wgcna$svTOM)) wgcna$svTOM + else if (!is.null(wgcna$wTOM)) wgcna$wTOM + else NULL + s <- tryCatch( + wgcna.computeGeneStats( + net = wgcna$net, + datExpr = wgcna$datExpr, + datTraits = wgcna$datTraits, + TOM = tom + ), + error = function(e) { + message("[wgcna.ensureStats] wgcna.computeGeneStats failed: ", + conditionMessage(e)) + NULL + } + ) + if (!is.null(s)) wgcna$stats <- s + } + + wgcna +} + + +# ----------------------------------------------------------------------------- +# Describe modules (legacy LLM helper) +# ----------------------------------------------------------------------------- + +#' Per-module LLM description helper. +#' +#' Used by `wgcna.create_report()` and other pre-AI-report-tab features. The +#' newer AI-report tab in board.wgcna does not call this; it builds its own +#' structured prompts via the omicsai stack. +#' +#' @export +wgcna.describeModules <- function(wgcna, ntop=50, psig = 0.05, + annot=NULL, multi=FALSE, modules=NULL, + experiment=NULL, verbose=1, model=DEFAULT_LLM, + docstyle = "detailed summary", numpar = 2, + level="gene") { + + if(is.null(annot)) { + message("[wgcna.describeModules] WARNING. user annot table is recommended.") + } + + if(multi) { + top <- wgcna.getMultiTopGenesAndSets(wgcna, annot=annot, ntop=ntop, + psig=psig, level=NULL, rename="gene_title") + } else { + top <- wgcna.getTopGenesAndSets(wgcna, annot=annot, ntop=ntop, + psig=psig, level=level, rename="gene_title") + } + + if(is.null(modules)) { + modules <- union(names(top$genes), names(top$sets)) + } + + if(is.null(experiment) && !is.null(wgcna$experiment)) experiment <- wgcna$experiment + if(is.null(experiment)) experiment <- "" + + if(length(modules)==0) { + info("[wgcna.describeModules] warning: empty module list!") + return(NULL) + } + + ## If no LLM is available we do just a manual summary + model <- setdiff(model, c("", NA)) + if (is.null(model) || length(model) == 0) { + desc <- list() + for(m in modules) { + ss=gg=pp=nn="" + + if(!is.null(top$genes[[m]])) { + gg <- paste( top$genes[[m]], collapse=', ') + } + if(!is.null(top$sets[[m]])) { + ss <- paste( sub(".*:","",top$sets[[m]]), collapse='; ') + } + if(m %in% names(top$pheno)) { + pp <- paste( top$pheno[[m]], collapse='; ') + } + if(m %in% names(top$neg.pheno)) { + nn <- paste( top$neg.pheno[[m]], collapse='; ') + } + d <- "" + if(!is.null(pp)) d <- paste(d, "**Positively correlated phenotypes**:", pp, "\n\n") + if(!is.null(nn)) d <- paste(d, "**Negatively correlated phenotypes**:", nn, "\n\n") + if(!is.null(gg) && gg!="") { + d <- paste(d, "**Key genes**:", gg, "\n\n") + } + if(!is.null(ss) && ss!="") { + d <- paste(d, "**Top enriched gene sets**:", ss, "\n\n") + } + desc[[m]] <- d + } + + res <- list( + prompt = NULL, + questions = NULL, + answers = desc + ) + return(res) + } + + prompt <- paste("Give a",docstyle,"of the main overall biological function of the following top enriched genesets belonging to module . After that, shortly discuss if any of these key genes/proteins/metabolites might be involved in the biological function. No need to mention all, just a few. Discuss the possible relationship with phenotypes of this experiment about \"\". Use maximum",numpar,"paragraphs. Use prose, do not use any bullet points or tables. \n\nHere is list of enriched gene sets:\n \n\n") + + prompt <- paste("These are part of the results of a WGCNA analysis of an experiment about \"\". Give a", docstyle, "of the main overall biological function of the following top enriched genesets belonging to module . Discuss the possible relationship with positively correlated phenotypes and, if not obvious, negatively correlated phenotypes . Use maximum", numpar, "paragraphs. Do not use any bullet points. \n\nHere is list of enriched gene sets: \n") + + if (verbose > 1) cat(prompt) + + desc <- list() + questions <- list() + for (k in modules) { + if (verbose > 0) message("Describing module ", k) + + ss=gg=pp=nn="" + if(length(top$sets[[k]])>0) { + ss <- sub( ".*:","", top$sets[[k]] ) + ss <- paste(ss, collapse=';') + } else { + ss <- "[no significant genesets]" + } + + if(k %in% names(top$pheno)) { + pp <- paste0("'",top$pheno[[k]],"'") + pp <- paste( pp, collapse=';') + } + if(k %in% names(top$neg.pheno)) { + nn <- paste0("'",top$neg.pheno[[k]],"'") + nn <- paste( nn, collapse=';') + } + + q <- prompt + + if(length(top$genes[[k]])>0) { + gg <- paste( top$genes[[k]], collapse=';') + q <- paste(q, "\nHere is the list of key genes/proteins/metabolites, or so-called 'features'. Only use features that are in this list in your answer. Do not mention features not in this list. : \n") + } + + q <- sub("", k, q) + q <- sub("", pp, q) + q <- sub("", nn, q) + q <- sub("", experiment, q) + q <- sub("", ss, q) + q <- sub("", gg, q) + + answer <- "" + for (m in model) { + if (verbose > 0) message(" ...asking LLM model ", m) + a <- ai.ask(q, model = m) + a <- paste0(a, "\n\n[AI generated using ", m, "]\n") + if (length(model) > 1) a <- paste0("\n-------------------------------\n\n", a) + answer <- paste0(answer, a) + } + + desc[[k]] <- answer + questions[[k]] <- q + } + + res <- list( + prompt = prompt, + questions = questions, + answers = desc + ) + return(res) +} diff --git a/R/pgx-wgcna.R b/R/pgx-wgcna.R index 89a38be3..d9ef8fed 100644 --- a/R/pgx-wgcna.R +++ b/R/pgx-wgcna.R @@ -5646,403 +5646,6 @@ wgcna.scaleTOMs <- function(TOMs, scaleP = 0.95) { return(TOMs) } -wgcna.get_modTraits <- function(wgcna) { - if (!is.null(wgcna$modTraits)) { - M <- wgcna$modTraits - } else { - M <- cor(wgcna$net$MEs, wgcna$datTraits, use = "pairwise") - } - M[is.na(M)] <- 0 - return(M) -} - -#' @export -wgcna.getTopGenesAndSets <- function(wgcna, annot=NULL, module=NULL, ntop=40, - psig = 0.05, level="gene", rename="symbol") { - - is.consensus <- "layers" %in% names(wgcna) && class(wgcna$datExpr) == "list" - is.multi <- "layers" %in% names(wgcna) && is.null(wgcna$datExpr) - - if(is.consensus) { - top <- wgcna.getConsensusTopGenesAndSets(wgcna, annot=annot, - module=module, ntop=ntop, rename=rename) - return(top) - } - - if(is.multi) { - top <- wgcna.getMultiTopGenesAndSets( - wgcna$layers, annot=annot, module=module, psig=psig, ntop=ntop, - level=level, rename=rename) - return(top) - } - - stats <- NULL - if (!"stats" %in% names(wgcna)) { - stats <- wgcna.computeGeneStats( - wgcna$net, wgcna$datExpr, wgcna$datTraits, - wgcna$svTOM - ) - } else { - stats <- wgcna$stats - } - if (!any(c("gse", "gsea") %in% names(wgcna))) { - warning("object has no enrichment results (gsea)") - } - - ## get top genes by centrality-weighted-meanFC2 - mm <- stats$moduleMembership - mm.sig <- 1 * (stats$MMPvalue <= psig) - ff <- sqrt(rowMeans(stats$foldChange**2, na.rm = TRUE)) - mm <- mm * mm.sig * ff - if (!is.null(annot)) { - annot$gene_title <- paste0(annot$gene_title, " (", annot$symbol, ")") - mm <- rename_by2(mm, annot, new_id = rename) - } - gg <- rownames(mm) - mm <- as.list(data.frame(mm)) - if (!is.null(module)) mm <- mm[which(names(mm) %in% module)] - for (i in 1:length(mm)) names(mm[[i]]) <- gg - mm <- lapply(mm, function(x) x[x != 0]) - topgenes <- lapply(mm, function(x) names(head(sort(-x), ntop))) - - ## top genesets - topsets <- NULL - if (any(c("gse", "gsea") %in% names(wgcna))) { - if (!is.null(wgcna$gsea)) ee <- wgcna$gsea - if (!is.null(wgcna$gse)) ee <- wgcna$gse - if (!is.null(module)) ee <- ee[which(names(ee) %in% module)] - topsets <- lapply(ee, function(x) head(rownames(x), ntop)) - } - - ## top correlated phenotypes - M <- wgcna.get_modTraits(wgcna) - top.pheno <- apply(M, 1, function(x) names(which(x > 0.8*max(x, na.rm=TRUE)))) - top.negpheno <- apply(M, 1, function(x) names(which(x < 0.8*min(x, na.rm=TRUE)))) - - if(level=="geneset") { - topsets <- topgenes - topgenes <- NULL - } - - list(sets = topsets, genes = topgenes, pheno = top.pheno, neg.pheno = top.negpheno) -} - -#' @export -wgcna.getMultiTopGenesAndSets <- function(multi_wgcna, annot = NULL, module = NULL, - psig = 0.05, ntop = 40, level = NULL, - rename = "symbol") { - if ("layers" %in% names(multi_wgcna)) { - multi_wgcna <- multi_wgcna$layers - } - - ## set level - nw <- length(multi_wgcna) - if (!is.null(level)) { - level <- head(rep(level, nw), nw) - } else { - level <- c("gene", "geneset")[1 + 1 * grepl("^gs|^gset|geneset", names(multi_wgcna))] - } - names(level) <- names(multi_wgcna) - - toplist <- list() - k <- names(multi_wgcna)[1] - for (k in names(multi_wgcna)) { - topk <- wgcna.getTopGenesAndSets( - multi_wgcna[[k]], - module = module, annot = annot, - ntop = ntop, psig = psig, level = level[[k]], rename = rename - ) - if (!is.null(module)) { - topk <- lapply(topk, function(s) s[which(names(s) %in% module)]) - } - toplist[[k]] <- topk - } - - top <- list() - top$genes <- lapply(toplist, function(t) t[["genes"]]) - names(top$genes) <- NULL - top$genes <- unlist(top$genes, recursive = FALSE) - - top$sets <- lapply(toplist, function(t) t[["sets"]]) - names(top$sets) <- NULL - top$sets <- unlist(top$sets, recursive = FALSE) - - top$pheno <- lapply(toplist, function(t) t[["pheno"]]) - names(top$pheno) <- NULL - top$pheno <- unlist(top$pheno, recursive = FALSE) - - top$neg.pheno <- lapply(toplist, function(t) t[["neg.pheno"]]) - names(top$neg.pheno) <- NULL - top$neg.pheno <- unlist(top$neg.pheno, recursive = FALSE) - - return(top) -} - - -#' @export -wgcna.getConsensusTopGenesAndSets <- function(cons, annot = NULL, module = NULL, ntop = 40, - level = c("gene", "geneset")[1], - rename = "symbol") { - if (!"stats" %in% names(cons)) stop("object has no stats") - if (!any(c("gse", "gsea") %in% names(cons))) { - warning("object has no enrichment results (gsea)") - } - - if (!is.null(annot)) { - annot$gene_title <- paste0(annot$gene_title, " (", annot$symbol, ")") - } - - ## get top genes (highest kME) - topgenesx <- list() - for (i in 1:length(cons$stats)) { - mm <- cons$stats[[i]]$moduleMembership - if (!is.null(annot)) { - mm <- rename_by2(mm, annot, rename) - } - gg <- rownames(mm) - mm <- as.list(data.frame(mm)) - if (!is.null(module)) mm <- mm[module] - sel.topgenes <- lapply(mm, function(x) head(order(-x), 3 * ntop)) - topgenesx[[i]] <- lapply(sel.topgenes, function(i) gg[i]) - } - - ## intersect topgenes across all datatypes - topgenes <- topgenesx[[1]] - k <- 2 - for (k in 2:length(topgenesx)) { - topgenes <- mapply(intersect, topgenes, topgenesx[[k]], SIMPLIFY = FALSE) - } - topgenes <- lapply(topgenes, head, ntop) - - if (!is.null(module)) { - sel <- intersect(names(topgenes), module) - topgenes <- topgenes[sel] - } - - ## top genesets (as symbol!) - topsets <- NULL - if (any(c("gse", "gsea") %in% names(cons))) { - if (!is.null(cons$gsea)) ee <- cons$gsea - if (!is.null(cons$gse)) ee <- cons$gse - ee <- ee[match(names(topgenes), names(ee))] - names(ee) <- names(topgenes) - topsets <- lapply(ee, function(x) head(rownames(x), ntop)) - } - - ## module traits - M <- lapply(cons$net$multiMEs, function(x) as.matrix(x$data)) - Y <- lapply(M, function(m) cons$datTraits[rownames(m),]) - R <- mapply( function(x,y) abs(cor(x,y,use="pairwise")), M, Y, SIMPLIFY=FALSE) - R <- Reduce('+', R) - top.pheno <- apply(R, 1, function(x) names(which(x > 0.8*max(x,na.rm=TRUE))), - simplify = FALSE) - top.negpheno <- apply(R, 1, function(x) names(which(x < 0.8*min(x,na.rm=TRUE))), - simplify = FALSE) - - if (level == "geneset") { - topsets <- topgenes - topgenes <- NULL - } - - list(sets = topsets, genes = topgenes, pheno = top.pheno, neg.pheno = top.negpheno) -} - -## ---------------------------------------------------------------------- -## ---------------------------------------------------------------------- -## ---------------------------------------------------------------------- - -#' @export -wgcna.describeModules <- function(wgcna, ntop=50, psig = 0.05, - annot=NULL, multi=FALSE, modules=NULL, - experiment=NULL, verbose=1, model=DEFAULT_LLM, - docstyle = "detailed summary", numpar = 2, - level = "gene") { - if (is.null(annot)) { - message("[wgcna.describeModules] WARNING. user annot table is recommended.") - } - - if (multi) { - top <- wgcna.getMultiTopGenesAndSets(wgcna, - annot = annot, ntop = ntop, - psig = psig, level = NULL, rename = "gene_title" - ) - } else { - top <- wgcna.getTopGenesAndSets(wgcna, - annot = annot, ntop = ntop, - psig = psig, level = level, rename = "gene_title" - ) - } - - if (is.null(modules)) { - modules <- union(names(top$genes), names(top$sets)) - } - - if(is.null(experiment) && !is.null(wgcna$experiment)) experiment <- wgcna$experiment - if(is.null(experiment)) experiment <- "" - ##if(!is.null(top$genes)) modules <- intersect(modules, names(top$genes)) - ##if(!is.null(top$sets)) modules <- intersect(modules, names(top$sets)) - ##modules <- intersect(modules, names(top$pheno)) - - if (length(modules) == 0) { - info("[wgcna.describeModules] warning: empty module list!") - return(NULL) - } - - ## If no LLM is available we do just a manual summary - model <- setdiff(model, c("", NA)) - if (is.null(model) || length(model) == 0) { - desc <- list() - for(m in modules) { - ss=gg=pp=nn="" - - if(!is.null(top$genes[[m]])) { - gg <- paste( top$genes[[m]], collapse=', ') - } - if(!is.null(top$sets[[m]])) { - ss <- paste( sub(".*:","",top$sets[[m]]), collapse='; ') - } - if(m %in% names(top$pheno)) { - pp <- paste( top$pheno[[m]], collapse='; ') - } - if(m %in% names(top$neg.pheno)) { - nn <- paste( top$neg.pheno[[m]], collapse='; ') - } - d <- "" - if(!is.null(pp)) d <- paste(d, "**Positively correlated phenotypes**:", pp, "\n\n") - if(!is.null(nn)) d <- paste(d, "**Negatively correlated phenotypes**:", nn, "\n\n") - if(!is.null(gg) && gg!="") { - d <- paste(d, "**Key genes**:", gg, "\n\n") - } - if (!is.null(ss) && ss != "") { - d <- paste(d, "**Top enriched gene sets**:", ss, "\n\n") - } - desc[[m]] <- d - } - - res <- list( - prompt = NULL, - questions = NULL, - answers = desc - ) - return(res) - } - - prompt <- paste("Give a", docstyle, "of the main overall biological function of the following top enriched genesets belonging to module . After that, shortly discuss if any of these key genes/proteins/metabolites might be involved in the biological function. No need to mention all, just a few. Discuss the possible relationship with phenotypes of this experiment about \"\". Use maximum", numpar, "paragraphs. Use prose, do not use any bullet points or tables. \n\nHere is list of enriched gene sets:\n \n\n") - - prompt <- paste("These are part of the results of a WGCNA analysis of an experiment about \"\". Give a", docstyle, "of the main overall biological function of the following top enriched genesets belonging to module . Discuss the possible relationship with positively correlated phenotypes and, if not obvious, negatively correlated phenotypes . Use maximum", numpar, "paragraphs. Do not use any bullet points. \n\nHere is list of enriched gene sets: \n") - - if (verbose > 1) cat(prompt) - - desc <- list() - questions <- list() - for (k in modules) { - if (verbose > 0) message("Describing module ", k) - - ss=gg=pp=nn="" - if(length(top$sets[[k]])>0) { - ss <- sub( ".*:","", top$sets[[k]] ) ## strip prefix - ss <- paste(ss, collapse=';') - } else { - ss <- "[no significant genesets]" - } - - if (k %in% names(top$pheno)) { - pp <- paste0("'", top$pheno[[k]], "'") - pp <- paste(pp, collapse = ";") - } - if(k %in% names(top$neg.pheno)) { - nn <- paste0("'",top$neg.pheno[[k]],"'") - nn <- paste( nn, collapse=';') - } - - q <- prompt - - if(length(top$genes[[k]])>0) { - gg <- paste( top$genes[[k]], collapse=';') - ## strongly discourage use of gene in other modules - q <- paste(q, "\nHere is the list of key genes/proteins/metabolites, or so-called 'features'. Only use features that are in this list in your answer. Do not mention features not in this list. : \n") - } - - q <- sub("", k, q) - q <- sub("", pp, q) - q <- sub("", nn, q) - q <- sub("", experiment, q) - q <- sub("", ss, q) - q <- sub("", gg, q) - - answer <- "" - for (m in model) { - if (verbose > 0) message(" ...asking LLM model ", m) - a <- ai.ask(q, model = m) - a <- paste0(a, "\n\n[AI generated using ", m, "]\n") - if (length(model) > 1) a <- paste0("\n-------------------------------\n\n", a) - answer <- paste0(answer, a) - } - - desc[[k]] <- answer - questions[[k]] <- q - } - - res <- list( - prompt = prompt, - questions = questions, - answers = desc - ) - return(res) -} - -#' @export -wgcna.getTopModules <- function(wgcna, topratio=0.85, kx=NULL, rm.grey=TRUE, - psig=0.05, minrho=0.1, multi=NULL) { - - if(!is.null(kx)) dbg("[wgcna.getTopModules] WARNING: kx parameter is deprecated") - - if(is.null(topratio)) topratio <- 0.85 - if(is.null(multi) && !is.null(wgcna$layers)) multi <- TRUE - if(is.null(multi)) multi <- FALSE - if(!multi) { - ww <- list(gx = wgcna) ## single-omics wgcna object - } else if(!is.null(wgcna$layers)) { - ww <- wgcna$layers - } else { - ww <- wgcna - } - - ## compute module-trait correlation and p-value - R <- list() - P <- list() - i=1 - for(i in 1:length(ww)) { - me <- ww[[i]]$net$MEs - dt <- ww[[i]]$datTraits - R1 <- cor(me, dt, use="pairwise") - ndim <- colSums(!is.na(dt)) - P1 <- sapply(1:ncol(dt), function(j) cor.pvalue(R1[,j],ndim[j])) - colnames(P1) <- colnames(dt) - R[[i]] <- R1 - P[[i]] <- P1 - } - - ## As top modules, we take all modules that are significantly - ## correlated with at least one phenotype - top.modules <- c() - i=1 - for(i in 1:length(R)) { - idx1 <- which(rowSums(P[[i]] <= psig) > 0) - rmax <- topratio * pmax(apply(R[[i]],2,max,na.rm=TRUE),0) - rmax <- pmax(rmax, minrho) - idx2 <- which(colSums(t(R[[i]]) >= rmax)>0) - idx <- setdiff(unique(c(idx1,idx2)),0) - tt <- rownames(R[[i]])[idx] - top.modules <- c(top.modules, tt) - } - - if (rm.grey) { - sel.grey <- grepl("[A-Z]{2}grey$", top.modules) - top.modules <- top.modules[!sel.grey] - } - top.modules -} #' Create report #' diff --git a/R/utils.R b/R/utils.R index 3be246ec..834ab713 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,3 +1,8 @@ +## Null-coalescing operator. Returns `a` if not NULL, else `b`. +## Mirrors base R 4.4+ `%||%` for backward compatibility (playbase Depends R >= 3.5). +`%||%` <- function(a, b) if (is.null(a)) b else a + + mem.vmrss <- function(digits = 0) { mem <- "[? MB]" if (Sys.info()["sysname"] %in% c("Linux")) { diff --git a/man/compute_extra.Rd b/man/compute_extra.Rd index 5496d322..1be8e472 100644 --- a/man/compute_extra.Rd +++ b/man/compute_extra.Rd @@ -7,7 +7,7 @@ compute_extra( pgx, extra = c("meta.go", "infer", "deconv", "drugs", "connectivity", "wordcloud", "wgcna", - "mofa"), + "wgcna_mox", "mofa"), sigdb = NULL, pgx.dir = "./data", libx.dir = "./libx", diff --git a/man/drugs.analysisInfo.Rd b/man/drugs.analysisInfo.Rd new file mode 100644 index 00000000..a319f804 --- /dev/null +++ b/man/drugs.analysisInfo.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pgx-drugs-report.R +\name{drugs.analysisInfo} +\alias{drugs.analysisInfo} +\title{NES direction semantics for a drug connectivity analysis type} +\usage{ +drugs.analysisInfo(analysis_type) +} +\arguments{ +\item{analysis_type}{Character. Name of the analysis backing the drug +enrichment result, e.g. \code{"L1000/activity"}, \code{"L1000/gene"}, +\code{"CTRPv2/sensitivity"}, \code{"GDSC/sensitivity"}.} +} +\value{ +A list with two character scalars: + \describe{ + \item{\code{analysis_type}}{the input value (verbatim, or \code{"unknown"}).} + \item{\code{analysis_type_description}}{a paragraph describing the + data source and the direction semantics of NES for that backend.} + } +} +\description{ +Returns a deterministic, human-readable description of a + drug connectivity analysis type, including the meaning of positive vs. + negative NES. L1000-based analyses interpret negative NES as opposition + (reversal candidate); pharmacological sensitivity analyses (CTRPv2, + GDSC) interpret positive NES as predicted drug vulnerability. +} +\details{ +The returned description is intended for inclusion in AI report + prompts so the language model interprets NES sign correctly for the + selected backend. Matching is case-insensitive. +} diff --git a/man/drugs.enrichmentTable.Rd b/man/drugs.enrichmentTable.Rd new file mode 100644 index 00000000..8dca0c22 --- /dev/null +++ b/man/drugs.enrichmentTable.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pgx-drugs-report.R +\name{drugs.enrichmentTable} +\alias{drugs.enrichmentTable} +\title{Build a drug enrichment (DSEA) table for one contrast/method} +\usage{ +drugs.enrichmentTable( + pgx, + method, + contrast, + annot = NULL, + only_annotated = FALSE +) +} +\arguments{ +\item{pgx}{A pgx object holding pre-computed drug enrichment matrices in +\code{pgx$drugs[[method]]} (X = NES, P = pvalue, Q = qvalue, annot).} + +\item{method}{Character. Name of the entry in \code{pgx$drugs} to read.} + +\item{contrast}{Character. Contrast name (column of \code{pgx$drugs[[method]]$X}).} + +\item{annot}{Optional data.frame of drug annotations with at least the +columns \code{moa} and \code{target} and row names matching drug names. +If \code{NULL}, the function falls back to +\code{pgx$drugs[[method]]$annot}.} + +\item{only_annotated}{Logical. If \code{TRUE}, drop drugs that have empty +MOA and target after the join.} +} +\value{ +A data.frame ordered by descending \code{|NES|}, with columns + \code{drug}, \code{NES}, \code{pval}, \code{padj}, \code{moa}, \code{target}. + Returns \code{NULL} if the inputs are missing or the contrast does not + exist. +} +\description{ +Extracts a single-contrast drug enrichment table from + pre-computed \code{pgx$drugs[[method]]} matrices, joining drug-level + MOA and target annotations when available. This is the input + expected by \code{\link{drugs.moaEnrichment}}. +} +\details{ +Compared to \code{\link{pgx.getDrugConnectivityTable}}, this + function accepts an explicit \code{annot} argument so callers can + supply annotations from an external source when + \code{pgx$drugs[[method]]$annot} is missing. NES/pval/padj are + rounded to four digits; missing NES are coerced to 0 and missing + p/q-values to 1. +} diff --git a/man/drugs.moaEnrichment.Rd b/man/drugs.moaEnrichment.Rd new file mode 100644 index 00000000..f6e2791e --- /dev/null +++ b/man/drugs.moaEnrichment.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pgx-drugs-report.R +\name{drugs.moaEnrichment} +\alias{drugs.moaEnrichment} +\title{On-the-fly MOA / target enrichment via fgsea} +\usage{ +drugs.moaEnrichment(dsea_table, field = c("moa", "target"), nperm = 5000) +} +\arguments{ +\item{dsea_table}{A data.frame produced by \code{\link{drugs.enrichmentTable}}, +with at least the columns \code{NES}, \code{moa}, \code{target} and +drug names as row names.} + +\item{field}{One of \code{"moa"} or \code{"target"}. Selects the +annotation column to tokenise into pseudo gene sets.} + +\item{nperm}{Integer. Number of fgsea permutations.} +} +\value{ +A data.frame of fgsea results (\code{pathway}, \code{NES}, + \code{pval}, \code{padj}, \code{size}, ...) ordered by descending + \code{|NES|}. Returns \code{NULL} if the input is empty or no + tokens are found. +} +\description{ +Computes MOA-level or target-level enrichment by treating + each annotation token as a pseudo gene set over the drug-ranked NES + vector. Unlike \code{\link{pgx.getDrugMOATable}}, which requires + \code{pgx$drugs[[method]]$moa} to be pre-populated, this function + recomputes the enrichment on demand from the drug-level table. +} +\details{ +Tokens are extracted from the selected annotation column by + splitting on \code{|}, \code{;} or \code{,}. Empty / \code{NA} / + \code{"N/A"} tokens are dropped. Each unique token defines a gene set + (the set of drugs annotated with that token), and fgsea ranks them + against the NES vector. Useful for AI report generation where the + board only has the drug-level table and needs MOA-level signal on the + fly. +} diff --git a/man/drugs.supportBucket.Rd b/man/drugs.supportBucket.Rd new file mode 100644 index 00000000..24e46a49 --- /dev/null +++ b/man/drugs.supportBucket.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pgx-drugs-report.R +\name{drugs.supportBucket} +\alias{drugs.supportBucket} +\title{Classify a result by significance support bucket} +\usage{ +drugs.supportBucket(padj, pval = NA_real_) +} +\arguments{ +\item{padj}{Numeric. Adjusted p-value (q-value) of one result.} + +\item{pval}{Numeric. Raw p-value of the same result. Used as a +fallback when \code{padj} is \code{NA} or above 0.05.} +} +\value{ +A character scalar: \code{"significant"} when \code{padj < 0.05}, + \code{"nominal"} when only \code{pval < 0.05}, otherwise + \code{"unsupported"}. +} +\description{ +Tier classifier used to gate evidence buckets in AI report + generation: significant results drive headline claims, nominal + results are mentioned with hedging, unsupported results are + suppressed. +} diff --git a/man/pgx.update_infographics.Rd b/man/pgx.update_infographics.Rd index 6a842d5e..fe2596a6 100644 --- a/man/pgx.update_infographics.Rd +++ b/man/pgx.update_infographics.Rd @@ -8,6 +8,7 @@ pgx.update_infographics( pgx, llm_model, img_model, + select = c("wgcna", "mofa", "cmap", "summary"), force = FALSE, progress = NULL ) diff --git a/man/plotPCSF.IGRAPH.Rd b/man/plotPCSF.IGRAPH.Rd index 1a9ce715..b9298c5b 100644 --- a/man/plotPCSF.IGRAPH.Rd +++ b/man/plotPCSF.IGRAPH.Rd @@ -17,9 +17,6 @@ plotPCSF.IGRAPH( ... ) } -\arguments{ -\item{x}{} -} \description{ Plot static igraph PCSF graph from result of pgx.computePCSF. To be called from plotPCSF. diff --git a/man/wgcna.describeModules.Rd b/man/wgcna.describeModules.Rd new file mode 100644 index 00000000..11bbac0a --- /dev/null +++ b/man/wgcna.describeModules.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pgx-wgcna-report.R +\name{wgcna.describeModules} +\alias{wgcna.describeModules} +\title{Per-module LLM description helper.} +\usage{ +wgcna.describeModules( + wgcna, + ntop = 50, + psig = 0.05, + annot = NULL, + multi = FALSE, + modules = NULL, + experiment = NULL, + verbose = 1, + model = DEFAULT_LLM, + docstyle = "detailed summary", + numpar = 2, + level = "gene" +) +} +\description{ +Used by `wgcna.create_report()` and other pre-AI-report-tab features. The +newer AI-report tab in board.wgcna does not call this; it builds its own +structured prompts via the omicsai stack. +} diff --git a/man/wgcna.ensureStats.Rd b/man/wgcna.ensureStats.Rd new file mode 100644 index 00000000..a36e5aca --- /dev/null +++ b/man/wgcna.ensureStats.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pgx-wgcna-report.R +\name{wgcna.ensureStats} +\alias{wgcna.ensureStats} +\title{Ensure a WGCNA object has the fields needed for downstream extraction} +\usage{ +wgcna.ensureStats(wgcna) +} +\arguments{ +\item{wgcna}{WGCNA result object.} +} +\value{ +The (possibly mutated) `wgcna` object. +} +\description{ +Older PGX files sometimes ship without `wgcna$stats` or with empty +`wgcna$net$labels`, which breaks `wgcna.getGeneStats()`. This helper +lazily populates both. Idempotent. +} diff --git a/man/wgcna.getConsensusTopGenesAndSets.Rd b/man/wgcna.getConsensusTopGenesAndSets.Rd new file mode 100644 index 00000000..e10de2df --- /dev/null +++ b/man/wgcna.getConsensusTopGenesAndSets.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pgx-wgcna-report.R +\name{wgcna.getConsensusTopGenesAndSets} +\alias{wgcna.getConsensusTopGenesAndSets} +\title{Top genes/sets/phenotypes for a consensus WGCNA object.} +\usage{ +wgcna.getConsensusTopGenesAndSets( + cons, + annot = NULL, + module = NULL, + ntop = 40, + level = c("gene", "geneset")[1], + rename = "symbol" +) +} +\description{ +Top genes/sets/phenotypes for a consensus WGCNA object. +} diff --git a/man/wgcna.getMultiTopGenesAndSets.Rd b/man/wgcna.getMultiTopGenesAndSets.Rd new file mode 100644 index 00000000..c2ebb399 --- /dev/null +++ b/man/wgcna.getMultiTopGenesAndSets.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pgx-wgcna-report.R +\name{wgcna.getMultiTopGenesAndSets} +\alias{wgcna.getMultiTopGenesAndSets} +\title{Top genes/sets/phenotypes for a multi-omics WGCNA object (layered).} +\usage{ +wgcna.getMultiTopGenesAndSets( + multi_wgcna, + annot = NULL, + module = NULL, + psig = 0.05, + ntop = 40, + level = NULL, + rename = "symbol" +) +} +\description{ +Top genes/sets/phenotypes for a multi-omics WGCNA object (layered). +} diff --git a/man/wgcna.getTopGenesAndSets.Rd b/man/wgcna.getTopGenesAndSets.Rd new file mode 100644 index 00000000..98171d0d --- /dev/null +++ b/man/wgcna.getTopGenesAndSets.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pgx-wgcna-report.R +\name{wgcna.getTopGenesAndSets} +\alias{wgcna.getTopGenesAndSets} +\title{Top genes, sets, and phenotypes per module} +\usage{ +wgcna.getTopGenesAndSets( + wgcna, + annot = NULL, + module = NULL, + ntop = 40, + psig = 0.05, + level = "gene", + rename = "symbol" +) +} +\arguments{ +\item{wgcna}{WGCNA result object.} + +\item{annot}{Annotation table; optional but recommended.} + +\item{module}{Optional module subset.} + +\item{ntop}{Integer; top N per module.} + +\item{psig}{Numeric; p-value cutoff for moduleMembership significance.} + +\item{level}{"gene" or "geneset".} + +\item{rename}{Annotation column to use when renaming features.} +} +\value{ +list(sets, genes, pheno, neg.pheno). +} +\description{ +Dispatch is shape-based: consensus objects (with `$layers` and a list-valued +`$datExpr`) go to `wgcna.getConsensusTopGenesAndSets()`; multi-omics objects +(with `$layers` but no `$datExpr`) go to `wgcna.getMultiTopGenesAndSets()`; +everything else is treated as single-omics. +} diff --git a/man/wgcna.getTopModules.Rd b/man/wgcna.getTopModules.Rd new file mode 100644 index 00000000..50f9b743 --- /dev/null +++ b/man/wgcna.getTopModules.Rd @@ -0,0 +1,48 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pgx-wgcna-report.R +\name{wgcna.getTopModules} +\alias{wgcna.getTopModules} +\title{Select top modules with a minimum-count floor} +\usage{ +wgcna.getTopModules( + wgcna, + min_modules = 5L, + topratio = 0.85, + psig = 0.05, + minrho = 0.1, + rm.grey = TRUE, + multi = NULL +) +} +\arguments{ +\item{wgcna}{WGCNA result object (single-omics or layered).} + +\item{min_modules}{Integer; minimum number of modules to return.} + +\item{topratio}{Numeric.} + +\item{psig}{Numeric.} + +\item{minrho}{Numeric.} + +\item{rm.grey}{Logical; drop grey from the selection.} + +\item{multi}{Logical or NULL; force multi-layer dispatch.} +} +\value{ +Character vector of module names. +} +\description{ +A module qualifies if (a) it has at least one trait p-value below `psig`, or +(b) its row in the module-trait correlation matrix has at least one column +meeting `topratio * max(R)` (clipped at `minrho`). +} +\details{ +If the resulting selection has fewer than `min_modules` non-grey modules, +the selection is padded with the next-strongest non-grey modules ranked by +`rowMeans(abs(R))`. If the input has `<= min_modules` non-grey modules in +total, all non-grey modules are returned unfiltered. + +Replaces the older `wgcna.getTopModules()` variant whose `kx` argument was +deprecated; the new signature drops `kx` entirely and adds `min_modules`. +} diff --git a/man/wgcna.get_modTraits.Rd b/man/wgcna.get_modTraits.Rd new file mode 100644 index 00000000..cdd82e5c --- /dev/null +++ b/man/wgcna.get_modTraits.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pgx-wgcna-report.R +\name{wgcna.get_modTraits} +\alias{wgcna.get_modTraits} +\title{Module-trait correlation matrix} +\usage{ +wgcna.get_modTraits(wgcna) +} +\arguments{ +\item{wgcna}{WGCNA result object.} +} +\value{ +Numeric matrix (modules × traits). +} +\description{ +Returns the cached `wgcna$modTraits` matrix when present, otherwise computes +the Pearson correlation between module eigengenes (`wgcna$net$MEs`) and the +trait matrix (`wgcna$datTraits`). NA cells are filled with zero. +} diff --git a/man/wgcna.plotEigenGeneAdjacencyHeatmap.Rd b/man/wgcna.plotEigenGeneAdjacencyHeatmap.Rd index 66aff6ef..6468a800 100644 --- a/man/wgcna.plotEigenGeneAdjacencyHeatmap.Rd +++ b/man/wgcna.plotEigenGeneAdjacencyHeatmap.Rd @@ -23,7 +23,7 @@ wgcna.plotEigenGeneAdjacencyHeatmap( mar1 = c(5.6, 4.5, 1.8, 0), mar2 = c(8, 10, 4, 2), cex.lab = 0.8, - cex.text = 0.7, + cex.text = 0.6, plotDendro = TRUE, plotHeatmap = TRUE, dendro.horiz = TRUE,