Skip to content
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 12 additions & 10 deletions R/dataProcess.R
Original file line number Diff line number Diff line change
Expand Up @@ -414,19 +414,19 @@ MSstatsSummarizeSingleLinear = function(single_protein,
}]

if (is_labeled_reference) {
single_protein[, predicted := ifelse(censored & is_labeled_ref == FALSE, predicted, NA)]
single_protein[, newABUNDANCE := ifelse(censored & is_labeled_ref == FALSE, predicted, newABUNDANCE)]
single_protein[!(censored & is_labeled_ref == FALSE), predicted := NA]
single_protein[(censored) & is_labeled_ref == FALSE,
newABUNDANCE := predicted]
} else {
single_protein[, predicted := ifelse(censored, predicted, NA)]
single_protein[, newABUNDANCE := ifelse(censored, predicted, newABUNDANCE)]
single_protein[!(censored), predicted := NA]
single_protein[(censored), newABUNDANCE := predicted]
}

survival = single_protein[, intersect(c(cols, "LABEL", "predicted"), colnames(single_protein)), with = FALSE]
Comment thread
tonywu1999 marked this conversation as resolved.
} else {
survival = single_protein[, intersect(c(cols, "LABEL"), colnames(single_protein)), with = FALSE]
survival[, predicted := NA]
}

if (all(!is.na(single_protein$ANOMALYSCORES))) {
single_protein[, weights :=
anomaly_weights_z_vec(ANOMALYSCORES),
Expand Down Expand Up @@ -569,11 +569,13 @@ MSstatsSummarizeSingleTMP = function(single_protein, impute, censored_symbol,
}

if (is_labeled_reference) {
single_protein[, predicted := ifelse(censored & is_labeled_ref == FALSE, predicted, NA)]
single_protein[, newABUNDANCE := ifelse(censored & is_labeled_ref == FALSE, predicted, newABUNDANCE)]
single_protein[!(censored & is_labeled_ref == FALSE), predicted := NA]
single_protein[(censored) & is_labeled_ref == FALSE,
newABUNDANCE := predicted]
} else {
single_protein[, predicted := ifelse(censored, predicted, NA)]
single_protein[, newABUNDANCE := ifelse(censored, predicted, newABUNDANCE)]
single_protein[!(censored), predicted := NA]
single_protein[(censored),
newABUNDANCE := predicted]
}
survival = single_protein[, intersect(c(cols, "LABEL", "predicted"), colnames(single_protein)), with = FALSE]
} else {
Comment thread
tonywu1999 marked this conversation as resolved.
Expand Down
11 changes: 6 additions & 5 deletions R/utils_checks.R
Original file line number Diff line number Diff line change
Expand Up @@ -211,9 +211,10 @@ MSstatsPrepareForDataProcess = function(input, log_base, fix_missing) {
cols = toupper(cols)
cols = intersect(c(cols, "FRACTION", "TECHREPLICATE"),
colnames(input))
input = input[, cols, with = FALSE]

input$PEPTIDE = paste(input$PEPTIDESEQUENCE,
drop_cols = setdiff(colnames(input), cols)
for (col in drop_cols) data.table::set(input, j = col, value = NULL)

input$PEPTIDE = paste(input$PEPTIDESEQUENCE,
Comment thread
mstaniak marked this conversation as resolved.
Outdated
input$PRECURSORCHARGE, sep = "_")
input$TRANSITION = paste(input$FRAGMENTION,
input$PRODUCTCHARGE, sep = "_")
Expand Down Expand Up @@ -322,8 +323,8 @@ setMethod(".checkDataValidity", "MSstatsValidated", .prepareForDataProcess)
input[, PROTEIN := factor(PROTEIN)]
input[, PEPTIDE := factor(PEPTIDE)]
input[, TRANSITION := factor(TRANSITION)]
input = input[order(LABEL, GROUP_ORIGINAL, SUBJECT_ORIGINAL,
RUN, PROTEIN, PEPTIDE, TRANSITION), ]
data.table::setorder(input, LABEL, GROUP_ORIGINAL, SUBJECT_ORIGINAL,
RUN, PROTEIN, PEPTIDE, TRANSITION)
input[, GROUP := factor(GROUP)]
input[, SUBJECT := factor(SUBJECT)]
input[, FEATURE := factor(FEATURE)]
Expand Down
26 changes: 10 additions & 16 deletions R/utils_feature_selection.R
Original file line number Diff line number Diff line change
Expand Up @@ -74,28 +74,22 @@ MSstatsSelectFeatures = function(input, method, top_n = 3, min_feature_count = 2
#' @return data.table
#' @keywords internal
.selectHighQualityFeatures = function(input, min_feature_count) {
PROTEIN = PEPTIDE = FEATURE = originalRUN = ABUNDANCE = is_censored = NULL
is_obs = log2inty = LABEL = NULL

cols = c("PROTEIN", "PEPTIDE", "FEATURE", "originalRUN", "LABEL",
"ABUNDANCE", "censored")
cols = intersect(cols, colnames(input))
input = input[, cols, with = FALSE]
if (!("censored" %in% cols)) {
input$censored = FALSE
}
data.table::setnames(input, "censored", "is_censored")
PROTEIN = PEPTIDE = FEATURE = originalRUN = ABUNDANCE = censored = NULL
is_obs = log2inty = is_censored = LABEL = NULL

has_censored = is.element("censored", colnames(input))
input = input[, list(protein = as.character(PROTEIN),
peptide = as.character(PEPTIDE),
feature = as.character(FEATURE),
run = as.character(originalRUN),
label = as.character(LABEL),
log2inty = ifelse(!(is.na(ABUNDANCE) | is_censored),
ABUNDANCE, NA),
is_censored)]
input[, is_obs := !(is.na(log2inty) | is_censored)]
log2inty = ABUNDANCE,
is_censored = if (has_censored) censored else FALSE)]
# Censored or missing intensities are not observations.
input[is_censored | is.na(log2inty), log2inty := NA]

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be is_censored & is.na(log2inty)? Because the negation of !(is.na(ABUNDANCE) | is_censored) switches the OR to an AND

input[, is_obs := !is.na(log2inty)]
input[, is_censored := NULL]

features_quality = data.table::rbindlist(lapply(split(input, input$label),
.flagUninformativeSingleLabel,
min_feature_count = min_feature_count))
Expand Down
64 changes: 41 additions & 23 deletions R/utils_normalize.R
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,8 @@ MSstatsNormalize = function(input, normalization_method, peptides_dict = NULL, s
input[, ABUNDANCE_FRACTION := median(ABUNDANCE_RUN, na.rm = TRUE),
by = "FRACTION"]
input[, ABUNDANCE := ABUNDANCE - ABUNDANCE_RUN + ABUNDANCE_FRACTION]
input = input[, !(colnames(input) %in% c("ABUNDANCE_RUN", "ABUNDANCE_FRACTION")),
with = FALSE]
data.table::set(input, j = "ABUNDANCE_RUN", value = NULL)
data.table::set(input, j = "ABUNDANCE_FRACTION", value = NULL)
getOption("MSstatsLog")("Normalization based on median: OK")
input
}
Expand Down Expand Up @@ -255,7 +255,9 @@ MSstatsNormalize = function(input, normalization_method, peptides_dict = NULL, s
input[, ABUNDANCE := ABUNDANCE - median_by_run + median_by_fraction]

getOption("MSstatsLog")("INFO", "Normalization : normalization with global standards protein - okay")
input[ , !(colnames(input) %in% c("median_by_run", "median_by_fraction")), with = FALSE]
data.table::set(input, j = "median_by_run", value = NULL)
data.table::set(input, j = "median_by_fraction", value = NULL)
input
}


Expand All @@ -282,7 +284,7 @@ MSstatsNormalize = function(input, normalization_method, peptides_dict = NULL, s
#'
MSstatsMergeFractions = function(input) {
ABUNDANCE = INTENSITY = GROUP_ORIGINAL = SUBJECT_ORIGINAL = RUN = NULL
originalRUN = FRACTION = TECHREPLICATE = tmp = merged = newRun = NULL
originalRUN = FRACTION = TECHREPLICATE = tmp = newRun = NULL
ncount = FEATURE = NULL

input[!is.na(ABUNDANCE) & ABUNDANCE < 0, "ABUNDANCE"] = 0
Expand Down Expand Up @@ -338,29 +340,45 @@ MSstatsMergeFractions = function(input) {
getOption("MSstatsLog")("ERROR", msg)
stop(msg)
} else {
match_runs[, merged := "merged"]
match_runs[, newRun := do.call(paste, c(.SD, sep = "_")),
.SDcols = c(1:3, ncol(match_runs))]
match_runs = unique(match_runs[, list(GROUP_ORIGINAL,
SUBJECT_ORIGINAL,
newRun)])

input = merge(input, match_runs,
by = c("GROUP_ORIGINAL", "SUBJECT_ORIGINAL"),
all.x = TRUE)
# dcast pivoted fractions into columns, so the fraction columns
# are everything that is not a sample-identifier column.
fraction_cols = setdiff(colnames(match_runs),
c("GROUP_ORIGINAL", "SUBJECT_ORIGINAL"))
# Use the first fraction's run as the sample's representative run.
first_fraction_run = match_runs[[fraction_cols[1]]]
# Build the merged-run name: <group>_<subject>_<run>_merged.
match_runs[, newRun := paste(GROUP_ORIGINAL, SUBJECT_ORIGINAL,
first_fraction_run, "merged",
sep = "_")]
# Reduce to a (group, subject) -> merged-run lookup table.
match_runs = match_runs[, list(GROUP_ORIGINAL, SUBJECT_ORIGINAL,
newRun)]
# For each input row, find its sample's row in the lookup.
nr_idx = match_runs[input,
on = c("GROUP_ORIGINAL", "SUBJECT_ORIGINAL"),
which = TRUE, mult = "first"]
# Write that merged-run name onto every input row.
data.table::set(input, j = "newRun",
value = match_runs$newRun[nr_idx])
# Count, per feature/fraction, the rows observed above zero.
select_fraction = input[!is.na(ABUNDANCE) & input$ABUNDANCE > 0,
list(ncount = .N),
by = c("FEATURE", "FRACTION")]
# Keep only feature/fraction combinations seen at least once.
select_fraction = select_fraction[ncount != 0]
select_fraction[, tmp := paste(FEATURE, FRACTION, sep = "_")]
input$tmp = paste(input$FEATURE, input$FRACTION, sep = "_")
input = input[tmp %in% select_fraction$tmp, ]
input$originalRUN = input$newRun
input$RUN = input$originalRUN
input$RUN = factor(input$RUN, levels = unique(input$RUN),
labels = seq_along(unique(input$RUN)))
input = input[, !(colnames(input) %in% c('tmp','newRun')),
with = FALSE]
# Mark which input rows belong to a kept combination (NA = none).
keep_idx = select_fraction[input,
on = c("FEATURE", "FRACTION"),
which = TRUE, mult = "first"]
# Drop rows whose feature/fraction was never observed.
input = input[!is.na(keep_idx)]
# The merged run replaces the original per-fraction run.
input[, originalRUN := newRun]
# Renumber RUN as a factor, one level per merged run.
input[, RUN := factor(newRun, levels = unique(newRun),
labels = seq_along(unique(newRun)))]
# Drop the temporary newRun helper column.
data.table::set(input, j = "newRun", value = NULL)
}
}
}
Expand Down
77 changes: 45 additions & 32 deletions R/utils_output.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,22 +34,31 @@
#' output = output = MSstatsSummarizationOutput(input, summarized, processed,
#' method, impute, cens)
#'
MSstatsSummarizationOutput = function(input, summarized, processed,
MSstatsSummarizationOutput = function(input, summarized, processed,
method, impute, censored_symbol) {
LABEL = TotalGroupMeasurements = GROUP = Protein = RUN = NULL

input = .finalizeInput(input, summarized, method, impute, censored_symbol)
summarized = lapply(summarized, function(x) x[[1]])
summarized = data.table::rbindlist(summarized, fill = TRUE)
if (inherits(summarized, "try-error")) {

# Summarization failure is signalled by a NULL `summarized` (the caller
# wraps the per-subplot summarization in tryCatch(error = ...) returning
# NULL). Guard before unpacking, otherwise .finalizeInput() below joins on
# an empty predicted_survival and errors.
if (is.null(summarized)) {
msg = paste("*** error : can't summarize per subplot with ",
method, ".")
getOption("MSstatsLog")("ERROR", msg)
getOption("MSstatsMsg")("ERROR", msg)
rqall = NULL
rqmodelqc = NULL
workpred = NULL
} else {
predicted_survival = data.table::rbindlist(lapply(summarized, function(x) x[[2]]),
fill = TRUE)
for (i in seq_along(summarized)) summarized[[i]][[2]] = NULL
input = .finalizeInput(input, predicted_survival, method, impute, censored_symbol)
rm(predicted_survival)
protein_summaries = lapply(summarized, function(x) x[[1]])
rm(summarized)
summarized = data.table::rbindlist(protein_summaries, fill = TRUE)
rm(protein_summaries)

input[, TotalGroupMeasurements := uniqueN(.SD),
by = c("PROTEIN", "GROUP", "LABEL"),
.SDcols = c("FEATURE", "originalRUN")]
Expand All @@ -67,10 +76,9 @@ MSstatsSummarizationOutput = function(input, summarized, processed,
by.y = c(merge_col, "PROTEIN", "LABEL"))
data.table::setnames(rqall, c("GROUP_ORIGINAL", "SUBJECT_ORIGINAL"),
c("GROUP", "SUBJECT"), skip_absent = TRUE)

rqall$GROUP = factor(as.character(rqall$GROUP))
rqall$Protein = factor(rqall$Protein)
rqmodelqc = summarized$ModelQC
}

if (is.element("RUN", colnames(rqall)) & !is.null(rqall)) {
Expand All @@ -82,18 +90,21 @@ MSstatsSummarizationOutput = function(input, summarized, processed,
"originalRUN", "censored", "INTENSITY", "ABUNDANCE",
"newABUNDANCE", "predicted", "feature_quality",
"is_outlier", "remove", "is_labeled_ref"), colnames(input))
input = input[, output_cols, with = FALSE]

drop_cols = setdiff(colnames(input), output_cols)
for (col in drop_cols) data.table::set(input, j = col, value = NULL)

if (is.element("remove", colnames(processed))) {
processed = processed[(remove),
intersect(output_cols,
processed = processed[(remove),
intersect(output_cols,
colnames(processed)), with = FALSE]
input = rbind(input, processed, fill = TRUE)
}
list(FeatureLevelData = as.data.frame(input),
ProteinLevelData = as.data.frame(rqall),
data.table::setDF(input)
if (!is.null(rqall)) data.table::setDF(rqall)
list(FeatureLevelData = input,
ProteinLevelData = rqall,
SummaryMethod = method)

}


Expand All @@ -104,9 +115,9 @@ MSstatsSummarizationOutput = function(input, summarized, processed,
#' @param impute if TRUE, censored missing values were imputed
#' @param censored_symbol censored missing value indicator
#' @keywords internal
.finalizeInput = function(input, summarized, method, impute, censored_symbol) {
.finalizeInput = function(input, predicted_survival, method, impute, censored_symbol) {
# if (method == "TMP") {
input = .finalizeTMP(input, censored_symbol, impute, summarized)
input = .finalizeTMP(input, censored_symbol, impute, predicted_survival)
# } else {
# input = .finalizeLinear(input, censored_symbol)
# }
Expand All @@ -117,21 +128,23 @@ MSstatsSummarizationOutput = function(input, summarized, processed,
#' Summary statistics for output of TMP-based summarization
#' @inheritParams .finalizeInput
#' @keywords internal
.finalizeTMP = function(input, censored_symbol, impute, summarized) {
.finalizeTMP = function(input, censored_symbol, impute, predicted_survival) {
NonMissingStats = NumMeasuredFeature = MissingPercentage = LABEL = NULL
total_features = more50missing = nonmissing_orig = censored = NULL
INTENSITY = newABUNDANCE = NumImputedFeature = NULL

survival_predictions = lapply(summarized, function(x) x[[2]])
predicted_survival = data.table::rbindlist(survival_predictions, fill = TRUE)

if (impute) {
cols = intersect(colnames(input), c("newABUNDANCE",
"cen", "RUN",
"FEATURE", "ref_covariate", "LABEL"))
input = merge(input[, colnames(input) != "newABUNDANCE", with = FALSE],
predicted_survival,
by = setdiff(cols, "newABUNDANCE"),
all.x = TRUE)
join_cols = intersect(intersect(colnames(input),
colnames(predicted_survival)),
c("cen", "RUN", "FEATURE", "ref_covariate",
"LABEL"))
data.table::set(input, j = "newABUNDANCE", value = NULL)
idx = predicted_survival[input, on = join_cols, which = TRUE,
mult = "first"]
data.table::set(input, j = "newABUNDANCE",
value = predicted_survival$newABUNDANCE[idx])
data.table::set(input, j = "predicted",
value = predicted_survival$predicted[idx])
}
input[, NonMissingStats := .getNonMissingFilterStats(.SD, censored_symbol)]
input[, NumMeasuredFeature := sum(NonMissingStats),
Expand All @@ -144,7 +157,7 @@ MSstatsSummarizationOutput = function(input, summarized, processed,
} else {
input[, nonmissing_orig := !is.na(INTENSITY)]
}
input[, nonmissing_orig := ifelse(is.na(newABUNDANCE), TRUE, nonmissing_orig)]
input[is.na(newABUNDANCE), nonmissing_orig := TRUE]
if (impute) {
input[, NumImputedFeature := sum(!nonmissing_orig),
by = c("PROTEIN", "RUN", "LABEL")]
Expand Down Expand Up @@ -175,7 +188,7 @@ MSstatsSummarizationOutput = function(input, summarized, processed,
} else {
input[, nonmissing_orig := !is.na(INTENSITY)]
}
input[, nonmissing_orig := ifelse(is.na(newABUNDANCE), TRUE, nonmissing_orig)]
input[is.na(newABUNDANCE), nonmissing_orig := TRUE]
input[, NumImputedFeature := 0]
}
input
Expand Down
Loading
Loading