TORCH-Consortium · vrennie · May 19, 2026 · May 19, 2026 · May 19, 2026 · May 19, 2026
diff --git a/bin/coverage_stats_dr_regions.py b/bin/coverage_stats_dr_regions.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python3
+
+import argparse
+import csv
+import re
+from pathlib import Path
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Convert samtools bedcov output into one-row DR gene coverage TSV."
+    )
+    parser.add_argument("--sample-name", required=True)
+    parser.add_argument("--bedcov", required=True, type=Path)
+    parser.add_argument("--output", required=True, type=Path)
+    parser.add_argument(
+        "--gene-name-column",
+        type=int,
+        default=4,
+        help=(
+            "1-based column index in the BED/bedcov file containing the gene name. "
+            "For tbprofiler_whov2plus_genes.bed this is column 4."
+        ),
+    )
+    return parser.parse_args()
+
+
+def clean_column_name(value: str) -> str:
+    value = value.strip()
+    value = re.sub(r"[^A-Za-z0-9_.-]+", "_", value)
+    value = value.strip("_")
+    return value or "unknown_gene"
+
+
+def main():
+    args = parse_args()
+
+    gene_idx = args.gene_name_column - 1
+
+    header = ["sample"]
+    values = [args.sample_name]
+
+    with args.bedcov.open() as handle:
+        reader = csv.reader(handle, delimiter="\t")
+
+        for row in reader:
+            if not row:
+                continue
+
+            if len(row) <= gene_idx:
+                raise ValueError(
+                    f"Expected gene name column {args.gene_name_column}, "
+                    f"but row only has {len(row)} columns: {row}"
+                )
+
+            bed_start = int(row[1])
+            bed_stop = int(row[2])
+            gene_name = clean_column_name(row[gene_idx])
+            summed_depth = float(row[-1])
+            column_name = f"dr_gene_{gene_name}_mean_depth"
+
+            region_size = bed_stop - bed_start
+            mean_depth = summed_depth / region_size if region_size > 0 else "NA"
+
+            header.append(f"{gene_name}_mean_depth")
+            values.append(mean_depth)
+
+    with args.output.open("w", newline="") as handle:
+        writer = csv.writer(handle, delimiter="\t")
+        writer.writerow(header)
+        writer.writerow(values)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bin/generate_merged_cohort_stats.py b/bin/generate_merged_cohort_stats.py
@@ -58,9 +58,50 @@
 
     # Reorder the columns
     df_joint_cohort_stats.columns = df_joint_cohort_stats.columns.str.strip()
-    new_cols = ['AVG_INSERT_SIZE', 'MAPPED_PERCENTAGE', 'RAW_TOTAL_SEQS', 'AVERAGE_BASE_QUALITY', 'MEAN_COVERAGE', 'SD_COVERAGE', 'MEDIAN_COVERAGE', 'MAD_COVERAGE', 'PCT_EXC_ADAPTER', 'PCT_EXC_MAPQ', 'PCT_EXC_DUPE', 'PCT_EXC_UNPAIRED', 'PCT_EXC_BASEQ', 'PCT_EXC_OVERLAP', 'PCT_EXC_CAPPED', 'PCT_EXC_TOTAL', 'PCT_1X', 'PCT_5X', 'PCT_10X', 'PCT_30X', 'PCT_50X', 'PCT_100X', 'LINEAGES', 'FREQUENCIES', 'MAPPED_NTM_FRACTION_16S', 'MAPPED_NTM_FRACTION_16S_THRESHOLD_MET', 'COVERAGE_THRESHOLD_MET', 'BREADTH_OF_COVERAGE_THRESHOLD_MET', 'RELABUNDANCE_THRESHOLD_MET', 'ALL_THRESHOLDS_MET']
-    df_final_cohort_stats = df_joint_cohort_stats[new_cols]
+    # Reorder the columns
+    df_joint_cohort_stats.columns = df_joint_cohort_stats.columns.str.strip()
 
+    base_cols = [
+        'AVG_INSERT_SIZE',
+        'MAPPED_PERCENTAGE',
+        'RAW_TOTAL_SEQS',
+        'AVERAGE_BASE_QUALITY',
+        'MEAN_COVERAGE',
+        'SD_COVERAGE',
+        'MEDIAN_COVERAGE',
+        'MAD_COVERAGE',
+        'PCT_EXC_ADAPTER',
+        'PCT_EXC_MAPQ',
+        'PCT_EXC_DUPE',
+        'PCT_EXC_UNPAIRED',
+        'PCT_EXC_BASEQ',
+        'PCT_EXC_OVERLAP',
+        'PCT_EXC_CAPPED',
+        'PCT_EXC_TOTAL',
+        'PCT_1X',
+        'PCT_5X',
+        'PCT_10X',
+        'PCT_30X',
+        'PCT_50X',
+        'PCT_100X',
+        'LINEAGES',
+        'FREQUENCIES',
+        'MAPPED_NTM_FRACTION_16S',
+        'MAPPED_NTM_FRACTION_16S_THRESHOLD_MET',
+        'COVERAGE_THRESHOLD_MET',
+        'BREADTH_OF_COVERAGE_THRESHOLD_MET',
+        'RELABUNDANCE_THRESHOLD_MET',
+    ]
+
+    dr_cols = [
+        col for col in df_joint_cohort_stats.columns
+        if col.startswith('dr_gene_') or col.startswith('dr_region_')
+    ]
+
+    new_cols = base_cols + dr_cols + ['ALL_THRESHOLDS_MET']
+
+    df_final_cohort_stats = df_joint_cohort_stats[new_cols]
+
     # Impute the NaN value after join
     df_final_cohort_stats['RELABUNDANCE_THRESHOLD_MET'] = df_final_cohort_stats['RELABUNDANCE_THRESHOLD_MET'].fillna(0)
 
@@ -80,4 +121,4 @@
     df_final_cohort_stats['ALL_THRESHOLDS_MET'] = df_final_cohort_stats['ALL_THRESHOLDS_MET'].replace({True: 1, False: 0})
 
     # Write the final dataframe to file
-    df_final_cohort_stats.to_csv(args['output_file'], sep="\t")
+    df_final_cohort_stats.to_csv(args['output_file'], sep="\t")
diff --git a/bin/merge_sample_stats_dr_coverage.py b/bin/merge_sample_stats_dr_coverage.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+
+import argparse
+from pathlib import Path
+
+import pandas as pd
+
+
+SAMPLE_ID_COLUMNS = [
+    "sample",
+    "sampleName",
+    "sample_name",
+    "sample_id",
+    "Sample",
+    "SampleName",
+]
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description=(
+            "Merge a one-row MAGMA sample stats TSV with a one-row "
+            "DR-region coverage TSV."
+        )
+    )
+
+    parser.add_argument(
+        "--sample-name",
+        required=True,
+        help="Sample name used for validation and output naming.",
+    )
+    parser.add_argument(
+        "--sample-stats",
+        required=True,
+        type=Path,
+        help="Input sample stats TSV.",
+    )
+    parser.add_argument(
+        "--dr-coverage",
+        required=True,
+        type=Path,
+        help="Input DR-region coverage TSV.",
+    )
+    parser.add_argument(
+        "--output",
+        required=True,
+        type=Path,
+        help="Output merged sample stats TSV.",
+    )
+
+    return parser.parse_args()
+
+
+def read_one_row_tsv(path: Path, label: str) -> pd.DataFrame:
+    if not path.exists():
+        raise FileNotFoundError(f"{label} file does not exist: {path}")
+
+    df = pd.read_csv(path, sep="\t")
+
+    if len(df.index) != 1:
+        raise ValueError(
+            f"Expected {label} file to contain exactly one row, "
+            f"but found {len(df.index)} rows: {path}"
+        )
+
+    return df
+
+
+def main():
+    args = parse_args()
+
+    sample_stats = pd.read_csv(args.sample_stats, sep="\t", header=None)
+    dr_coverage = pd.read_csv(args.dr_coverage, sep="\t")
+
+    # The DR coverage file may include a sample-identifying first column.
+    # Drop it before column-wise concatenation to avoid duplicate IDs.
+    dr_coverage = dr_coverage.drop(
+        columns=[col for col in SAMPLE_ID_COLUMNS if col in dr_coverage.columns]
+    )
+
+    overlapping_columns = set(sample_stats.columns).intersection(dr_coverage.columns)
+    if overlapping_columns:
+        raise ValueError(
+            "Refusing to merge because these DR coverage columns already exist "
+            "in the sample stats file: "
+            + ", ".join(sorted(overlapping_columns))
+        )
+
+    merged = pd.concat(
+        [
+            sample_stats.reset_index(drop=True),
+            dr_coverage.reset_index(drop=True),
+        ],
+        axis=1,
+    )
+
+    args.output.parent.mkdir(parents=True, exist_ok=True)
+    merged.to_csv(args.output, sep="\t", index=False, header=False)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/default_params.config b/default_params.config
@@ -799,3 +799,23 @@ CLUSTERPICKER {
     algorithm = 'gap'
 
 }
+
+//-----------------------
+// Processes used in DR_COVERAGE_QCs
+//-----------------------
+
+SAMTOOLS_COVERAGE_STATS_DR_REGIONS {
+    results_dir = "${params.outdir}/QC_statistics/per_sample/dr_coverage/"
+    should_publish = true
+}
+
+UTILS_FORMAT_DR_REGION_COVERAGE {
+    results_dir = "${params.outdir}/QC_statistics/per_sample/dr_coverage/"
+    should_publish = true
+}
+
+UTILS_MERGE_SAMPLE_STATS_DR_COVERAGE {
+    results_dir = "${params.outdir}/QC_statistics/samples_thresholds/"
+    should_publish = false
+}
+
diff --git a/modules/local/samtools/coverage_stats_dr_regions.nf b/modules/local/samtools/coverage_stats_dr_regions.nf
@@ -0,0 +1,17 @@
+process SAMTOOLS_COVERAGE_STATS_DR_REGIONS {
+    tag "${sampleName}"
+    label 'cpu_2_memory_2'
+    publishDir params.results_dir, mode: params.save_mode, enabled: params.should_publish
+
+    input:
+        tuple val(sampleName), path(bam), path(bai)
+        path regions
+
+    output:
+        tuple val(sampleName), path("${sampleName}.dr_region_coverage.long.tsv"), emit: bedcov
+
+    script:
+        """
+        ${params.samtools_path} bedcov ${regions} ${bam} > ${sampleName}.dr_region_coverage.long.tsv
+        """
+}
diff --git a/modules/local/utils/cohort_stats.nf b/modules/local/utils/cohort_stats.nf
@@ -29,14 +29,23 @@ process UTILS_COHORT_STATS {
 
     input:
         path("sample_stats/*")
+        path drRegionsBed
 
     output:
         path("*.cohort_stats.tsv")
 
-
     shell:
-        '''
-        echo -e "SAMPLE\tAVG_INSERT_SIZE\tMAPPED_PERCENTAGE\tRAW_TOTAL_SEQS\tAVERAGE_BASE_QUALITY\tMEAN_COVERAGE\tSD_COVERAGE\tMEDIAN_COVERAGE\tMAD_COVERAGE\tPCT_EXC_ADAPTER\tPCT_EXC_MAPQ\tPCT_EXC_DUPE\tPCT_EXC_UNPAIRED\tPCT_EXC_BASEQ\tPCT_EXC_OVERLAP\tPCT_EXC_CAPPED\tPCT_EXC_TOTAL\tPCT_1X\tPCT_5X\tPCT_10X\tPCT_30X\tPCT_50X\tPCT_100X\tMAPPED_NTM_FRACTION_16S\tMAPPED_NTM_FRACTION_16S_THRESHOLD_MET\tCOVERAGE_THRESHOLD_MET\tBREADTH_OF_COVERAGE_THRESHOLD_MET\tALL_THRESHOLDS_MET" > !{params.vcf_name}.cohort_stats.tsv
-        cat sample_stats/*tsv >> !{params.vcf_name}.cohort_stats.tsv
-        '''
+    '''
+    {
+        printf "SAMPLE\tAVG_INSERT_SIZE\tMAPPED_PERCENTAGE\tRAW_TOTAL_SEQS\tAVERAGE_BASE_QUALITY\tMEAN_COVERAGE\tSD_COVERAGE\tMEDIAN_COVERAGE\tMAD_COVERAGE\tPCT_EXC_ADAPTER\tPCT_EXC_MAPQ\tPCT_EXC_DUPE\tPCT_EXC_UNPAIRED\tPCT_EXC_BASEQ\tPCT_EXC_OVERLAP\tPCT_EXC_CAPPED\tPCT_EXC_TOTAL\tPCT_1X\tPCT_5X\tPCT_10X\tPCT_30X\tPCT_50X\tPCT_100X\tMAPPED_NTM_FRACTION_16S\tMAPPED_NTM_FRACTION_16S_THRESHOLD_MET\tCOVERAGE_THRESHOLD_MET\tBREADTH_OF_COVERAGE_THRESHOLD_MET\tALL_THRESHOLDS_MET";
+
+        awk '{
+            printf "\tdr_gene_%s_mean_depth", $5
+        }' !{drRegionsBed};
+
+        printf "\n";
+    } > !{params.vcf_name}.cohort_stats.tsv
+
+    cat sample_stats/*tsv >> !{params.vcf_name}.cohort_stats.tsv
+    '''
 }
diff --git a/modules/local/utils/format_dr_region_coverage.nf b/modules/local/utils/format_dr_region_coverage.nf
@@ -0,0 +1,20 @@
+process UTILS_FORMAT_DR_REGION_COVERAGE {
+    tag "${sampleName}"
+    label 'cpu_2_memory_2'
+    publishDir params.results_dir, mode: params.save_mode, enabled: params.should_publish
+
+    input:
+        tuple val(sampleName), path(bedcovTsv)
+
+    output:
+        tuple val(sampleName), path("${sampleName}.dr_region_coverage.tsv"), emit: coverage
+
+    script:
+        """
+        coverage_stats_dr_regions.py \\
+            --sample-name ${sampleName} \\
+            --bedcov ${bedcovTsv} \\
+            --gene-name-column 5 \\
+            --output ${sampleName}.dr_region_coverage.tsv
+        """
+}
diff --git a/modules/local/utils/merge_sample_stats_dr_coverage.nf b/modules/local/utils/merge_sample_stats_dr_coverage.nf
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2021-2024 MAGMA pipeline authors, see https://doi.org/10.1371/journal.pcbi.1011648
+ *
+ * This file is part of MAGMA pipeline, see https://github.com/TORCH-Consortium/MAGMA
+ *
+ * For quick overview of GPL-3 license, please refer
+ * https://www.tldrlegal.com/license/gnu-general-public-license-v3-gpl-3
+ *
+ * - You MUST keep this license with original authors in your copy
+ * - You MUST acknowledge the original source of this software
+ * - You MUST state significant changes made to the original software
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program . If not, see <http://www.gnu.org/licenses/>.
+ */
+
+process UTILS_MERGE_SAMPLE_STATS_DR_COVERAGE {
+    tag "${sampleName}"
+    publishDir params.results_dir, mode: params.save_mode, enabled: params.should_publish
+
+    input:
+        tuple val(sampleName), path(sampleStats), path(drCoverage)
+
+    output:
+        path("${sampleName}.stats.tsv")
+
+    script:
+        """
+        merge_sample_stats_dr_coverage.py \\
+            --sample-name ${sampleName} \\
+            --sample-stats ${sampleStats} \\
+            --dr-coverage ${drCoverage} \\
+            --output ${sampleName}.stats.tsv
+        """
+}
diff --git a/modules/local/utils/sample_stats.nf b/modules/local/utils/sample_stats.nf
@@ -25,13 +25,13 @@
  */
 process UTILS_SAMPLE_STATS {
     tag "${sampleName}"
-    publishDir params.results_dir, mode: params.save_mode, enabled: params.should_publish
+    publishDir params.results_dir, mode: params.save_mode, enabled: false
 
     input:
         tuple val(sampleName), path(samtoolsStats), path(wgsMetrics), path(flagStats), path(ntmFraction)
 
     output:
-        path("*.stats.tsv")
+        tuple val(sampleName), path("*.stats.tsv"), emit: stats
 
     script:
         """
@@ -45,5 +45,4 @@ process UTILS_SAMPLE_STATS {
             --cutoff_breadth_of_coverage ${params.cutoff_breadth_of_coverage} \\
             --cutoff_ntm_fraction ${params.cutoff_ntm_fraction}
         """
-
 }