Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
69 commits
Select commit Hold shift + click to select a range
5aa5648
Create coverage_stats_dr_regions
vrennie May 19, 2026
6f38150
Update sample_stats.nf
vrennie May 19, 2026
0652909
Create sample_stats_with_dr_coverage.nf
vrennie May 19, 2026
9b7dc6a
Update sample_stats.nf
vrennie May 19, 2026
541b34f
Rename coverage_stats_dr_regions to coverage_stats_dr_regions.nf
vrennie May 19, 2026
585b051
Create merge_sample_stats_dr_coverage.py
vrennie May 19, 2026
1d85510
Create merge_sample_stats_dr_coverage.nf
vrennie May 19, 2026
66ebd07
Update call_wf.nf
vrennie May 19, 2026
d42599a
Update call_wf.nf
vrennie May 19, 2026
ccbc737
Update call_wf.nf
vrennie May 19, 2026
7859ac6
Update call_wf.nf
vrennie May 19, 2026
40d87c4
Update sample_stats.nf
vrennie May 19, 2026
0c8f8f9
Update coverage_stats_dr_regions.nf
vrennie May 19, 2026
3a6c7fe
Update sample_stats_with_dr_coverage.nf
vrennie May 19, 2026
98675e5
Update sample_stats_with_dr_coverage.nf
vrennie May 19, 2026
61aa51a
Update coverage_stats_dr_regions.nf
vrennie May 19, 2026
a86e7f0
Update coverage_stats_dr_regions.nf
vrennie May 19, 2026
b7c3710
Update sample_stats_with_dr_coverage.nf
vrennie May 19, 2026
a18c679
Update coverage_stats_dr_regions.nf
vrennie May 19, 2026
5c3be7e
Update coverage_stats_dr_regions.nf
vrennie May 19, 2026
d1970ae
Update coverage_stats_dr_regions.nf
vrennie May 19, 2026
66edc69
Update coverage_stats_dr_regions.nf
vrennie May 19, 2026
dbba54e
Create WHO_Tier1_Tier2_DR.bed
vrennie May 19, 2026
a522136
Delete resources/regions/resources/regions directory
vrennie May 19, 2026
c14738d
Create WHO_Tier1_Tier2_DR.bed
vrennie May 19, 2026
db22bc0
Update coverage_stats_dr_regions.nf
vrennie May 19, 2026
e6c9de5
Update sample_stats_with_dr_coverage.nf
vrennie May 19, 2026
e5ddf19
Update coverage_stats_dr_regions.nf
vrennie May 19, 2026
d127a0b
Update coverage_stats_dr_regions.nf
vrennie May 19, 2026
9cb0d84
Create coverage_stats_dr_regions.py
vrennie May 19, 2026
400c1c8
Update coverage_stats_dr_regions.nf
vrennie May 19, 2026
17445ed
Update coverage_stats_dr_regions.nf
vrennie May 19, 2026
15c3ec9
Update coverage_stats_dr_regions.nf
vrennie May 19, 2026
9c617db
Create format_dr_region_coverage.nf
vrennie May 19, 2026
e02d2b9
Update sample_stats_with_dr_coverage.nf
vrennie May 19, 2026
c181039
Update default_params.config
vrennie May 19, 2026
f0aa602
Update default_params.config
vrennie May 19, 2026
e01b1e4
Update default_params.config
vrennie May 19, 2026
cd830ad
Update sample_stats_with_dr_coverage.nf
vrennie May 19, 2026
ad66755
Update merge_sample_stats_dr_coverage.py
vrennie May 19, 2026
25f3e2c
Update merge_sample_stats_dr_coverage.py
vrennie May 19, 2026
009fee6
Update cohort_stats.nf
vrennie May 19, 2026
c5fbad9
Add files via upload
vrennie May 19, 2026
284cf98
Update sample_stats_with_dr_coverage.nf
vrennie May 19, 2026
1ebbcd5
Update coverage_stats_dr_regions.py
vrennie May 19, 2026
76fa6dc
Update format_dr_region_coverage.nf
vrennie May 19, 2026
92d0a5b
Update cohort_stats.nf
vrennie May 19, 2026
5e747ea
Update call_wf.nf
vrennie May 19, 2026
9279d9d
Update cohort_stats.nf
vrennie May 19, 2026
200dd18
Update cohort_stats.nf
vrennie May 19, 2026
6f77982
Update coverage_stats_dr_regions.py
vrennie May 20, 2026
2acfa45
Update coverage_stats_dr_regions.py
vrennie May 20, 2026
9999cf5
Update cohort_stats.nf
vrennie May 20, 2026
43a46d4
Update coverage_stats_dr_regions.py
vrennie May 20, 2026
2051043
Update coverage_stats_dr_regions.py
vrennie May 20, 2026
5475cab
Update cohort_stats.nf
vrennie May 20, 2026
2cac554
Update format_dr_region_coverage.nf
vrennie May 20, 2026
f8d9e0a
Update cohort_stats.nf
vrennie May 20, 2026
8d68415
Update sample_stats_with_dr_coverage.nf
vrennie May 20, 2026
ce36e58
Update cohort_stats.nf
vrennie May 20, 2026
f6550e3
Update default_params.config
vrennie May 20, 2026
bdffc7e
Update generate_merged_cohort_stats.py
vrennie May 20, 2026
c1aa979
Update generate_merged_cohort_stats.py
vrennie May 20, 2026
208b727
Update generate_merged_cohort_stats.py
vrennie May 20, 2026
079aa21
Update tbprofiler_whov2plus_genes.bed
vrennie May 20, 2026
ec745de
Update cohort_stats.nf
vrennie May 21, 2026
f997a5c
Update format_dr_region_coverage.nf
vrennie May 21, 2026
e7a6d19
Update coverage_stats_dr_regions.py
vrennie May 21, 2026
973791b
Update tbprofiler_whov2plus_genes.bed
vrennie May 21, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 75 additions & 0 deletions bin/coverage_stats_dr_regions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#!/usr/bin/env python3

import argparse
import csv
import re
from pathlib import Path


def parse_args():
parser = argparse.ArgumentParser(
description="Convert samtools bedcov output into one-row DR gene coverage TSV."
)
parser.add_argument("--sample-name", required=True)
parser.add_argument("--bedcov", required=True, type=Path)
parser.add_argument("--output", required=True, type=Path)
parser.add_argument(
"--gene-name-column",
type=int,
default=4,
help=(
"1-based column index in the BED/bedcov file containing the gene name. "
"For tbprofiler_whov2plus_genes.bed this is column 4."
),
)
return parser.parse_args()


def clean_column_name(value: str) -> str:
value = value.strip()
value = re.sub(r"[^A-Za-z0-9_.-]+", "_", value)
value = value.strip("_")
return value or "unknown_gene"


def main():
args = parse_args()

gene_idx = args.gene_name_column - 1

header = ["sample"]
values = [args.sample_name]

with args.bedcov.open() as handle:
reader = csv.reader(handle, delimiter="\t")

for row in reader:
if not row:
continue

if len(row) <= gene_idx:
raise ValueError(
f"Expected gene name column {args.gene_name_column}, "
f"but row only has {len(row)} columns: {row}"
)

bed_start = int(row[1])
bed_stop = int(row[2])
gene_name = clean_column_name(row[gene_idx])
summed_depth = float(row[-1])
column_name = f"dr_gene_{gene_name}_mean_depth"

region_size = bed_stop - bed_start
mean_depth = summed_depth / region_size if region_size > 0 else "NA"

header.append(f"{gene_name}_mean_depth")
values.append(mean_depth)

with args.output.open("w", newline="") as handle:
writer = csv.writer(handle, delimiter="\t")
writer.writerow(header)
writer.writerow(values)


if __name__ == "__main__":
main()
47 changes: 44 additions & 3 deletions bin/generate_merged_cohort_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,50 @@

# Reorder the columns
df_joint_cohort_stats.columns = df_joint_cohort_stats.columns.str.strip()
new_cols = ['AVG_INSERT_SIZE', 'MAPPED_PERCENTAGE', 'RAW_TOTAL_SEQS', 'AVERAGE_BASE_QUALITY', 'MEAN_COVERAGE', 'SD_COVERAGE', 'MEDIAN_COVERAGE', 'MAD_COVERAGE', 'PCT_EXC_ADAPTER', 'PCT_EXC_MAPQ', 'PCT_EXC_DUPE', 'PCT_EXC_UNPAIRED', 'PCT_EXC_BASEQ', 'PCT_EXC_OVERLAP', 'PCT_EXC_CAPPED', 'PCT_EXC_TOTAL', 'PCT_1X', 'PCT_5X', 'PCT_10X', 'PCT_30X', 'PCT_50X', 'PCT_100X', 'LINEAGES', 'FREQUENCIES', 'MAPPED_NTM_FRACTION_16S', 'MAPPED_NTM_FRACTION_16S_THRESHOLD_MET', 'COVERAGE_THRESHOLD_MET', 'BREADTH_OF_COVERAGE_THRESHOLD_MET', 'RELABUNDANCE_THRESHOLD_MET', 'ALL_THRESHOLDS_MET']
df_final_cohort_stats = df_joint_cohort_stats[new_cols]
# Reorder the columns
df_joint_cohort_stats.columns = df_joint_cohort_stats.columns.str.strip()

base_cols = [
'AVG_INSERT_SIZE',
'MAPPED_PERCENTAGE',
'RAW_TOTAL_SEQS',
'AVERAGE_BASE_QUALITY',
'MEAN_COVERAGE',
'SD_COVERAGE',
'MEDIAN_COVERAGE',
'MAD_COVERAGE',
'PCT_EXC_ADAPTER',
'PCT_EXC_MAPQ',
'PCT_EXC_DUPE',
'PCT_EXC_UNPAIRED',
'PCT_EXC_BASEQ',
'PCT_EXC_OVERLAP',
'PCT_EXC_CAPPED',
'PCT_EXC_TOTAL',
'PCT_1X',
'PCT_5X',
'PCT_10X',
'PCT_30X',
'PCT_50X',
'PCT_100X',
'LINEAGES',
'FREQUENCIES',
'MAPPED_NTM_FRACTION_16S',
'MAPPED_NTM_FRACTION_16S_THRESHOLD_MET',
'COVERAGE_THRESHOLD_MET',
'BREADTH_OF_COVERAGE_THRESHOLD_MET',
'RELABUNDANCE_THRESHOLD_MET',
]

dr_cols = [
col for col in df_joint_cohort_stats.columns
if col.startswith('dr_gene_') or col.startswith('dr_region_')
]

new_cols = base_cols + dr_cols + ['ALL_THRESHOLDS_MET']

df_final_cohort_stats = df_joint_cohort_stats[new_cols]

# Impute the NaN value after join
df_final_cohort_stats['RELABUNDANCE_THRESHOLD_MET'] = df_final_cohort_stats['RELABUNDANCE_THRESHOLD_MET'].fillna(0)

Expand All @@ -80,4 +121,4 @@
df_final_cohort_stats['ALL_THRESHOLDS_MET'] = df_final_cohort_stats['ALL_THRESHOLDS_MET'].replace({True: 1, False: 0})

# Write the final dataframe to file
df_final_cohort_stats.to_csv(args['output_file'], sep="\t")
df_final_cohort_stats.to_csv(args['output_file'], sep="\t")
102 changes: 102 additions & 0 deletions bin/merge_sample_stats_dr_coverage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#!/usr/bin/env python3

import argparse
from pathlib import Path

import pandas as pd


SAMPLE_ID_COLUMNS = [
"sample",
"sampleName",
"sample_name",
"sample_id",
"Sample",
"SampleName",
]


def parse_args():
parser = argparse.ArgumentParser(
description=(
"Merge a one-row MAGMA sample stats TSV with a one-row "
"DR-region coverage TSV."
)
)

parser.add_argument(
"--sample-name",
required=True,
help="Sample name used for validation and output naming.",
)
parser.add_argument(
"--sample-stats",
required=True,
type=Path,
help="Input sample stats TSV.",
)
parser.add_argument(
"--dr-coverage",
required=True,
type=Path,
help="Input DR-region coverage TSV.",
)
parser.add_argument(
"--output",
required=True,
type=Path,
help="Output merged sample stats TSV.",
)

return parser.parse_args()


def read_one_row_tsv(path: Path, label: str) -> pd.DataFrame:
if not path.exists():
raise FileNotFoundError(f"{label} file does not exist: {path}")

df = pd.read_csv(path, sep="\t")

if len(df.index) != 1:
raise ValueError(
f"Expected {label} file to contain exactly one row, "
f"but found {len(df.index)} rows: {path}"
)

return df


def main():
args = parse_args()

sample_stats = pd.read_csv(args.sample_stats, sep="\t", header=None)
dr_coverage = pd.read_csv(args.dr_coverage, sep="\t")

# The DR coverage file may include a sample-identifying first column.
# Drop it before column-wise concatenation to avoid duplicate IDs.
dr_coverage = dr_coverage.drop(
columns=[col for col in SAMPLE_ID_COLUMNS if col in dr_coverage.columns]
)

overlapping_columns = set(sample_stats.columns).intersection(dr_coverage.columns)
if overlapping_columns:
raise ValueError(
"Refusing to merge because these DR coverage columns already exist "
"in the sample stats file: "
+ ", ".join(sorted(overlapping_columns))
)

merged = pd.concat(
[
sample_stats.reset_index(drop=True),
dr_coverage.reset_index(drop=True),
],
axis=1,
)

args.output.parent.mkdir(parents=True, exist_ok=True)
merged.to_csv(args.output, sep="\t", index=False, header=False)


if __name__ == "__main__":
main()
20 changes: 20 additions & 0 deletions default_params.config
Original file line number Diff line number Diff line change
Expand Up @@ -799,3 +799,23 @@ CLUSTERPICKER {
algorithm = 'gap'

}

//-----------------------
// Processes used in DR_COVERAGE_QCs
//-----------------------

SAMTOOLS_COVERAGE_STATS_DR_REGIONS {
results_dir = "${params.outdir}/QC_statistics/per_sample/dr_coverage/"
should_publish = true
}

UTILS_FORMAT_DR_REGION_COVERAGE {
results_dir = "${params.outdir}/QC_statistics/per_sample/dr_coverage/"
should_publish = true
}

UTILS_MERGE_SAMPLE_STATS_DR_COVERAGE {
results_dir = "${params.outdir}/QC_statistics/samples_thresholds/"
should_publish = false
}

17 changes: 17 additions & 0 deletions modules/local/samtools/coverage_stats_dr_regions.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
process SAMTOOLS_COVERAGE_STATS_DR_REGIONS {
tag "${sampleName}"
label 'cpu_2_memory_2'
publishDir params.results_dir, mode: params.save_mode, enabled: params.should_publish

input:
tuple val(sampleName), path(bam), path(bai)
path regions

output:
tuple val(sampleName), path("${sampleName}.dr_region_coverage.long.tsv"), emit: bedcov

script:
"""
${params.samtools_path} bedcov ${regions} ${bam} > ${sampleName}.dr_region_coverage.long.tsv
"""
}
19 changes: 14 additions & 5 deletions modules/local/utils/cohort_stats.nf
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,23 @@ process UTILS_COHORT_STATS {

input:
path("sample_stats/*")
path drRegionsBed

output:
path("*.cohort_stats.tsv")


shell:
'''
echo -e "SAMPLE\tAVG_INSERT_SIZE\tMAPPED_PERCENTAGE\tRAW_TOTAL_SEQS\tAVERAGE_BASE_QUALITY\tMEAN_COVERAGE\tSD_COVERAGE\tMEDIAN_COVERAGE\tMAD_COVERAGE\tPCT_EXC_ADAPTER\tPCT_EXC_MAPQ\tPCT_EXC_DUPE\tPCT_EXC_UNPAIRED\tPCT_EXC_BASEQ\tPCT_EXC_OVERLAP\tPCT_EXC_CAPPED\tPCT_EXC_TOTAL\tPCT_1X\tPCT_5X\tPCT_10X\tPCT_30X\tPCT_50X\tPCT_100X\tMAPPED_NTM_FRACTION_16S\tMAPPED_NTM_FRACTION_16S_THRESHOLD_MET\tCOVERAGE_THRESHOLD_MET\tBREADTH_OF_COVERAGE_THRESHOLD_MET\tALL_THRESHOLDS_MET" > !{params.vcf_name}.cohort_stats.tsv
cat sample_stats/*tsv >> !{params.vcf_name}.cohort_stats.tsv
'''
'''
{
printf "SAMPLE\tAVG_INSERT_SIZE\tMAPPED_PERCENTAGE\tRAW_TOTAL_SEQS\tAVERAGE_BASE_QUALITY\tMEAN_COVERAGE\tSD_COVERAGE\tMEDIAN_COVERAGE\tMAD_COVERAGE\tPCT_EXC_ADAPTER\tPCT_EXC_MAPQ\tPCT_EXC_DUPE\tPCT_EXC_UNPAIRED\tPCT_EXC_BASEQ\tPCT_EXC_OVERLAP\tPCT_EXC_CAPPED\tPCT_EXC_TOTAL\tPCT_1X\tPCT_5X\tPCT_10X\tPCT_30X\tPCT_50X\tPCT_100X\tMAPPED_NTM_FRACTION_16S\tMAPPED_NTM_FRACTION_16S_THRESHOLD_MET\tCOVERAGE_THRESHOLD_MET\tBREADTH_OF_COVERAGE_THRESHOLD_MET\tALL_THRESHOLDS_MET";

awk '{
printf "\tdr_gene_%s_mean_depth", $5
}' !{drRegionsBed};

printf "\n";
} > !{params.vcf_name}.cohort_stats.tsv

cat sample_stats/*tsv >> !{params.vcf_name}.cohort_stats.tsv
'''
}
20 changes: 20 additions & 0 deletions modules/local/utils/format_dr_region_coverage.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
process UTILS_FORMAT_DR_REGION_COVERAGE {
tag "${sampleName}"
label 'cpu_2_memory_2'
publishDir params.results_dir, mode: params.save_mode, enabled: params.should_publish

input:
tuple val(sampleName), path(bedcovTsv)

output:
tuple val(sampleName), path("${sampleName}.dr_region_coverage.tsv"), emit: coverage

script:
"""
coverage_stats_dr_regions.py \\
--sample-name ${sampleName} \\
--bedcov ${bedcovTsv} \\
--gene-name-column 5 \\
--output ${sampleName}.dr_region_coverage.tsv
"""
}
45 changes: 45 additions & 0 deletions modules/local/utils/merge_sample_stats_dr_coverage.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/*
* Copyright (c) 2021-2024 MAGMA pipeline authors, see https://doi.org/10.1371/journal.pcbi.1011648
*
* This file is part of MAGMA pipeline, see https://github.com/TORCH-Consortium/MAGMA
*
* For quick overview of GPL-3 license, please refer
* https://www.tldrlegal.com/license/gnu-general-public-license-v3-gpl-3
*
* - You MUST keep this license with original authors in your copy
* - You MUST acknowledge the original source of this software
* - You MUST state significant changes made to the original software
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program . If not, see <http://www.gnu.org/licenses/>.
*/

process UTILS_MERGE_SAMPLE_STATS_DR_COVERAGE {
tag "${sampleName}"
publishDir params.results_dir, mode: params.save_mode, enabled: params.should_publish

input:
tuple val(sampleName), path(sampleStats), path(drCoverage)

output:
path("${sampleName}.stats.tsv")

script:
"""
merge_sample_stats_dr_coverage.py \\
--sample-name ${sampleName} \\
--sample-stats ${sampleStats} \\
--dr-coverage ${drCoverage} \\
--output ${sampleName}.stats.tsv
"""
}
5 changes: 2 additions & 3 deletions modules/local/utils/sample_stats.nf
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,13 @@
*/
process UTILS_SAMPLE_STATS {
tag "${sampleName}"
publishDir params.results_dir, mode: params.save_mode, enabled: params.should_publish
publishDir params.results_dir, mode: params.save_mode, enabled: false

input:
tuple val(sampleName), path(samtoolsStats), path(wgsMetrics), path(flagStats), path(ntmFraction)

output:
path("*.stats.tsv")
tuple val(sampleName), path("*.stats.tsv"), emit: stats

script:
"""
Expand All @@ -45,5 +45,4 @@ process UTILS_SAMPLE_STATS {
--cutoff_breadth_of_coverage ${params.cutoff_breadth_of_coverage} \\
--cutoff_ntm_fraction ${params.cutoff_ntm_fraction}
"""

}
Loading