diff --git a/bin/coverage_stats_dr_regions.py b/bin/coverage_stats_dr_regions.py new file mode 100644 index 00000000..66d38a75 --- /dev/null +++ b/bin/coverage_stats_dr_regions.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python3 + +import argparse +import csv +import re +from pathlib import Path + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Convert samtools bedcov output into one-row DR gene coverage TSV." + ) + parser.add_argument("--sample-name", required=True) + parser.add_argument("--bedcov", required=True, type=Path) + parser.add_argument("--output", required=True, type=Path) + parser.add_argument( + "--gene-name-column", + type=int, + default=4, + help=( + "1-based column index in the BED/bedcov file containing the gene name. " + "For tbprofiler_whov2plus_genes.bed this is column 4." + ), + ) + return parser.parse_args() + + +def clean_column_name(value: str) -> str: + value = value.strip() + value = re.sub(r"[^A-Za-z0-9_.-]+", "_", value) + value = value.strip("_") + return value or "unknown_gene" + + +def main(): + args = parse_args() + + gene_idx = args.gene_name_column - 1 + + header = ["sample"] + values = [args.sample_name] + + with args.bedcov.open() as handle: + reader = csv.reader(handle, delimiter="\t") + + for row in reader: + if not row: + continue + + if len(row) <= gene_idx: + raise ValueError( + f"Expected gene name column {args.gene_name_column}, " + f"but row only has {len(row)} columns: {row}" + ) + + bed_start = int(row[1]) + bed_stop = int(row[2]) + gene_name = clean_column_name(row[gene_idx]) + summed_depth = float(row[-1]) + column_name = f"dr_gene_{gene_name}_mean_depth" + + region_size = bed_stop - bed_start + mean_depth = summed_depth / region_size if region_size > 0 else "NA" + + header.append(f"{gene_name}_mean_depth") + values.append(mean_depth) + + with args.output.open("w", newline="") as handle: + writer = csv.writer(handle, delimiter="\t") + writer.writerow(header) + writer.writerow(values) + + +if __name__ == "__main__": + main() diff --git a/bin/generate_merged_cohort_stats.py b/bin/generate_merged_cohort_stats.py index 83e3c447..218ea1aa 100755 --- a/bin/generate_merged_cohort_stats.py +++ b/bin/generate_merged_cohort_stats.py @@ -58,9 +58,50 @@ # Reorder the columns df_joint_cohort_stats.columns = df_joint_cohort_stats.columns.str.strip() - new_cols = ['AVG_INSERT_SIZE', 'MAPPED_PERCENTAGE', 'RAW_TOTAL_SEQS', 'AVERAGE_BASE_QUALITY', 'MEAN_COVERAGE', 'SD_COVERAGE', 'MEDIAN_COVERAGE', 'MAD_COVERAGE', 'PCT_EXC_ADAPTER', 'PCT_EXC_MAPQ', 'PCT_EXC_DUPE', 'PCT_EXC_UNPAIRED', 'PCT_EXC_BASEQ', 'PCT_EXC_OVERLAP', 'PCT_EXC_CAPPED', 'PCT_EXC_TOTAL', 'PCT_1X', 'PCT_5X', 'PCT_10X', 'PCT_30X', 'PCT_50X', 'PCT_100X', 'LINEAGES', 'FREQUENCIES', 'MAPPED_NTM_FRACTION_16S', 'MAPPED_NTM_FRACTION_16S_THRESHOLD_MET', 'COVERAGE_THRESHOLD_MET', 'BREADTH_OF_COVERAGE_THRESHOLD_MET', 'RELABUNDANCE_THRESHOLD_MET', 'ALL_THRESHOLDS_MET'] - df_final_cohort_stats = df_joint_cohort_stats[new_cols] + # Reorder the columns + df_joint_cohort_stats.columns = df_joint_cohort_stats.columns.str.strip() + base_cols = [ + 'AVG_INSERT_SIZE', + 'MAPPED_PERCENTAGE', + 'RAW_TOTAL_SEQS', + 'AVERAGE_BASE_QUALITY', + 'MEAN_COVERAGE', + 'SD_COVERAGE', + 'MEDIAN_COVERAGE', + 'MAD_COVERAGE', + 'PCT_EXC_ADAPTER', + 'PCT_EXC_MAPQ', + 'PCT_EXC_DUPE', + 'PCT_EXC_UNPAIRED', + 'PCT_EXC_BASEQ', + 'PCT_EXC_OVERLAP', + 'PCT_EXC_CAPPED', + 'PCT_EXC_TOTAL', + 'PCT_1X', + 'PCT_5X', + 'PCT_10X', + 'PCT_30X', + 'PCT_50X', + 'PCT_100X', + 'LINEAGES', + 'FREQUENCIES', + 'MAPPED_NTM_FRACTION_16S', + 'MAPPED_NTM_FRACTION_16S_THRESHOLD_MET', + 'COVERAGE_THRESHOLD_MET', + 'BREADTH_OF_COVERAGE_THRESHOLD_MET', + 'RELABUNDANCE_THRESHOLD_MET', + ] + + dr_cols = [ + col for col in df_joint_cohort_stats.columns + if col.startswith('dr_gene_') or col.startswith('dr_region_') + ] + + new_cols = base_cols + dr_cols + ['ALL_THRESHOLDS_MET'] + + df_final_cohort_stats = df_joint_cohort_stats[new_cols] + # Impute the NaN value after join df_final_cohort_stats['RELABUNDANCE_THRESHOLD_MET'] = df_final_cohort_stats['RELABUNDANCE_THRESHOLD_MET'].fillna(0) @@ -80,4 +121,4 @@ df_final_cohort_stats['ALL_THRESHOLDS_MET'] = df_final_cohort_stats['ALL_THRESHOLDS_MET'].replace({True: 1, False: 0}) # Write the final dataframe to file - df_final_cohort_stats.to_csv(args['output_file'], sep="\t") \ No newline at end of file + df_final_cohort_stats.to_csv(args['output_file'], sep="\t") diff --git a/bin/merge_sample_stats_dr_coverage.py b/bin/merge_sample_stats_dr_coverage.py new file mode 100644 index 00000000..3b487d24 --- /dev/null +++ b/bin/merge_sample_stats_dr_coverage.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python3 + +import argparse +from pathlib import Path + +import pandas as pd + + +SAMPLE_ID_COLUMNS = [ + "sample", + "sampleName", + "sample_name", + "sample_id", + "Sample", + "SampleName", +] + + +def parse_args(): + parser = argparse.ArgumentParser( + description=( + "Merge a one-row MAGMA sample stats TSV with a one-row " + "DR-region coverage TSV." + ) + ) + + parser.add_argument( + "--sample-name", + required=True, + help="Sample name used for validation and output naming.", + ) + parser.add_argument( + "--sample-stats", + required=True, + type=Path, + help="Input sample stats TSV.", + ) + parser.add_argument( + "--dr-coverage", + required=True, + type=Path, + help="Input DR-region coverage TSV.", + ) + parser.add_argument( + "--output", + required=True, + type=Path, + help="Output merged sample stats TSV.", + ) + + return parser.parse_args() + + +def read_one_row_tsv(path: Path, label: str) -> pd.DataFrame: + if not path.exists(): + raise FileNotFoundError(f"{label} file does not exist: {path}") + + df = pd.read_csv(path, sep="\t") + + if len(df.index) != 1: + raise ValueError( + f"Expected {label} file to contain exactly one row, " + f"but found {len(df.index)} rows: {path}" + ) + + return df + + +def main(): + args = parse_args() + + sample_stats = pd.read_csv(args.sample_stats, sep="\t", header=None) + dr_coverage = pd.read_csv(args.dr_coverage, sep="\t") + + # The DR coverage file may include a sample-identifying first column. + # Drop it before column-wise concatenation to avoid duplicate IDs. + dr_coverage = dr_coverage.drop( + columns=[col for col in SAMPLE_ID_COLUMNS if col in dr_coverage.columns] + ) + + overlapping_columns = set(sample_stats.columns).intersection(dr_coverage.columns) + if overlapping_columns: + raise ValueError( + "Refusing to merge because these DR coverage columns already exist " + "in the sample stats file: " + + ", ".join(sorted(overlapping_columns)) + ) + + merged = pd.concat( + [ + sample_stats.reset_index(drop=True), + dr_coverage.reset_index(drop=True), + ], + axis=1, + ) + + args.output.parent.mkdir(parents=True, exist_ok=True) + merged.to_csv(args.output, sep="\t", index=False, header=False) + + +if __name__ == "__main__": + main() diff --git a/default_params.config b/default_params.config index 00f55d1d..7d7b42fe 100644 --- a/default_params.config +++ b/default_params.config @@ -799,3 +799,23 @@ CLUSTERPICKER { algorithm = 'gap' } + +//----------------------- +// Processes used in DR_COVERAGE_QCs +//----------------------- + +SAMTOOLS_COVERAGE_STATS_DR_REGIONS { + results_dir = "${params.outdir}/QC_statistics/per_sample/dr_coverage/" + should_publish = true +} + +UTILS_FORMAT_DR_REGION_COVERAGE { + results_dir = "${params.outdir}/QC_statistics/per_sample/dr_coverage/" + should_publish = true +} + +UTILS_MERGE_SAMPLE_STATS_DR_COVERAGE { + results_dir = "${params.outdir}/QC_statistics/samples_thresholds/" + should_publish = false +} + diff --git a/modules/local/samtools/coverage_stats_dr_regions.nf b/modules/local/samtools/coverage_stats_dr_regions.nf new file mode 100644 index 00000000..a748cf86 --- /dev/null +++ b/modules/local/samtools/coverage_stats_dr_regions.nf @@ -0,0 +1,17 @@ +process SAMTOOLS_COVERAGE_STATS_DR_REGIONS { + tag "${sampleName}" + label 'cpu_2_memory_2' + publishDir params.results_dir, mode: params.save_mode, enabled: params.should_publish + + input: + tuple val(sampleName), path(bam), path(bai) + path regions + + output: + tuple val(sampleName), path("${sampleName}.dr_region_coverage.long.tsv"), emit: bedcov + + script: + """ + ${params.samtools_path} bedcov ${regions} ${bam} > ${sampleName}.dr_region_coverage.long.tsv + """ +} diff --git a/modules/local/utils/cohort_stats.nf b/modules/local/utils/cohort_stats.nf index 02c9c507..ad1888d2 100644 --- a/modules/local/utils/cohort_stats.nf +++ b/modules/local/utils/cohort_stats.nf @@ -29,14 +29,23 @@ process UTILS_COHORT_STATS { input: path("sample_stats/*") + path drRegionsBed output: path("*.cohort_stats.tsv") - shell: - ''' - echo -e "SAMPLE\tAVG_INSERT_SIZE\tMAPPED_PERCENTAGE\tRAW_TOTAL_SEQS\tAVERAGE_BASE_QUALITY\tMEAN_COVERAGE\tSD_COVERAGE\tMEDIAN_COVERAGE\tMAD_COVERAGE\tPCT_EXC_ADAPTER\tPCT_EXC_MAPQ\tPCT_EXC_DUPE\tPCT_EXC_UNPAIRED\tPCT_EXC_BASEQ\tPCT_EXC_OVERLAP\tPCT_EXC_CAPPED\tPCT_EXC_TOTAL\tPCT_1X\tPCT_5X\tPCT_10X\tPCT_30X\tPCT_50X\tPCT_100X\tMAPPED_NTM_FRACTION_16S\tMAPPED_NTM_FRACTION_16S_THRESHOLD_MET\tCOVERAGE_THRESHOLD_MET\tBREADTH_OF_COVERAGE_THRESHOLD_MET\tALL_THRESHOLDS_MET" > !{params.vcf_name}.cohort_stats.tsv - cat sample_stats/*tsv >> !{params.vcf_name}.cohort_stats.tsv - ''' + ''' + { + printf "SAMPLE\tAVG_INSERT_SIZE\tMAPPED_PERCENTAGE\tRAW_TOTAL_SEQS\tAVERAGE_BASE_QUALITY\tMEAN_COVERAGE\tSD_COVERAGE\tMEDIAN_COVERAGE\tMAD_COVERAGE\tPCT_EXC_ADAPTER\tPCT_EXC_MAPQ\tPCT_EXC_DUPE\tPCT_EXC_UNPAIRED\tPCT_EXC_BASEQ\tPCT_EXC_OVERLAP\tPCT_EXC_CAPPED\tPCT_EXC_TOTAL\tPCT_1X\tPCT_5X\tPCT_10X\tPCT_30X\tPCT_50X\tPCT_100X\tMAPPED_NTM_FRACTION_16S\tMAPPED_NTM_FRACTION_16S_THRESHOLD_MET\tCOVERAGE_THRESHOLD_MET\tBREADTH_OF_COVERAGE_THRESHOLD_MET\tALL_THRESHOLDS_MET"; + + awk '{ + printf "\tdr_gene_%s_mean_depth", $5 + }' !{drRegionsBed}; + + printf "\n"; + } > !{params.vcf_name}.cohort_stats.tsv + + cat sample_stats/*tsv >> !{params.vcf_name}.cohort_stats.tsv + ''' } diff --git a/modules/local/utils/format_dr_region_coverage.nf b/modules/local/utils/format_dr_region_coverage.nf new file mode 100644 index 00000000..b362ace5 --- /dev/null +++ b/modules/local/utils/format_dr_region_coverage.nf @@ -0,0 +1,20 @@ +process UTILS_FORMAT_DR_REGION_COVERAGE { + tag "${sampleName}" + label 'cpu_2_memory_2' + publishDir params.results_dir, mode: params.save_mode, enabled: params.should_publish + + input: + tuple val(sampleName), path(bedcovTsv) + + output: + tuple val(sampleName), path("${sampleName}.dr_region_coverage.tsv"), emit: coverage + + script: + """ + coverage_stats_dr_regions.py \\ + --sample-name ${sampleName} \\ + --bedcov ${bedcovTsv} \\ + --gene-name-column 5 \\ + --output ${sampleName}.dr_region_coverage.tsv + """ +} diff --git a/modules/local/utils/merge_sample_stats_dr_coverage.nf b/modules/local/utils/merge_sample_stats_dr_coverage.nf new file mode 100644 index 00000000..f4b21ce0 --- /dev/null +++ b/modules/local/utils/merge_sample_stats_dr_coverage.nf @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2021-2024 MAGMA pipeline authors, see https://doi.org/10.1371/journal.pcbi.1011648 + * + * This file is part of MAGMA pipeline, see https://github.com/TORCH-Consortium/MAGMA + * + * For quick overview of GPL-3 license, please refer + * https://www.tldrlegal.com/license/gnu-general-public-license-v3-gpl-3 + * + * - You MUST keep this license with original authors in your copy + * - You MUST acknowledge the original source of this software + * - You MUST state significant changes made to the original software + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program . If not, see . + */ + +process UTILS_MERGE_SAMPLE_STATS_DR_COVERAGE { + tag "${sampleName}" + publishDir params.results_dir, mode: params.save_mode, enabled: params.should_publish + + input: + tuple val(sampleName), path(sampleStats), path(drCoverage) + + output: + path("${sampleName}.stats.tsv") + + script: + """ + merge_sample_stats_dr_coverage.py \\ + --sample-name ${sampleName} \\ + --sample-stats ${sampleStats} \\ + --dr-coverage ${drCoverage} \\ + --output ${sampleName}.stats.tsv + """ +} diff --git a/modules/local/utils/sample_stats.nf b/modules/local/utils/sample_stats.nf index 8972ef47..003713e7 100644 --- a/modules/local/utils/sample_stats.nf +++ b/modules/local/utils/sample_stats.nf @@ -25,13 +25,13 @@ */ process UTILS_SAMPLE_STATS { tag "${sampleName}" - publishDir params.results_dir, mode: params.save_mode, enabled: params.should_publish + publishDir params.results_dir, mode: params.save_mode, enabled: false input: tuple val(sampleName), path(samtoolsStats), path(wgsMetrics), path(flagStats), path(ntmFraction) output: - path("*.stats.tsv") + tuple val(sampleName), path("*.stats.tsv"), emit: stats script: """ @@ -45,5 +45,4 @@ process UTILS_SAMPLE_STATS { --cutoff_breadth_of_coverage ${params.cutoff_breadth_of_coverage} \\ --cutoff_ntm_fraction ${params.cutoff_ntm_fraction} """ - } diff --git a/resources/regions/WHO_Tier1_Tier2_DR.bed b/resources/regions/WHO_Tier1_Tier2_DR.bed new file mode 100644 index 00000000..48dd914f --- /dev/null +++ b/resources/regions/WHO_Tier1_Tier2_DR.bed @@ -0,0 +1,49 @@ +NC-000962-3-H37Rv 5239 7267 +NC-000962-3-H37Rv 7301 9818 +NC-000962-3-H37Rv 490782 491793 +NC-000962-3-H37Rv 575347 576790 +NC-000962-3-H37Rv 619890 620865 +NC-000962-3-H37Rv 759806 763325 +NC-000962-3-H37Rv 763369 767320 +NC-000962-3-H37Rv 775585 778480 +NC-000962-3-H37Rv 778476 778905 +NC-000962-3-H37Rv 778989 779487 +NC-000962-3-H37Rv 781559 781934 +NC-000962-3-H37Rv 800808 801462 +NC-000962-3-H37Rv 1302930 1305501 +NC-000962-3-H37Rv 1406080 1407340 +NC-000962-3-H37Rv 1416180 1417347 +NC-000962-3-H37Rv 1461044 1461290 +NC-000962-3-H37Rv 1471845 1473382 +NC-000962-3-H37Rv 1473657 1476795 +NC-000962-3-H37Rv 1674201 1675011 +NC-000962-3-H37Rv 1917939 1918746 +NC-000962-3-H37Rv 2101650 2103042 +NC-000962-3-H37Rv 2153888 2156111 +NC-000962-3-H37Rv 2167648 2170612 +NC-000962-3-H37Rv 2221718 2223164 +NC-000962-3-H37Rv 2288680 2289241 +NC-000962-3-H37Rv 2714123 2715332 +NC-000962-3-H37Rv 2726192 2726780 +NC-000962-3-H37Rv 2859299 2860418 +NC-000962-3-H37Rv 3064514 3066191 +NC-000962-3-H37Rv 3339117 3339762 +NC-000962-3-H37Rv 3448503 3449991 +NC-000962-3-H37Rv 3474006 3475377 +NC-000962-3-H37Rv 3568400 3568679 +NC-000962-3-H37Rv 3611958 3613116 +NC-000962-3-H37Rv 3640542 3641538 +NC-000962-3-H37Rv 3641534 3642881 +NC-000962-3-H37Rv 3877463 3878507 +NC-000962-3-H37Rv 3986843 3987299 +NC-000962-3-H37Rv 4038157 4040704 +NC-000962-3-H37Rv 4043861 4044281 +NC-000962-3-H37Rv 4239862 4243147 +NC-000962-3-H37Rv 4243232 4246517 +NC-000962-3-H37Rv 4246513 4249810 +NC-000962-3-H37Rv 4266952 4268836 +NC-000962-3-H37Rv 4268924 4269833 +NC-000962-3-H37Rv 4326003 4327473 +NC-000962-3-H37Rv 4327548 4328199 +NC-000962-3-H37Rv 4338170 4338521 +NC-000962-3-H37Rv 4407527 4408202 diff --git a/resources/regions/tbprofiler_whov2plus_genes.bed b/resources/regions/tbprofiler_whov2plus_genes.bed new file mode 100644 index 00000000..cb6a2186 --- /dev/null +++ b/resources/regions/tbprofiler_whov2plus_genes.bed @@ -0,0 +1,74 @@ +NC-000962-3-H37Rv 1 1524 Rv0001 dnaA isoniazid +NC-000962-3-H37Rv 4933 7267 Rv0005 gyrB levofloxacin,moxifloxacin +NC-000962-3-H37Rv 7068 9818 Rv0006 gyrA levofloxacin,moxifloxacin +NC-000962-3-H37Rv 13133 13911 Rv0010c Rv0010c isoniazid +NC-000962-3-H37Rv 490545 491793 Rv0407 fgd1 clofazimine,delamanid,pretomanid +NC-000962-3-H37Rv 574479 576790 Rv0486 mshA ethionamide,isoniazid,prothionamide +NC-000962-3-H37Rv 619500 620865 Rv0529 ccsA amikacin,capreomycin,kanamycin +NC-000962-3-H37Rv 656010 657739 Rv0565c Rv0565c ethionamide,prothionamide +NC-000962-3-H37Rv 731680 732406 Rv0635 hadA isoniazid +NC-000962-3-H37Rv 733853 734970 Rv0639 nusG rifampicin,rifapentine +NC-000962-3-H37Rv 737268 738349 Rv0643c mmaA3 isoniazid +NC-000962-3-H37Rv 759342 763325 Rv0667 rpoB rifampicin,rifapentine +NC-000962-3-H37Rv 763127 767320 Rv0668 rpoC rifampicin,rifapentine +NC-000962-3-H37Rv 775586 778680 Rv0676c mmpL5 bedaquiline,clofazimine +NC-000962-3-H37Rv 778477 779624 Rv0677c mmpS5 bedaquiline,clofazimine +NC-000962-3-H37Rv 778790 779487 Rv0678 mmpR5 bedaquiline,clofazimine +NC-000962-3-H37Rv 781126 781934 Rv0682 rpsL streptomycin +NC-000962-3-H37Rv 800106 801462 Rv0701 rplC linezolid +NC-000962-3-H37Rv 1253074 1254783 Rv1129c Rv1129c isoniazid,levofloxacin,moxifloxacin,rifampicin,rifapentine +NC-000962-3-H37Rv 1302606 1305501 Rv1173 fbiC clofazimine,delamanid,pretomanid +NC-000962-3-H37Rv 1364162 1365186 Rv1221 sigE pyrazinamide +NC-000962-3-H37Rv 1406081 1407604 Rv1258c Rv1258c isoniazid,pyrazinamide,streptomycin +NC-000962-3-H37Rv 1416181 1418048 Rv1267c embR ethambutol +NC-000962-3-H37Rv 1460802 1461290 Rv1305 atpE bedaquiline +NC-000962-3-H37Rv 1471498 1473382 EBG00000313325 rrs amikacin,capreomycin,kanamycin,streptomycin +NC-000962-3-H37Rv 1473408 1476795 EBG00000313339 rrl capreomycin,linezolid +NC-000962-3-H37Rv 1673148 1675011 Rv1484 inhA ethionamide,isoniazid,prothionamide +NC-000962-3-H37Rv 1833247 1834987 Rv1630 rpsA pyrazinamide +NC-000962-3-H37Rv 1853358 1854388 Rv1644 tsnR linezolid +NC-000962-3-H37Rv 1917506 1918746 Rv1694 tlyA capreomycin +NC-000962-3-H37Rv 2062809 2065010 Rv1819c bacA amikacin,capreomycin,kanamycin,streptomycin +NC-000962-3-H37Rv 2101651 2103337 Rv1854c ndh delamanid,ethionamide,isoniazid,prothionamide +NC-000962-3-H37Rv 2153889 2156842 Rv1908c katG isoniazid +NC-000962-3-H37Rv 2167649 2170934 Rv1918c PPE35 pyrazinamide +NC-000962-3-H37Rv 2221719 2223825 Rv1979c Rv1979c bedaquiline,clofazimine +NC-000962-3-H37Rv 2288681 2290323 Rv2043c pncA pyrazinamide +NC-000962-3-H37Rv 2517915 2519365 Rv2245 kasA isoniazid +NC-000962-3-H37Rv 2714124 2715832 Rv2416c eis amikacin,kanamycin +NC-000962-3-H37Rv 2725899 2726780 Rv2428 ahpC isoniazid +NC-000962-3-H37Rv 2746135 2747798 Rv2447c folC para-aminosalicylic_acid +NC-000962-3-H37Rv 2782366 2786169 Rv2477c Rv2477c amikacin,ethambutol,kanamycin,levofloxacin,moxifloxacin,rifampicin,rifapentine,streptomycin +NC-000962-3-H37Rv 2859300 2860640 Rv2535c pepQ bedaquiline,clofazimine +NC-000962-3-H37Rv 2986639 2987615 Rv2671 ribD para-aminosalicylic_acid +NC-000962-3-H37Rv 2995772 2996737 Rv2680 Rv2680 capreomycin +NC-000962-3-H37Rv 2996539 2998055 Rv2681 Rv2681 capreomycin +NC-000962-3-H37Rv 3064515 3067372 Rv2752c Rv2752c ethambutol,isoniazid,levofloxacin,moxifloxacin,rifampicin,rifapentine +NC-000962-3-H37Rv 3067193 3068161 Rv2754c thyX para-aminosalicylic_acid +NC-000962-3-H37Rv 3073680 3074671 Rv2764c thyA para-aminosalicylic_acid +NC-000962-3-H37Rv 3086620 3087935 Rv2780 ald cycloserine +NC-000962-3-H37Rv 3338868 3339762 Rv2983 fbiD clofazimine,delamanid,pretomanid +NC-000962-3-H37Rv 3448253 3449991 Rv3083 Rv3083 ethionamide,prothionamide +NC-000962-3-H37Rv 3568401 3569280 Rv3197A whiB7 amikacin,kanamycin,streptomycin +NC-000962-3-H37Rv 3611959 3613847 Rv3236c Rv3236c pyrazinamide +NC-000962-3-H37Rv 3623159 3625110 Rv3244c lpqB bedaquiline,rifampicin,rifapentine +NC-000962-3-H37Rv 3624910 3626860 Rv3245c mtrB bedaquiline,rifampicin,rifapentine +NC-000962-3-H37Rv 3626663 3627924 Rv3246c mtrA bedaquiline,rifampicin,rifapentine +NC-000962-3-H37Rv 3640207 3641538 Rv3261 fbiA clofazimine,delamanid,pretomanid +NC-000962-3-H37Rv 3641335 3642881 Rv3262 fbiB clofazimine,delamanid,pretomanid +NC-000962-3-H37Rv 3840194 3841548 Rv3423c alr cycloserine +NC-000962-3-H37Rv 3877464 3879240 Rv3457c rpoA rifampicin,rifapentine +NC-000962-3-H37Rv 3986612 3987299 Rv3547 ddn delamanid,pretomanid +NC-000962-3-H37Rv 4038158 4041013 Rv3596c clpC1 pyrazinamide +NC-000962-3-H37Rv 4043862 4046428 Rv3601c panD pyrazinamide +NC-000962-3-H37Rv 4138202 4140002 Rv3696c glpK ethambutol,isoniazid,levofloxacin,moxifloxacin,rifampicin,rifapentine,streptomycin +NC-000962-3-H37Rv 4237683 4243147 Rv3793 embC ethambutol +NC-000962-3-H37Rv 4242947 4246517 Rv3794 embA ethambutol +NC-000962-3-H37Rv 4246314 4249810 Rv3795 embB ethambutol +NC-000962-3-H37Rv 4266953 4269124 Rv3805c aftB ethambutol +NC-000962-3-H37Rv 4268925 4270084 Rv3806c ubiA ethambutol +NC-000962-3-H37Rv 4326004 4330174 Rv3854c ethA ethionamide,prothionamide +NC-000962-3-H37Rv 4327328 4328199 Rv3855 ethR ethionamide,prothionamide +NC-000962-3-H37Rv 4338171 4338961 Rv3862c whiB6 amikacin,capreomycin,kanamycin +NC-000962-3-H37Rv 4407528 4408481 Rv3919c gid streptomycin +NC-000962-3-H37Rv 4407528 4408481 Rv0667-RRDR-region rpoB-rrdr rifampicin,rifapentine diff --git a/subworkflows/local/sample_stats_with_dr_coverage.nf b/subworkflows/local/sample_stats_with_dr_coverage.nf new file mode 100644 index 00000000..50139336 --- /dev/null +++ b/subworkflows/local/sample_stats_with_dr_coverage.nf @@ -0,0 +1,42 @@ +/* + * Copyright ... + */ + +include { UTILS_SAMPLE_STATS as UTILS_RAW_SAMPLE_STATS } from '../../modules/local/utils/sample_stats' addParams( params.UTILS_SAMPLE_STATS ) +include { SAMTOOLS_COVERAGE_STATS_DR_REGIONS } from '../../modules/local/samtools/coverage_stats_dr_regions' addParams( params.SAMTOOLS_COVERAGE_STATS_DR_REGIONS ) +include { UTILS_FORMAT_DR_REGION_COVERAGE } from '../../modules/local/utils/format_dr_region_coverage' addParams( params.UTILS_FORMAT_DR_REGION_COVERAGE ) +include { UTILS_MERGE_SAMPLE_STATS_DR_COVERAGE } from '../../modules/local/utils/merge_sample_stats_dr_coverage' addParams( params.UTILS_MERGE_SAMPLE_STATS_DR_COVERAGE ) + +workflow UTILS_SAMPLE_STATS { + + take: + ch_sample_stats_input + ch_bam + + main: + ch_dr_regions = Channel.value( + file("${projectDir}/resources/regions/tbprofiler_whov2plus_genes.bed") + ) + + UTILS_RAW_SAMPLE_STATS(ch_sample_stats_input) + + SAMTOOLS_COVERAGE_STATS_DR_REGIONS( + ch_bam, + ch_dr_regions + ) + + UTILS_FORMAT_DR_REGION_COVERAGE( + SAMTOOLS_COVERAGE_STATS_DR_REGIONS.out.bedcov + ) + + ch_merged_input = UTILS_RAW_SAMPLE_STATS.out.stats + .join(UTILS_FORMAT_DR_REGION_COVERAGE.out.coverage) + .map { sampleName, sampleStats, drCoverage -> + tuple(sampleName, sampleStats, drCoverage) + } + +UTILS_MERGE_SAMPLE_STATS_DR_COVERAGE(ch_merged_input) + + emit: + UTILS_MERGE_SAMPLE_STATS_DR_COVERAGE.out +} diff --git a/workflows/call_wf.nf b/workflows/call_wf.nf index f628b892..ff35ab46 100644 --- a/workflows/call_wf.nf +++ b/workflows/call_wf.nf @@ -39,7 +39,7 @@ include { LOFREQ_FILTER } from "../modules/local/lofreq/filter.nf" addParams ( p include { SAMTOOLS_STATS } from "../modules/local/samtools/stats.nf" addParams ( params.SAMTOOLS_STATS ) include { GATK_COLLECT_WGS_METRICS } from "../modules/local/gatk/collect_wgs_metrics.nf" addParams ( params.GATK_COLLECT_WGS_METRICS ) include { GATK_FLAG_STAT } from "../modules/local/gatk/flag_stat.nf" addParams ( params.GATK_FLAG_STAT ) -include { UTILS_SAMPLE_STATS } from "../modules/local/utils/sample_stats.nf" addParams ( params.UTILS_SAMPLE_STATS ) +include { UTILS_SAMPLE_STATS } from '../subworkflows/local/sample_stats_with_dr_coverage' include { UTILS_COHORT_STATS } from "../modules/local/utils/cohort_stats.nf" addParams ( params.UTILS_COHORT_STATS ) include { UTILS_REFORMAT_LOFREQ } from "../modules/local/utils/reformat_lofreq.nf" addParams ( params.UTILS_REFORMAT_LOFREQ ) include { GATK_INDEX_FEATURE_FILE as GATK_INDEX_FEATURE_FILE__LOFREQ } from "../modules/local/gatk/index_feature_file.nf" addParams ( params.GATK_INDEX_FEATURE_FILE__LOFREQ ) @@ -145,24 +145,33 @@ workflow CALL_WF { //---------------------------------------------------------------------------------- // STATS //---------------------------------------------------------------------------------- - - // call_stats SAMTOOLS_STATS(recalibrated_bam_ch, params.ref_fasta) GATK_COLLECT_WGS_METRICS(recalibrated_bam_ch, params.ref_fasta) GATK_FLAG_STAT(recalibrated_bam_ch, params.ref_fasta, [params.ref_fasta_fai, params.ref_fasta_dict]) - - + sample_stats_ch = (SAMTOOLS_STATS.out) .join(GATK_COLLECT_WGS_METRICS.out) .join(GATK_FLAG_STAT.out) .join(LOFREQ_CALL__NTM.out) - //.dump(tag: "CALL_WF sample_stats_ch : ", pretty: true) - - - UTILS_SAMPLE_STATS(sample_stats_ch) - - UTILS_COHORT_STATS(UTILS_SAMPLE_STATS.out.collect()) + + dr_coverage_bam_ch = SAMTOOLS_INDEX.out.map { sampleName, bai, bam -> + tuple(sampleName, bam, bai) + } + + UTILS_SAMPLE_STATS( + sample_stats_ch, + dr_coverage_bam_ch + ) + + ch_dr_regions = Channel.value( + file("${projectDir}/resources/regions/tbprofiler_whov2plus_genes.bed") + ) + + UTILS_COHORT_STATS( + UTILS_SAMPLE_STATS.out.collect(), + ch_dr_regions + ) emit: cohort_stats_tsv = UTILS_COHORT_STATS.out