From 0f3e9fa10e11d57341350c73c9e96ad7a99e0d99 Mon Sep 17 00:00:00 2001 From: Thanh Lee Date: Tue, 19 May 2026 13:21:51 +0000 Subject: [PATCH] fix(gtdb-parser): respect --use_ncbi_taxonomy in novel-species filter The novel-species filter read the raw gtdb_taxonomy column while --use_ncbi_taxonomy rewrites only the species column. This caused genomes with NCBI-named species but GTDB placeholder taxonomy (e.g. NCBI Shigella flexneri / GTDB s__ECMA0423 sp047199055) to be dropped, ultimately yielding zero S. flexneri in the Escherichia/Shigella build. Switch the filter to read the species column so it sees post-rewrite names. Add na=False to tolerate rows missing a species call, which the species column (unlike gtdb_taxonomy) does not guarantee to populate. --- gambitdb/GtdbSpreadsheetParser.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/gambitdb/GtdbSpreadsheetParser.py b/gambitdb/GtdbSpreadsheetParser.py index f8bdd8a..bf5946d 100755 --- a/gambitdb/GtdbSpreadsheetParser.py +++ b/gambitdb/GtdbSpreadsheetParser.py @@ -156,10 +156,14 @@ def filter_input_spreadsheet(self, input_spreadsheet_df): input_spreadsheet_df = input_spreadsheet_df[input_spreadsheet_df['contig_count'] <= self.max_contigs] self.stats_contig_count = len(input_spreadsheet_df.index) - # filter spreadsheet so that if the gtdb_taxonomy column ends with ' sp' followed by digits, then remove the row + # filter spreadsheet so that if the species column ends with ' sp' followed by digits, then remove the row # These are novel species that GTDB has made up that dont exist in NCBI. + # We check 'species' (not 'gtdb_taxonomy') so that --use_ncbi_taxonomy is honoured: once + # the species column has been rewritten to NCBI names, placeholder GTDB clades (e.g. + # "ECMA0423 sp047199055") will no longer match and genuine NCBI species (e.g. "Shigella + # flexneri") won't be incorrectly dropped. if not self.include_novel_species: - input_spreadsheet_df = input_spreadsheet_df[~input_spreadsheet_df['gtdb_taxonomy'].str.contains(r' sp\d+$')] + input_spreadsheet_df = input_spreadsheet_df[~input_spreadsheet_df['species'].str.contains(r' sp\d+$', na=False)] self.stats_include_novel_species = len(input_spreadsheet_df.index) # if include_derived_samples is False then only include rows with 'none' from ncbi_genome_category