From 0f3e9fa10e11d57341350c73c9e96ad7a99e0d99 Mon Sep 17 00:00:00 2001
From: Thanh Lee <thanh@cloudbinfies.com>
Date: Tue, 19 May 2026 13:21:51 +0000
Subject: [PATCH] fix(gtdb-parser): respect --use_ncbi_taxonomy in
 novel-species filter

The novel-species filter read the raw gtdb_taxonomy column while
--use_ncbi_taxonomy rewrites only the species column. This caused
genomes with NCBI-named species but GTDB placeholder taxonomy (e.g.
NCBI Shigella flexneri / GTDB s__ECMA0423 sp047199055) to be dropped,
ultimately yielding zero S. flexneri in the Escherichia/Shigella build.

Switch the filter to read the species column so it sees post-rewrite
names. Add na=False to tolerate rows missing a species call: unlike
gtdb_taxonomy, the species column is not guaranteed to be populated.
---
 gambitdb/GtdbSpreadsheetParser.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/gambitdb/GtdbSpreadsheetParser.py b/gambitdb/GtdbSpreadsheetParser.py
index f8bdd8a..bf5946d 100755
--- a/gambitdb/GtdbSpreadsheetParser.py
+++ b/gambitdb/GtdbSpreadsheetParser.py
@@ -156,10 +156,14 @@ def filter_input_spreadsheet(self, input_spreadsheet_df):
         input_spreadsheet_df = input_spreadsheet_df[input_spreadsheet_df['contig_count'] <= self.max_contigs]
         self.stats_contig_count = len(input_spreadsheet_df.index)
 
-        # filter spreadsheet so that if the gtdb_taxonomy column ends with ' sp' followed by digits, then remove the row
+        # filter spreadsheet so that if the species column ends with ' sp' followed by digits, then remove the row
         # These are novel species that GTDB has made up that dont exist in NCBI.
+        # We check 'species' (not 'gtdb_taxonomy') so that --use_ncbi_taxonomy is honoured: once
+        # the species column has been rewritten to NCBI names, placeholder GTDB clades (e.g.
+        # "ECMA0423 sp047199055") will no longer match and genuine NCBI species (e.g. "Shigella
+        # flexneri") won't be incorrectly dropped.
         if not self.include_novel_species:
-            input_spreadsheet_df = input_spreadsheet_df[~input_spreadsheet_df['gtdb_taxonomy'].str.contains(r' sp\d+$')]
+            input_spreadsheet_df = input_spreadsheet_df[~input_spreadsheet_df['species'].str.contains(r' sp\d+$', na=False)]
         self.stats_include_novel_species =  len(input_spreadsheet_df.index)
 
         # if include_derived_samples is False then only include rows with 'none' from ncbi_genome_category