Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.1.3
0.1.4
29 changes: 12 additions & 17 deletions gambitdb/SplitSpecies.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,10 @@ def split_high_diameter_species(self):
if subspecies is not None and not subspecies.empty:
# 2+ viable clusters — subspeciate
species = pd.concat([species, subspecies], ignore_index=False, sort=False)
species.loc[
species["name"] == single_species[1]["name"], "diameter"
] = 0.0
parent_name = single_species[1]["name"]
parent_count = int((genome_metadata["species"] == parent_name).sum())
species.loc[species["name"] == parent_name, "diameter"] = 0.0
species.loc[species["name"] == parent_name, "ngenomes"] = parent_count
elif subspecies is not None and subspecies.empty:
# Singleton outliers removed, 1 cluster remains — keep species with recalculated diameter
species.loc[
Expand All @@ -102,6 +103,8 @@ def split_high_diameter_species(self):
genome_metadata["species_taxid"] == single_species[0]
]
self.accessions_removed.extend(genome_accessions.index.tolist())
genome_metadata.drop(genome_accessions.index, inplace=True)
species = species[species["name"] != single_species[1]["name"]]

return species, genome_metadata, self.accessions_removed

Expand Down Expand Up @@ -183,11 +186,11 @@ def split_single_high_diameter_species_into_subspecies(
"All clusters are singletons for species %s, removing entirely",
single_species[1]["name"]
)
self.save_small_clusters_accessions_removed(small_clusters, single_species)
self.save_small_clusters_accessions_removed(small_clusters, single_species, genome_metadata)
return None, genome_metadata, single_species

# Always record singleton accessions as removed
self.save_small_clusters_accessions_removed(small_clusters, single_species)
self.save_small_clusters_accessions_removed(small_clusters, single_species, genome_metadata)

if num_clusters == 1:
# Single viable cluster remains after removing singletons.
Expand Down Expand Up @@ -317,22 +320,14 @@ def create_subspecies_from_clusters(

return subspecies, genome_metadata, single_species

def save_small_clusters_accessions_removed(self, small_clusters, single_species):
def save_small_clusters_accessions_removed(self, small_clusters, single_species, genome_metadata):
"""
Saves the accessions of small clusters to a file.
Args:
small_clusters (DataFrame): A DataFrame containing the small clusters.
single_species (DataFrame): A DataFrame containing the single species.
Returns:
None
Side Effects:
Updates the accessions_removed attribute.
Examples:
>>> save_small_clusters_accessions_removed(small_clusters, single_species)
Records singleton/small-cluster accessions as removed and drops them from genome_metadata
so downstream counts reflect the post-removal state.
"""
# save the accessions of the small clusters to a file
small_clusters_accessions = small_clusters["assembly_accession"].tolist()
self.accessions_removed = self.accessions_removed + small_clusters_accessions
genome_metadata.drop(small_clusters_accessions, inplace=True)

self.logger.debug(
"Remove small clusters: "
Expand Down
25 changes: 23 additions & 2 deletions gambitdb/tests/SplitSpecies_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,14 @@ def test_split_species(self):
self.assertEqual(s.shape[0], 5)
self.assertEqual(g.shape[0], 13)
self.assertEqual(len(accessions_removed), 0)
# Parent species 'Yellow black' was subspeciated; its genomes are
# reassigned to the subspecies rows, so parent ngenomes must be 0.
parent_row = s[s['name'] == 'Yellow black']
self.assertEqual(len(parent_row), 1)
self.assertEqual(int(parent_row['ngenomes'].iloc[0]), 0)
self.assertEqual(float(parent_row['diameter'].iloc[0]), 0.0)
# No genomes in genome_metadata should still be labeled with the parent name.
self.assertEqual((g['species'] == 'Yellow black').sum(), 0)

def test_two_genome_high_diameter_species_removed(self):
"""
Expand All @@ -107,12 +115,15 @@ def test_two_genome_high_diameter_species_removed(self):
1, 'average', False)
s, g, accessions_removed = ss.split_high_diameter_species()
# Yellow black (2 genomes, diameter 0.9) should be removed entirely
# Remaining: Yellow white + Yellow genus = 2 species rows, diameter set to 0.0 for Yellow black
# Remaining: Yellow white + Yellow genus = 2 species rows
self.assertNotIn('Yellow black subspecies', ' '.join(s['name'].tolist()))
# GCA_1 and GCA_2 should be in accessions_removed
self.assertNotIn('Yellow black', s['name'].tolist())
# Both genomes should be removed from genome_metadata and recorded in accessions_removed
self.assertIn('GCA_1', accessions_removed)
self.assertIn('GCA_2', accessions_removed)
self.assertEqual(len(accessions_removed), 2)
self.assertNotIn('GCA_1', g.index)
self.assertNotIn('GCA_2', g.index)

def test_singleton_outliers_removed_species_kept(self):
"""
Expand Down Expand Up @@ -145,6 +156,11 @@ def test_singleton_outliers_removed_species_kept(self):
self.assertEqual(int(red_black['ngenomes'].iloc[0]), 3)
# No subspecies should have been created
self.assertNotIn('subspecies', ' '.join(s['name'].tolist()))
# Singleton outliers GCA_4 and GCA_5 should have been physically dropped
# from genome_metadata (8 original rows - 2 removed = 6).
self.assertEqual(g.shape[0], 6)
self.assertNotIn('GCA_4', g.index)
self.assertNotIn('GCA_5', g.index)

def test_all_singletons_species_removed(self):
"""
Expand Down Expand Up @@ -193,6 +209,11 @@ def test_all_singletons_species_removed(self):
self.assertIn('GCA_1', accessions_removed)
self.assertIn('GCA_2', accessions_removed)
self.assertIn('GCA_3', accessions_removed)
# Species row should be dropped entirely; genomes dropped from metadata
self.assertNotIn('All apart', s['name'].tolist())
self.assertNotIn('GCA_1', g.index)
self.assertNotIn('GCA_2', g.index)
self.assertNotIn('GCA_3', g.index)
finally:
os.unlink(pw_path)

Expand Down
5 changes: 5 additions & 0 deletions gambitdb/tests/data/gambitdb/pw-dists.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
,s2_overlap_s1,s3_partial_overlap_s1,s4_no_overlap,sample1
s2_overlap_s1,0.0000,0.0000,0.0000,0.0000
s3_partial_overlap_s1,0.0000,0.0000,0.0000,0.0000
s4_no_overlap,0.0000,0.0000,0.0000,0.0000
sample1,0.0000,0.0000,0.0000,0.0000
Binary file added gambitdb/tests/data/gambitdb/signatures.h5
Binary file not shown.
Loading