diff --git a/VERSION b/VERSION index b1e80bb..845639e 100755 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.1.3 +0.1.4 diff --git a/gambitdb/SplitSpecies.py b/gambitdb/SplitSpecies.py index 37c56a9..511069b 100755 --- a/gambitdb/SplitSpecies.py +++ b/gambitdb/SplitSpecies.py @@ -85,9 +85,10 @@ def split_high_diameter_species(self): if subspecies is not None and not subspecies.empty: # 2+ viable clusters — subspeciate species = pd.concat([species, subspecies], ignore_index=False, sort=False) - species.loc[ - species["name"] == single_species[1]["name"], "diameter" - ] = 0.0 + parent_name = single_species[1]["name"] + parent_count = int((genome_metadata["species"] == parent_name).sum()) + species.loc[species["name"] == parent_name, "diameter"] = 0.0 + species.loc[species["name"] == parent_name, "ngenomes"] = parent_count elif subspecies is not None and subspecies.empty: # Singleton outliers removed, 1 cluster remains — keep species with recalculated diameter species.loc[ @@ -102,6 +103,8 @@ def split_high_diameter_species(self): genome_metadata["species_taxid"] == single_species[0] ] self.accessions_removed.extend(genome_accessions.index.tolist()) + genome_metadata.drop(genome_accessions.index, inplace=True) + species = species[species["name"] != single_species[1]["name"]] return species, genome_metadata, self.accessions_removed @@ -183,11 +186,11 @@ def split_single_high_diameter_species_into_subspecies( "All clusters are singletons for species %s, removing entirely", single_species[1]["name"] ) - self.save_small_clusters_accessions_removed(small_clusters, single_species) + self.save_small_clusters_accessions_removed(small_clusters, single_species, genome_metadata) return None, genome_metadata, single_species # Always record singleton accessions as removed - self.save_small_clusters_accessions_removed(small_clusters, single_species) + self.save_small_clusters_accessions_removed(small_clusters, single_species, genome_metadata) if num_clusters == 1: # Single viable cluster remains after removing singletons. @@ -317,22 +320,14 @@ def create_subspecies_from_clusters( return subspecies, genome_metadata, single_species - def save_small_clusters_accessions_removed(self, small_clusters, single_species): + def save_small_clusters_accessions_removed(self, small_clusters, single_species, genome_metadata): """ - Saves the accessions of small clusters to a file. - Args: - small_clusters (DataFrame): A DataFrame containing the small clusters. - single_species (DataFrame): A DataFrame containing the single species. - Returns: - None - Side Effects: - Updates the accessions_removed attribute. - Examples: - >>> save_small_clusters_accessions_removed(small_clusters, single_species) + Records singleton/small-cluster accessions as removed and drops them from genome_metadata + so downstream counts reflect the post-removal state. """ - # save the accessions of the small clusters to a file small_clusters_accessions = small_clusters["assembly_accession"].tolist() self.accessions_removed = self.accessions_removed + small_clusters_accessions + genome_metadata.drop(small_clusters_accessions, inplace=True) self.logger.debug( "Remove small clusters: " diff --git a/gambitdb/tests/SplitSpecies_test.py b/gambitdb/tests/SplitSpecies_test.py index 7c5b49c..502dd78 100755 --- a/gambitdb/tests/SplitSpecies_test.py +++ b/gambitdb/tests/SplitSpecies_test.py @@ -91,6 +91,14 @@ def test_split_species(self): self.assertEqual(s.shape[0], 5) self.assertEqual(g.shape[0], 13) self.assertEqual(len(accessions_removed), 0) + # Parent species 'Yellow black' was subspeciated; its genomes are + # reassigned to the subspecies rows, so parent ngenomes must be 0. + parent_row = s[s['name'] == 'Yellow black'] + self.assertEqual(len(parent_row), 1) + self.assertEqual(int(parent_row['ngenomes'].iloc[0]), 0) + self.assertEqual(float(parent_row['diameter'].iloc[0]), 0.0) + # No genomes in genome_metadata should still be labeled with the parent name. + self.assertEqual((g['species'] == 'Yellow black').sum(), 0) def test_two_genome_high_diameter_species_removed(self): """ @@ -107,12 +115,15 @@ def test_two_genome_high_diameter_species_removed(self): 1, 'average', False) s, g, accessions_removed = ss.split_high_diameter_species() # Yellow black (2 genomes, diameter 0.9) should be removed entirely - # Remaining: Yellow white + Yellow genus = 2 species rows, diameter set to 0.0 for Yellow black + # Remaining: Yellow white + Yellow genus = 2 species rows self.assertNotIn('Yellow black subspecies', ' '.join(s['name'].tolist())) - # GCA_1 and GCA_2 should be in accessions_removed + self.assertNotIn('Yellow black', s['name'].tolist()) + # Both genomes should be removed from genome_metadata and recorded in accessions_removed self.assertIn('GCA_1', accessions_removed) self.assertIn('GCA_2', accessions_removed) self.assertEqual(len(accessions_removed), 2) + self.assertNotIn('GCA_1', g.index) + self.assertNotIn('GCA_2', g.index) def test_singleton_outliers_removed_species_kept(self): """ @@ -145,6 +156,11 @@ def test_singleton_outliers_removed_species_kept(self): self.assertEqual(int(red_black['ngenomes'].iloc[0]), 3) # No subspecies should have been created self.assertNotIn('subspecies', ' '.join(s['name'].tolist())) + # Singleton outliers GCA_4 and GCA_5 should have been physically dropped + # from genome_metadata (8 original rows - 2 removed = 6). + self.assertEqual(g.shape[0], 6) + self.assertNotIn('GCA_4', g.index) + self.assertNotIn('GCA_5', g.index) def test_all_singletons_species_removed(self): """ @@ -193,6 +209,11 @@ def test_all_singletons_species_removed(self): self.assertIn('GCA_1', accessions_removed) self.assertIn('GCA_2', accessions_removed) self.assertIn('GCA_3', accessions_removed) + # Species row should be dropped entirely; genomes dropped from metadata + self.assertNotIn('All apart', s['name'].tolist()) + self.assertNotIn('GCA_1', g.index) + self.assertNotIn('GCA_2', g.index) + self.assertNotIn('GCA_3', g.index) finally: os.unlink(pw_path) diff --git a/gambitdb/tests/data/gambitdb/pw-dists.csv b/gambitdb/tests/data/gambitdb/pw-dists.csv new file mode 100644 index 0000000..392a552 --- /dev/null +++ b/gambitdb/tests/data/gambitdb/pw-dists.csv @@ -0,0 +1,5 @@ +,s2_overlap_s1,s3_partial_overlap_s1,s4_no_overlap,sample1 +s2_overlap_s1,0.0000,0.0000,0.0000,0.0000 +s3_partial_overlap_s1,0.0000,0.0000,0.0000,0.0000 +s4_no_overlap,0.0000,0.0000,0.0000,0.0000 +sample1,0.0000,0.0000,0.0000,0.0000 diff --git a/gambitdb/tests/data/gambitdb/signatures.h5 b/gambitdb/tests/data/gambitdb/signatures.h5 new file mode 100644 index 0000000..13e2dfd Binary files /dev/null and b/gambitdb/tests/data/gambitdb/signatures.h5 differ