hoodini.utils.tree_building
import itertools
import sys

import ete3

from hoodini.utils.logging_utils import error, info, warn


def calculate_taxid_distances(taxids, update_db=False):
    """
    Calculate pairwise distances between given taxonomic IDs using ete3 by passing node objects.

    A topology containing the requested taxids (with intermediate nodes) is
    obtained from ``ete3.NCBITaxa.get_topology`` and the distance of every
    unordered pair of taxids is read from that tree with ``tree.get_distance``.

    Parameters:
    - taxids (list): A list of taxonomic IDs (integers or strings accepted by ``int``).
    - update_db (bool): Whether to update the local NCBI taxonomy database
      before building the topology. Default is False.

    Returns:
    - dict: A dictionary with keys as ``(taxid1, taxid2)`` tuples of integer
      taxids (pairs follow ``itertools.combinations`` order of the input) and
      values as their distances.

    Raises:
    - ValueError: If a taxid string is not a valid integer, if any of the
      taxids are missing from the topology tree, or if no distance could be
      calculated (for example when only one taxid is supplied).
    - SystemExit: If updating the taxonomy database or retrieving the
      topology fails; the error is logged and the process exits with status 1.
    """
    # Normalise once. int() raises here on malformed input, so no second
    # conversion (and no handler for it) is needed further down.
    taxids_int = [int(taxid) for taxid in taxids]
    taxids_str = [str(taxid) for taxid in taxids_int]

    ncbi = ete3.NCBITaxa()

    if update_db:
        try:
            info("Updating NCBI taxonomy database. This may take a while...")
            ncbi.update_taxonomy_database()
            info("Taxonomy database updated successfully.")
        except Exception as e:
            error(f"Error updating taxonomy database: {e}")
            sys.exit(1)

    try:
        tree = ncbi.get_topology(taxids_str, intermediate_nodes=True)
    except Exception as e:
        error(f"Error retrieving topology: {e}")
        sys.exit(1)

    # Index tree nodes by integer taxid; nodes whose name is not numeric are skipped.
    taxid_to_node = {}
    for node in tree.traverse():
        try:
            taxid_to_node[int(node.name)] = node
        except ValueError:
            continue

    missing_taxids = [taxid for taxid in taxids_int if taxid not in taxid_to_node]
    if missing_taxids:
        warn("The following taxids are missing from the taxonomy tree:")
        for mtaxid in missing_taxids:
            # Name lookup is best-effort: it only decorates the warning.
            try:
                name = ncbi.get_taxid_translator([mtaxid]).get(mtaxid, "Unknown")
            except Exception:
                name = "Unknown"
            warn(f" - {mtaxid} ({name})")
        raise ValueError(
            "Some taxids are missing from the taxonomy tree. Please verify their validity."
        )
    info("All taxids are present in the taxonomy tree.")

    distances = {}

    info("Calculating pairwise distances using node objects...")
    for taxid1, taxid2 in itertools.combinations(taxids_int, 2):
        # Both lookups are safe: every requested taxid was verified above.
        node1 = taxid_to_node[taxid1]
        node2 = taxid_to_node[taxid2]
        try:
            distances[(taxid1, taxid2)] = tree.get_distance(node1, node2)
        except Exception as e:
            # A failing pair is reported and skipped so the remaining pairs still get computed.
            warn(f"Error calculating distance between {taxid1} and {taxid2}: {e}")

    if not distances:
        raise ValueError("No distances were calculated. Please check the taxids and try again.")

    info("Pairwise distances calculated successfully.")

    return distances
def calculate_taxid_distances(taxids, update_db=False):
def calculate_taxid_distances(taxids, update_db=False):
    """
    Calculate pairwise distances between given taxonomic IDs using ete3 by passing node objects.

    A topology containing the requested taxids (with intermediate nodes) is
    obtained from ``ete3.NCBITaxa.get_topology`` and the distance of every
    unordered pair of taxids is read from that tree with ``tree.get_distance``.

    Parameters:
    - taxids (list): A list of taxonomic IDs (integers or strings accepted by ``int``).
    - update_db (bool): Whether to update the local NCBI taxonomy database
      before building the topology. Default is False.

    Returns:
    - dict: A dictionary with keys as ``(taxid1, taxid2)`` tuples of integer
      taxids (pairs follow ``itertools.combinations`` order of the input) and
      values as their distances.

    Raises:
    - ValueError: If a taxid string is not a valid integer, if any of the
      taxids are missing from the topology tree, or if no distance could be
      calculated (for example when only one taxid is supplied).
    - SystemExit: If updating the taxonomy database or retrieving the
      topology fails; the error is logged and the process exits with status 1.
    """
    # Normalise once. int() raises here on malformed input, so no second
    # conversion (and no handler for it) is needed further down.
    taxids_int = [int(taxid) for taxid in taxids]
    taxids_str = [str(taxid) for taxid in taxids_int]

    ncbi = ete3.NCBITaxa()

    if update_db:
        try:
            info("Updating NCBI taxonomy database. This may take a while...")
            ncbi.update_taxonomy_database()
            info("Taxonomy database updated successfully.")
        except Exception as e:
            error(f"Error updating taxonomy database: {e}")
            sys.exit(1)

    try:
        tree = ncbi.get_topology(taxids_str, intermediate_nodes=True)
    except Exception as e:
        error(f"Error retrieving topology: {e}")
        sys.exit(1)

    # Index tree nodes by integer taxid; nodes whose name is not numeric are skipped.
    taxid_to_node = {}
    for node in tree.traverse():
        try:
            taxid_to_node[int(node.name)] = node
        except ValueError:
            continue

    missing_taxids = [taxid for taxid in taxids_int if taxid not in taxid_to_node]
    if missing_taxids:
        warn("The following taxids are missing from the taxonomy tree:")
        for mtaxid in missing_taxids:
            # Name lookup is best-effort: it only decorates the warning.
            try:
                name = ncbi.get_taxid_translator([mtaxid]).get(mtaxid, "Unknown")
            except Exception:
                name = "Unknown"
            warn(f" - {mtaxid} ({name})")
        raise ValueError(
            "Some taxids are missing from the taxonomy tree. Please verify their validity."
        )
    info("All taxids are present in the taxonomy tree.")

    distances = {}

    info("Calculating pairwise distances using node objects...")
    for taxid1, taxid2 in itertools.combinations(taxids_int, 2):
        # Both lookups are safe: every requested taxid was verified above.
        node1 = taxid_to_node[taxid1]
        node2 = taxid_to_node[taxid2]
        try:
            distances[(taxid1, taxid2)] = tree.get_distance(node1, node2)
        except Exception as e:
            # A failing pair is reported and skipped so the remaining pairs still get computed.
            warn(f"Error calculating distance between {taxid1} and {taxid2}: {e}")

    if not distances:
        raise ValueError("No distances were calculated. Please check the taxids and try again.")

    info("Pairwise distances calculated successfully.")

    return distances
Calculate pairwise distances between given taxonomic IDs using ete3 by passing node objects.
Parameters:
- taxids (list): A list of taxonomic IDs (integers or strings).
- update_db (bool): Whether to update the local NCBI taxonomy database. Default is False.
Returns:
- dict: A dictionary with keys as tuples of taxid pairs and values as their distances.
Raises:
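- ValueError: If any of the taxids are missing from the taxonomy tree, or if no pairwise distances could be calculated.
- SystemExit: If updating the taxonomy database or retrieving the topology fails (the error is logged and the process exits with status 1).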