hoodini.utils.tree_building

 1import itertools
 2import sys
 3
 4import ete3
 5
 6from hoodini.utils.logging_utils import error, info, warn
 7
 8
 9def calculate_taxid_distances(taxids, update_db=False):
10    """
11    Calculate pairwise distances between given taxonomic IDs using ete3 by passing node objects.
12
13    Parameters:
14    - taxids (list): A list of taxonomic IDs (integers or strings).
15    - update_db (bool): Whether to update the local NCBI taxonomy database. Default is False.
16
17    Returns:
18    - dict: A dictionary with keys as tuples of taxid pairs and values as their distances.
19
20    Raises:
21    - ValueError: If any of the taxids are missing from the taxonomy database.
22    """
23
24    taxids_str = [str(int(taxid)) for taxid in taxids]
25
26    ncbi = ete3.NCBITaxa()
27
28    if update_db:
29        try:
30            info("Updating NCBI taxonomy database. This may take a while...")
31            ncbi.update_taxonomy_database()
32            info("Taxonomy database updated successfully.")
33        except Exception as e:
34            error(f"Error updating taxonomy database: {e}")
35            sys.exit(1)
36
37    try:
38        tree = ncbi.get_topology(taxids_str, intermediate_nodes=True)
39    except Exception as e:
40        error(f"Error retrieving topology: {e}")
41        sys.exit(1)
42
43    tree_taxids = set()
44    taxid_to_node = {}
45    for node in tree.traverse():
46        try:
47            taxid = int(node.name)
48            tree_taxids.add(taxid)
49            taxid_to_node[taxid] = node
50        except ValueError:
51            continue
52
53    try:
54        taxids_int = [int(taxid) for taxid in taxids_str]
55    except ValueError as ve:
56        error(f"Error converting taxids to integers: {ve}")
57        sys.exit(1)
58
59    missing_taxids = [taxid for taxid in taxids_int if taxid not in tree_taxids]
60    if missing_taxids:
61        warn("The following taxids are missing from the taxonomy tree:")
62        for mtaxid in missing_taxids:
63            try:
64                name = ncbi.get_taxid_translator([mtaxid]).get(mtaxid, "Unknown")
65            except Exception:
66                name = "Unknown"
67            warn(f" - {mtaxid} ({name})")
68        raise ValueError(
69            "Some taxids are missing from the taxonomy tree. Please verify their validity."
70        )
71    else:
72        info("All taxids are present in the taxonomy tree.")
73
74    distances = {}
75
76    info("Calculating pairwise distances using node objects...")
77    for taxid1, taxid2 in itertools.combinations(taxids_int, 2):
78        try:
79            node1 = taxid_to_node[taxid1]
80            node2 = taxid_to_node[taxid2]
81            distance = tree.get_distance(node1, node2)
82            distances[(taxid1, taxid2)] = distance
83        except Exception as e:
84            warn(f"Error calculating distance between {taxid1} and {taxid2}: {e}")
85
86    if not distances:
87        raise ValueError("No distances were calculated. Please check the taxids and try again.")
88
89    info("Pairwise distances calculated successfully.")
90
91    return distances
def calculate_taxid_distances(taxids, update_db=False):
def calculate_taxid_distances(taxids, update_db=False):
    """
    Calculate pairwise distances between taxonomic IDs on an ete3 NCBI topology.

    The taxids are normalised to integers, a topology containing them (with
    intermediate nodes) is requested from ete3, and the distance between every
    pair of requested taxids is read from that tree via ``tree.get_distance``.

    Parameters:
    - taxids (iterable): Taxonomic IDs; each element must be accepted by ``int()``.
    - update_db (bool): Whether to update the local NCBI taxonomy database. Default is False.

    Returns:
    - dict: Maps ``(taxid1, taxid2)`` integer tuples, in the order produced by
      ``itertools.combinations`` over the input, to the distance returned by ete3.

    Raises:
    - ValueError: If a taxid cannot be converted to an integer, if any taxid is
      missing from the retrieved topology, or if no pairwise distance could be
      computed (for instance when fewer than two taxids were supplied).
    - SystemExit: With exit code 1 if updating the taxonomy database or
      retrieving the topology fails.
    """

    # Normalise once, up front: malformed IDs raise ValueError here, before any
    # database work. Both the int and the str forms are derived from this pass.
    taxids_int = [int(taxid) for taxid in taxids]
    taxids_str = [str(taxid) for taxid in taxids_int]

    ncbi = ete3.NCBITaxa()

    if update_db:
        try:
            info("Updating NCBI taxonomy database. This may take a while...")
            ncbi.update_taxonomy_database()
            info("Taxonomy database updated successfully.")
        except Exception as e:
            error(f"Error updating taxonomy database: {e}")
            sys.exit(1)

    try:
        tree = ncbi.get_topology(taxids_str, intermediate_nodes=True)
    except Exception as e:
        error(f"Error retrieving topology: {e}")
        sys.exit(1)

    # Index tree nodes by integer taxid; nodes whose name is not numeric are skipped.
    taxid_to_node = {}
    for node in tree.traverse():
        try:
            taxid_to_node[int(node.name)] = node
        except ValueError:
            continue

    missing_taxids = [taxid for taxid in taxids_int if taxid not in taxid_to_node]
    if missing_taxids:
        warn("The following taxids are missing from the taxonomy tree:")
        for mtaxid in missing_taxids:
            # The name lookup is best-effort and only enriches the warning.
            try:
                name = ncbi.get_taxid_translator([mtaxid]).get(mtaxid, "Unknown")
            except Exception:
                name = "Unknown"
            warn(f" - {mtaxid} ({name})")
        raise ValueError(
            "Some taxids are missing from the taxonomy tree. Please verify their validity."
        )

    info("All taxids are present in the taxonomy tree.")

    distances = {}

    info("Calculating pairwise distances using node objects...")
    for taxid1, taxid2 in itertools.combinations(taxids_int, 2):
        # A failure on one pair is reported and skipped so the remaining pairs
        # are still computed.
        try:
            distances[(taxid1, taxid2)] = tree.get_distance(
                taxid_to_node[taxid1], taxid_to_node[taxid2]
            )
        except Exception as e:
            warn(f"Error calculating distance between {taxid1} and {taxid2}: {e}")

    if not distances:
        raise ValueError("No distances were calculated. Please check the taxids and try again.")

    info("Pairwise distances calculated successfully.")

    return distances

Calculate pairwise distances between given taxonomic IDs using ete3 by passing node objects.

Parameters:

  • taxids (list): A list of taxonomic IDs (integers or strings).
  • update_db (bool): Whether to update the local NCBI taxonomy database. Default is False.

Returns:

  • dict: A dictionary with keys as tuples of taxid pairs and values as their distances.

Raises:

  • ValueError: If any of the taxids are missing from the taxonomy database.