hoodini.utils.id_parsing

ID parsing and categorization utilities.

 1"""ID parsing and categorization utilities."""
 2
 3from __future__ import annotations
 4
 5import re
 6
 7
 8def is_refseq_nuccore(nuc_id) -> bool:
 9    """Return True if the nuccore accession is a RefSeq accession else False."""
10    refseq_prefixes = ("NC_", "NZ_", "NM_", "NR_", "XM_", "XR_", "AP_", "YP_", "XP_", "WP_")
11    return isinstance(nuc_id, str) and nuc_id.startswith(refseq_prefixes)
12
13
def switch_assembly_prefix(asm_id):
    """Swap the ``GCA_`` / ``GCF_`` prefix of an assembly accession.

    ``GCA_xxx`` becomes ``GCF_xxx`` and vice versa; everything after the
    four-character prefix is kept untouched. Values that are not strings, and
    strings that start with neither prefix, are returned unchanged.
    """
    if isinstance(asm_id, str):
        for current, replacement in (("GCA_", "GCF_"), ("GCF_", "GCA_")):
            if asm_id.startswith(current):
                return replacement + asm_id[len(current):]
    return asm_id
22
23
# Two-letter prefixes accepted by the first nucleotide pattern below.
_NUCLEOTIDE_PREFIXES = (
    "NC",
    "NG",
    "NM",
    "NR",
    "NT",
    "NW",
    "NZ",
    "AC",
    "AP",
    "MT",
    "PP",
    "OR",
    "OZ",
    "LR",
    "LN",
    "KX",
)
# Two-letter prefixes accepted (with an underscore) by the first protein pattern.
_PROTEIN_PREFIXES = ("NP", "XP", "YP", "WP", "ZP")

# Patterns are compiled once at import time instead of on every call.
_UNIPROT_PATTERN = re.compile(
    r"^([OPQ][0-9][A-Z0-9]{3}[0-9]|"
    r"[A-NR-Z][0-9][A-Z][A-Z0-9]{2}[0-9]"
    r"(?:[A-Z][A-Z0-9]{2}[0-9])?)$"
)
_NUCLEOTIDE_PATTERNS = (
    # Prefix followed by digits ("MT123456.1") or by an underscore form.
    # ``[A-Z]*`` (not ``+``) so that purely numeric bodies such as
    # "NC_000913.3" match as well as lettered ones such as "NZ_CP012345.1".
    # NOTE(review): the trailing ``(:\d+-\d+)?`` group can never match here
    # because the input is split on ":" before matching; a "start-end" suffix
    # therefore ends up in ``protein_id`` -- confirm the intended behaviour.
    re.compile(
        r"^(" + "|".join(_NUCLEOTIDE_PREFIXES) + r")"
        r"(_[A-Z]*\d+|\d+)(\.\d+)?(:\d+-\d+)?$"
    ),
    re.compile(r"^[A-Z]{1,2}\d{5,8}(\.\d+)?$"),
    re.compile(r"^[A-Z]{4,6}\d{8,}(\.\d+)?$"),
)
_PROTEIN_PATTERNS = (
    re.compile(r"^(" + "|".join(_PROTEIN_PREFIXES) + r")_\d+(\.\d+)?$"),
    re.compile(r"^[A-Z]{3}\d{5,8}(\.\d+)?$"),
)


def categorize_id(id_: str) -> dict[str, str | None]:
    """Classify an identifier as UniProt, nucleotide, protein or unmatched.

    The input is split on ``":"``. The first field is the identifier that gets
    classified; for nucleotide identifiers the second field (if any) is
    reported as the accompanying protein ID.

    Patterns are tried in a fixed order -- UniProt, then nucleotide, then
    protein -- and the first match wins.

    Args:
        id_: Identifier string, optionally of the form ``"<id>:<protein_id>"``.

    Returns:
        A dict with three keys:

        * ``"type"``: ``"uniprot"``, ``"nucleotide"``, ``"protein"`` or
          ``"unmatched"``.
        * ``"id"``: the part of ``id_`` before the first ``":"``.
        * ``"protein_id"``: the second ``":"``-separated field when the type is
          ``"nucleotide"`` and such a field exists, otherwise ``None``.
    """
    fields = id_.split(":")
    accession = fields[0]

    if _UNIPROT_PATTERN.match(accession):
        return {"type": "uniprot", "id": accession, "protein_id": None}

    if any(pattern.match(accession) for pattern in _NUCLEOTIDE_PATTERNS):
        return {
            "type": "nucleotide",
            "id": accession,
            "protein_id": fields[1] if len(fields) > 1 else None,
        }

    if any(pattern.match(accession) for pattern in _PROTEIN_PATTERNS):
        return {"type": "protein", "id": accession, "protein_id": None}

    return {"type": "unmatched", "id": accession, "protein_id": None}
def is_refseq_nuccore(nuc_id) -> bool:
    """Tell whether *nuc_id* starts with one of the known RefSeq prefixes.

    The check is purely prefix based: the accession body and version are not
    validated. Anything that is not a ``str`` (``None``, numbers, ...) yields
    ``False`` instead of raising.
    """
    if not isinstance(nuc_id, str):
        return False
    # ``str.startswith`` accepts a tuple, so a single call tests every prefix.
    return nuc_id.startswith(
        ("NC_", "NZ_", "NM_", "NR_", "XM_", "XR_", "AP_", "YP_", "XP_", "WP_")
    )

Return True if the nuccore accession is a RefSeq accession else False.

def switch_assembly_prefix(asm_id):
    """Swap the ``GCA_`` / ``GCF_`` prefix of an assembly accession.

    ``GCA_xxx`` becomes ``GCF_xxx`` and vice versa; everything after the
    four-character prefix is kept untouched. Values that are not strings, and
    strings that start with neither prefix, are returned unchanged.
    """
    if isinstance(asm_id, str):
        for current, replacement in (("GCA_", "GCF_"), ("GCF_", "GCA_")):
            if asm_id.startswith(current):
                return replacement + asm_id[len(current):]
    return asm_id
# Two-letter prefixes accepted by the first nucleotide pattern below.
_NUCLEOTIDE_PREFIXES = (
    "NC",
    "NG",
    "NM",
    "NR",
    "NT",
    "NW",
    "NZ",
    "AC",
    "AP",
    "MT",
    "PP",
    "OR",
    "OZ",
    "LR",
    "LN",
    "KX",
)
# Two-letter prefixes accepted (with an underscore) by the first protein pattern.
_PROTEIN_PREFIXES = ("NP", "XP", "YP", "WP", "ZP")

# Patterns are compiled once at import time instead of on every call.
_UNIPROT_PATTERN = re.compile(
    r"^([OPQ][0-9][A-Z0-9]{3}[0-9]|"
    r"[A-NR-Z][0-9][A-Z][A-Z0-9]{2}[0-9]"
    r"(?:[A-Z][A-Z0-9]{2}[0-9])?)$"
)
_NUCLEOTIDE_PATTERNS = (
    # Prefix followed by digits ("MT123456.1") or by an underscore form.
    # ``[A-Z]*`` (not ``+``) so that purely numeric bodies such as
    # "NC_000913.3" match as well as lettered ones such as "NZ_CP012345.1".
    # NOTE(review): the trailing ``(:\d+-\d+)?`` group can never match here
    # because the input is split on ":" before matching; a "start-end" suffix
    # therefore ends up in ``protein_id`` -- confirm the intended behaviour.
    re.compile(
        r"^(" + "|".join(_NUCLEOTIDE_PREFIXES) + r")"
        r"(_[A-Z]*\d+|\d+)(\.\d+)?(:\d+-\d+)?$"
    ),
    re.compile(r"^[A-Z]{1,2}\d{5,8}(\.\d+)?$"),
    re.compile(r"^[A-Z]{4,6}\d{8,}(\.\d+)?$"),
)
_PROTEIN_PATTERNS = (
    re.compile(r"^(" + "|".join(_PROTEIN_PREFIXES) + r")_\d+(\.\d+)?$"),
    re.compile(r"^[A-Z]{3}\d{5,8}(\.\d+)?$"),
)


def categorize_id(id_: str) -> dict[str, str | None]:
    """Classify an identifier as UniProt, nucleotide, protein or unmatched.

    The input is split on ``":"``. The first field is the identifier that gets
    classified; for nucleotide identifiers the second field (if any) is
    reported as the accompanying protein ID.

    Patterns are tried in a fixed order -- UniProt, then nucleotide, then
    protein -- and the first match wins.

    Args:
        id_: Identifier string, optionally of the form ``"<id>:<protein_id>"``.

    Returns:
        A dict with three keys:

        * ``"type"``: ``"uniprot"``, ``"nucleotide"``, ``"protein"`` or
          ``"unmatched"``.
        * ``"id"``: the part of ``id_`` before the first ``":"``.
        * ``"protein_id"``: the second ``":"``-separated field when the type is
          ``"nucleotide"`` and such a field exists, otherwise ``None``.
    """
    fields = id_.split(":")
    accession = fields[0]

    if _UNIPROT_PATTERN.match(accession):
        return {"type": "uniprot", "id": accession, "protein_id": None}

    if any(pattern.match(accession) for pattern in _NUCLEOTIDE_PATTERNS):
        return {
            "type": "nucleotide",
            "id": accession,
            "protein_id": fields[1] if len(fields) > 1 else None,
        }

    if any(pattern.match(accession) for pattern in _PROTEIN_PATTERNS):
        return {"type": "protein", "id": accession, "protein_id": None}

    return {"type": "unmatched", "id": accession, "protein_id": None}