hoodini.utils.id_parsing
ID parsing and categorization utilities.
"""ID parsing and categorization utilities."""

from __future__ import annotations

import re

# RefSeq accession prefixes (two letters plus an underscore).
_REFSEQ_NUCLEOTIDE_PREFIXES = (
    "AC_", "NC_", "NG_", "NT_", "NW_", "NZ_", "NM_", "NR_", "XM_", "XR_"
)
_REFSEQ_PROTEIN_PREFIXES = ("AP_", "NP_", "YP_", "XP_", "WP_")
_REFSEQ_PREFIXES = _REFSEQ_NUCLEOTIDE_PREFIXES + _REFSEQ_PROTEIN_PREFIXES

# Two-letter prefixes recognised by the prefix-specific nucleotide pattern.
_NUCLEOTIDE_PREFIXES = (
    "NC", "NG", "NM", "NR", "NT", "NW", "NZ", "XM", "XR",
    "AC", "AP", "MT", "PP", "OR", "OZ", "LR", "LN", "KX",
)
_PROTEIN_PREFIXES = ("NP", "XP", "YP", "WP", "ZP", "AP")

# UniProtKB accession: 6 characters, or 10 for the extended format.
_UNIPROT_PATTERN = re.compile(
    r"^([OPQ][0-9][A-Z0-9]{3}[0-9]|"
    r"[A-NR-Z][0-9][A-Z][A-Z0-9]{2}[0-9]"
    r"(?:[A-Z][A-Z0-9]{2}[0-9])?)$"
)

_NUCLEOTIDE_PATTERNS = (
    # <prefix>_<letters?><digits> (NC_000913, NZ_CP012345) or <prefix><digits>,
    # with an optional ".version".  The letters after the underscore are
    # optional so plain RefSeq accessions such as NC_000913.3 are recognised.
    # AP_<digits> is excluded here because it is a RefSeq *protein* accession.
    re.compile(
        r"^(?!AP_\d)("
        + "|".join(_NUCLEOTIDE_PREFIXES)
        + r")(_[A-Z]*\d+|\d+)(\.\d+)?$"
    ),
    # Generic: 1-2 letters followed by 5-8 digits.
    re.compile(r"^[A-Z]{1,2}\d{5,8}(\.\d+)?$"),
    # Generic: 4-6 letters followed by at least 8 digits.
    re.compile(r"^[A-Z]{4,6}\d{8,}(\.\d+)?$"),
)

_PROTEIN_PATTERNS = (
    # <prefix>_<digits> with an optional ".version".
    re.compile(r"^(" + "|".join(_PROTEIN_PREFIXES) + r")_\d+(\.\d+)?$"),
    # Generic: 3 letters followed by 5-8 digits.
    re.compile(r"^[A-Z]{3}\d{5,8}(\.\d+)?$"),
)


def is_refseq_nuccore(nuc_id) -> bool:
    """Return True if *nuc_id* starts with a RefSeq accession prefix, else False.

    The check is purely prefix based.  All RefSeq nucleotide prefixes are
    accepted; RefSeq protein prefixes are accepted as well.  Non-string input
    yields False instead of raising.
    """
    return isinstance(nuc_id, str) and nuc_id.startswith(_REFSEQ_PREFIXES)


def switch_assembly_prefix(asm_id):
    """Swap a leading ``GCA_`` for ``GCF_`` and vice versa.

    Non-string input and strings starting with neither prefix are returned
    unchanged.
    """
    if not isinstance(asm_id, str):
        return asm_id
    swapped = {"GCA_": "GCF_", "GCF_": "GCA_"}.get(asm_id[:4])
    return asm_id if swapped is None else swapped + asm_id[4:]


def categorize_id(id_: str) -> dict[str, str | None]:
    """Classify an identifier as UniProt, nucleotide, protein or unmatched.

    Only the text before the first ``:`` is classified.  Checks run in the
    order UniProt, nucleotide, protein; the first match wins.

    Returns:
        A dict with keys ``"type"`` (``"uniprot"``, ``"nucleotide"``,
        ``"protein"`` or ``"unmatched"``), ``"id"`` (the text before the first
        colon) and ``"protein_id"``.  ``"protein_id"`` is the segment between
        the first and second colon for nucleotide IDs when present, otherwise
        ``None``.
    """
    parts = id_.split(":")
    id_part = parts[0]

    if _UNIPROT_PATTERN.match(id_part):
        return {"type": "uniprot", "id": id_part, "protein_id": None}
    if any(pattern.match(id_part) for pattern in _NUCLEOTIDE_PATTERNS):
        return {
            "type": "nucleotide",
            "id": id_part,
            "protein_id": parts[1] if len(parts) > 1 else None,
        }
    if any(pattern.match(id_part) for pattern in _PROTEIN_PATTERNS):
        return {"type": "protein", "id": id_part, "protein_id": None}
    return {"type": "unmatched", "id": id_part, "protein_id": None}
def
is_refseq_nuccore(nuc_id) -> bool:
def is_refseq_nuccore(nuc_id) -> bool:
    """Return True if *nuc_id* starts with a RefSeq accession prefix, else False.

    The check is purely prefix based.  All RefSeq nucleotide prefixes are
    accepted; RefSeq protein prefixes are accepted as well.  Non-string input
    yields False instead of raising.
    """
    refseq_prefixes = (
        # RefSeq nucleotide prefixes.
        "AC_", "NC_", "NG_", "NT_", "NW_", "NZ_", "NM_", "NR_", "XM_", "XR_",
        # RefSeq protein prefixes.
        "AP_", "NP_", "YP_", "XP_", "WP_",
    )
    return isinstance(nuc_id, str) and nuc_id.startswith(refseq_prefixes)
Return True if the nuccore accession is a RefSeq accession else False.
def
switch_assembly_prefix(asm_id):
def
categorize_id(id_: str) -> dict[str, str | None]:
# Two-letter prefixes recognised by the prefix-specific nucleotide pattern.
_NUCLEOTIDE_PREFIXES = (
    "NC", "NG", "NM", "NR", "NT", "NW", "NZ", "XM", "XR",
    "AC", "AP", "MT", "PP", "OR", "OZ", "LR", "LN", "KX",
)
_PROTEIN_PREFIXES = ("NP", "XP", "YP", "WP", "ZP", "AP")

# UniProtKB accession: 6 characters, or 10 for the extended format.
_UNIPROT_PATTERN = re.compile(
    r"^([OPQ][0-9][A-Z0-9]{3}[0-9]|"
    r"[A-NR-Z][0-9][A-Z][A-Z0-9]{2}[0-9]"
    r"(?:[A-Z][A-Z0-9]{2}[0-9])?)$"
)

_NUCLEOTIDE_PATTERNS = (
    # <prefix>_<letters?><digits> (NC_000913, NZ_CP012345) or <prefix><digits>,
    # with an optional ".version".  The letters after the underscore are
    # optional so plain RefSeq accessions such as NC_000913.3 are recognised.
    # AP_<digits> is excluded here because it is a RefSeq *protein* accession.
    re.compile(
        r"^(?!AP_\d)("
        + "|".join(_NUCLEOTIDE_PREFIXES)
        + r")(_[A-Z]*\d+|\d+)(\.\d+)?$"
    ),
    # Generic: 1-2 letters followed by 5-8 digits.
    re.compile(r"^[A-Z]{1,2}\d{5,8}(\.\d+)?$"),
    # Generic: 4-6 letters followed by at least 8 digits.
    re.compile(r"^[A-Z]{4,6}\d{8,}(\.\d+)?$"),
)

_PROTEIN_PATTERNS = (
    # <prefix>_<digits> with an optional ".version".
    re.compile(r"^(" + "|".join(_PROTEIN_PREFIXES) + r")_\d+(\.\d+)?$"),
    # Generic: 3 letters followed by 5-8 digits.
    re.compile(r"^[A-Z]{3}\d{5,8}(\.\d+)?$"),
)


def categorize_id(id_: str) -> dict[str, str | None]:
    """Classify an identifier as UniProt, nucleotide, protein or unmatched.

    Only the text before the first ``:`` is classified.  Checks run in the
    order UniProt, nucleotide, protein; the first match wins.

    Returns:
        A dict with keys ``"type"`` (``"uniprot"``, ``"nucleotide"``,
        ``"protein"`` or ``"unmatched"``), ``"id"`` (the text before the first
        colon) and ``"protein_id"``.  ``"protein_id"`` is the segment between
        the first and second colon for nucleotide IDs when present, otherwise
        ``None``.
    """
    parts = id_.split(":")
    id_part = parts[0]

    if _UNIPROT_PATTERN.match(id_part):
        return {"type": "uniprot", "id": id_part, "protein_id": None}
    if any(pattern.match(id_part) for pattern in _NUCLEOTIDE_PATTERNS):
        return {
            "type": "nucleotide",
            "id": id_part,
            "protein_id": parts[1] if len(parts) > 1 else None,
        }
    if any(pattern.match(id_part) for pattern in _PROTEIN_PATTERNS):
        return {"type": "protein", "id": id_part, "protein_id": None}
    return {"type": "unmatched", "id": id_part, "protein_id": None}