hoodini.pipeline
def run_pipeline(config: RuntimeConfig) -> None:
    """Execute the hoodini workflow using the provided config.

    Pipeline Stages and File I/O:
    =============================

    1. INITIALIZATION (initialize_inputs)
       Expects: config.input_path OR config.inputsheet
       Generates:
           - {output}/ (creates output directory)
       Returns: records DataFrame

    2. IPG PARSING (run_ipg)
       Expects: records DataFrame
       Generates: (no files, enriches records with IPG data)
       Returns: enriched records DataFrame

    3. ASSEMBLY PARSING (run_assembly_parser)
       Expects: records DataFrame, assembly_summary.parquet (packaged data)
       Generates:
           - {output}/assembly_list.txt
           - {output}/assembly_folder/{GCA_*}/*.fna, *.gff
           - {output}/all_neigh.tsv
           - {output}/neighborhood/neighborhoods.fasta
           - {output}/temp.gff
       Returns: all_gff, all_prots, all_neigh DataFrames + valid_uids

    4. PROTEIN COMPARISONS (run_protein_links) [if aai_tree or prot_links]
       Expects: all_prots DataFrame
       Generates:
           - {output}/pairwise_aa.tsv
       Returns: pairwise_aa DataFrame

    5. NUCLEOTIDE COMPARISONS (run_pairwise_nt) [if ani_tree or nt_links]
       Expects: all_neigh, all_gff DataFrames
       Generates:
           - {output}/ani_matrix.tsv (if ani_mode)
           - {output}/nt_links.tsv (if nt_links)
       Returns: pairwise_ani, nt_links DataFrames

    6. PROTEIN CLUSTERING (cluster_proteins)
       Expects: all_prots DataFrame
       Generates:
           - {output}/target_prots.fasta
           - {output}/target_prots.aln (if clust_method != 'none')
       Returns: all_prots with fam_cluster column

    7. PROTEOME SIMILARITY (run_proteome_similarity) [if aai_tree]
       Expects: all_prots, pairwise_aa, all_neigh, all_gff
       Generates:
           - {output}/aai_matrix.tsv
       Returns: pairwise_aai DataFrame

    8. TAXONOMY & TREE (parse_taxonomy_and_build_tree)
       Expects: records, all_gff, all_neigh, all_prots DataFrames
       Generates:
           - {output}/tree.nwk
           - {output}/records.csv
       Returns: tree_str, den_data DataFrame

    9. EXTRA ANNOTATIONS (optional tools)
       - domains (run_domain): {output}/domains.tsv
       - blast (run_blast): enriches all_gff
       - padloc (run_padloc): enriches all_prots
       - emapper (run_emapper): enriches all_prots
       - deffinder (run_defensefinder): enriches all_prots
       - cctyper (run_cctyper): {output}/cctyper/, enriches all_prots + all_gff
       - ncrna (run_ncrna): {output}/ncrna/results.txt, results.sto, enriches all_gff
       - genomad (run_genomad): {output}/genomad/, enriches all_gff

    10. VIZ OUTPUTS (write_viz_outputs)
        Expects: all_gff, all_neigh, all_prots, den_data, tree_str, optional extras
        Generates:
            - {output}/hoodini-viz/parquet/*.parquet (gff, hoods, protein_metadata,
              tree_metadata, nucleotide_links, protein_links, domains,
              domains_metadata, ncrna_metadata)
            - {output}/hoodini-viz/tsv/*.txt (corresponding TSV files)
            - {output}/hoodini-viz/tree.nwk
            - {output}/hoodini-viz/hoodini-viz.html (standalone viewer with embedded data)
    """
    stage_header("Initializing Hoodini", "🚀")

    # Stage imports are deliberately local so each stage's (potentially heavy)
    # dependencies are only loaded when that stage actually runs.
    from hoodini.pipeline.initialize import initialize_inputs

    records = initialize_inputs(
        input_path=config.input_path,
        inputsheet=config.inputsheet,
        output=config.output,
        force=config.force,
        remote_evalue=config.remote_evalue or 1e-5,
        remote_max_targets=config.remote_max_targets or 100,
    )

    stage_done("Initialization complete")

    stage_header("Parsing IPG data", "🔍")
    from hoodini.pipeline.parse_ipg import run_ipg

    records = run_ipg(
        records_df=records,
        cand_mode=config.cand_mode,
    )

    stage_done("IPG parsing complete")

    stage_header("Downloading and parsing assemblies", "📥")
    from hoodini.pipeline.parse_assemblies import run_assembly_parser

    result = run_assembly_parser(
        records_df=records,
        output_dir=config.output,
        assembly_folder=config.assembly_folder,
        ncrna=config.ncrna,
        cctyper=config.cctyper,
        genomad=config.genomad,
        blast=config.blast,
        apikey=config.apikey,
        max_concurrent_downloads=config.max_concurrent_downloads,
        num_threads=config.num_threads,
        mod=config.mod,
        wn=config.wn,
        sorfs=config.sorfs,
        minwin=config.minwin,
        minwin_type=config.minwin_type,
    )

    records = result["records"]
    all_gff = result["all_gff"]
    all_prots = result["all_prots"]
    all_neigh = result["all_neigh"]
    valid_uids = result["valid_uids"]

    stage_done("Assembly parsing and neighborhood extraction complete")

    # Abort early if nothing was extracted to avoid downstream errors.
    # hasattr-guard: treat anything without polars' is_empty() as empty.
    if (all_prots.is_empty() if hasattr(all_prots, "is_empty") else True) or (
        all_neigh.is_empty() if hasattr(all_neigh, "is_empty") else True
    ):
        from hoodini.utils.logging_utils import error

        error("No neighborhoods/proteins extracted; stopping before taxonomy/trees.")
        return

    if config.tree_mode == "aai_tree" or config.prot_links:
        stage_header("Running all-vs-all protein comparisons", "🦠")
        from hoodini.pipeline.protein_links import run_protein_links

        pairwise_aa = run_protein_links(
            output_dir=config.output,
            all_prots=all_prots,
            threads=config.num_threads,
            evalue=1e-5,
        )

        stage_done("All-vs-all protein comparisons complete")
    else:
        pairwise_aa = None

    if config.tree_mode == "ani_tree" or config.nt_links:
        stage_header("Running pairwise nucleotide comparisons", "🦠")
        from hoodini.pipeline.pairwise_nt import run_pairwise_nt

        pairwise_ani, nt_links = run_pairwise_nt(
            all_neigh=all_neigh,
            all_gff=all_gff,
            output_dir=config.output,
            nt_aln_mode=config.nt_aln_mode,
            ani_mode=config.ani_mode,
            nt_links=bool(config.nt_links),
            threads=config.num_threads,
        )

        stage_done("Pairwise nucleotide comparisons complete")
    else:
        pairwise_ani = None
        nt_links = None

    stage_header("Clustering neighbor proteins", "✨")
    from hoodini.pipeline.cluster_proteins import cluster_proteins

    all_prots = cluster_proteins(
        all_prots,
        output_dir=config.output,
        clust_method=config.clust_method,
        sorfs=config.sorfs,
    )

    if config.sorfs:
        # Drop candidate sORFs that failed to cluster into any family, plus
        # their corresponding GFF features (matched on the full attributes
        # string "ID=<id>").
        # FIX: the previous pandas-style item assignment
        # (discarded_sorfs["gff_id"] = ...) and boolean-mask indexing are not
        # supported on polars DataFrames; use filter()/expressions instead.
        discarded_sorfs = all_prots.filter(
            pl.col("id").str.contains("sORF") & pl.col("fam_cluster").is_null()
        )
        discarded_ids = discarded_sorfs["id"].unique().to_list()
        discarded_gff_ids = ["ID=" + i for i in discarded_ids]
        all_prots = all_prots.filter(~pl.col("id").is_in(discarded_ids))
        all_gff = all_gff.filter(~pl.col("attributes").is_in(discarded_gff_ids))

    stage_done("Clustering complete")

    if config.tree_mode == "aai_tree":
        from hoodini.pipeline.proteome_similarity import run_proteome_similarity

        stage_header("Computing proteome similarity", "🔗")
        pairwise_aai = run_proteome_similarity(
            all_prots=all_prots,
            pairwise_aa=pairwise_aa,
            all_neigh=all_neigh,
            all_gff=all_gff,
            outdir=config.output,
            mode=config.aai_mode,
            pident_min=config.min_pident,
            subset_mode="target_region",
            win=config.wn,
            # Fall back to "win_nts" when the config predates the `mod` field.
            win_mode=(
                config.mod if hasattr(config, "mod") and config.mod is not None else "win_nts"
            ),
            num_threads=config.num_threads,
        )

        stage_done("Proteome similarity complete")
    else:
        pairwise_aai = None

    stage_header("Extracting taxonomic information", "🦠")
    from hoodini.pipeline.taxonomy import parse_taxonomy_and_build_tree

    tree_str, den_data = parse_taxonomy_and_build_tree(
        records=records,
        all_gff=all_gff,
        all_neigh=all_neigh,
        all_prots=all_prots,
        output_dir=config.output,
        tree_mode=config.tree_mode,
        tree_file=config.tree_file,
        num_threads=config.num_threads,
        valid_uids=valid_uids,
        aai_mode=config.aai_mode,
        ani_mode=config.ani_mode,
        aai_subset_mode=config.aai_subset_mode,
        nj_algorithm=config.nj_algorithm,
        pairwise_ani=pairwise_ani,
        pairwise_aai=pairwise_aai,
    )

    # Optional-extras outputs; stay None when the corresponding tool is off.
    domains_data = None
    ncrna_data = None

    if config.domains:
        from hoodini.extra_tools.domain import run_domain

        domains_data = run_domain(all_prots, config.output, config.domains, config.num_threads)

    if config.blast:
        from hoodini.extra_tools.blast import run_blast

        blast_data = run_blast(
            all_neigh, config.output, config.blast, config.num_threads, valid_uids
        )
        if blast_data.height > 0:
            # Represent each BLAST hit as a GFF "region" row on the + strand.
            # (Rewritten with .select() for consistency with the cctyper /
            # ncrna / genomad branches below; same columns and values.)
            gff_df = blast_data.select(
                [
                    pl.col("seqid"),
                    pl.lit("hoodini").alias("source"),
                    pl.lit("region").alias("type"),
                    pl.col("start"),
                    pl.col("end"),
                    pl.lit(".").alias("score"),
                    pl.lit("+").alias("strand"),
                    pl.lit(".").alias("phase"),
                    (pl.lit("ID=") + pl.col("nc_feature") + pl.lit(";")).alias("attributes"),
                ]
            )
            all_gff = pl.concat([all_gff, gff_df], how="vertical")

    if config.padloc:
        from hoodini.extra_tools.padloc import run_padloc

        padloc_df = run_padloc(all_gff, all_prots, config.output, config.num_threads)
        if padloc_df.height > 0:
            all_prots = all_prots.join(padloc_df, on="id", how="left")

    if config.emapper:
        from hoodini.extra_tools.emapper import run_emapper

        emapper_df = run_emapper(all_prots, config.output, config.num_threads)

        if emapper_df.height > 0:
            # Backfill missing/blank `product` values with emapper's
            # description, keeping any existing non-empty product.
            # FIX: the previous implementation used pandas APIs
            # (set_index/.map/.where/.notna/.astype) on polars frames, which
            # would raise AttributeError; rewritten with polars expressions.
            if "description" in emapper_df.columns and "product" in all_prots.columns:
                desc_map = emapper_df.select(
                    pl.col("id"),
                    pl.col("description").alias("_emapper_description"),
                )
                all_prots = (
                    all_prots.join(desc_map, on="id", how="left")
                    .with_columns(
                        pl.when(
                            pl.col("product").is_not_null()
                            & (pl.col("product").cast(pl.Utf8).str.strip_chars() != "")
                        )
                        .then(pl.col("product"))
                        .otherwise(pl.col("_emapper_description"))
                        .alias("product")
                    )
                    .drop("_emapper_description")
                )

            if "id" in emapper_df.columns and "id" in all_prots.columns:
                all_prots = all_prots.join(emapper_df, on="id", how="left")

    if config.deffinder:
        from hoodini.extra_tools.defensefinder import run_defensefinder

        deffinder_df = run_defensefinder(all_gff, all_prots, config.output)
        if deffinder_df.height > 0:
            all_prots = all_prots.join(deffinder_df, on="id", how="left")

    if config.cctyper:
        from hoodini.extra_tools.cctyper import run_cctyper

        cctyper_df, crispr_df = run_cctyper(
            all_gff, all_prots, all_neigh, config.output, config.num_threads, valid_uids
        )
        if cctyper_df.height > 0:
            all_prots = all_prots.join(cctyper_df, on="id", how="left")
        if crispr_df.height > 0:
            # CRISPR arrays become GFF "region" rows (strand unknown).
            gff_df = crispr_df.select(
                [
                    pl.col("seqid"),
                    pl.lit("hoodini").alias("source"),
                    pl.lit("region").alias("type"),
                    pl.col("start"),
                    pl.col("end"),
                    pl.lit(".").alias("score"),
                    pl.lit(".").alias("strand"),
                    pl.lit(".").alias("phase"),
                    (pl.lit("ID=") + pl.col("nc_feature") + pl.lit(";")).alias("attributes"),
                ]
            )
            all_gff = pl.concat([all_gff, gff_df], how="vertical")

    if config.ncrna:
        from hoodini.extra_tools.ncrna import run_ncrna

        ncrna_data = run_ncrna(all_neigh, den_data, config.output, config.num_threads, valid_uids)
        if ncrna_data.height > 0:
            # Normalize coordinates so start <= end regardless of hit strand.
            gff_df = ncrna_data.select(
                [
                    pl.col("nucid").alias("seqid"),
                    pl.lit("hoodini").alias("source"),
                    pl.lit("ncRNA").alias("type"),
                    pl.min_horizontal([pl.col("start"), pl.col("end")]).alias("start"),
                    pl.max_horizontal([pl.col("start"), pl.col("end")]).alias("end"),
                    pl.lit(".").alias("score"),
                    pl.col("strand_ncrna").alias("strand"),
                    pl.lit(".").alias("phase"),
                    (pl.lit("ID=") + pl.col("nc_feature") + pl.lit(";")).alias("attributes"),
                ]
            )
            all_gff = pl.concat([all_gff, gff_df], how="vertical")

    if config.genomad:
        from hoodini.extra_tools.genomad import run_genomad

        genomad_df = run_genomad(all_neigh, config.output, config.num_threads, valid_uids)
        if genomad_df.height > 0:
            gff_df = genomad_df.select(
                [
                    pl.col("seqid"),
                    pl.lit("hoodini").alias("source"),
                    pl.lit("region").alias("type"),
                    pl.min_horizontal([pl.col("start"), pl.col("end")]).alias("start"),
                    pl.max_horizontal([pl.col("start"), pl.col("end")]).alias("end"),
                    pl.lit(".").alias("score"),
                    # FIX: was pl.lit(".Z") — ".Z" is not a valid GFF strand
                    # value (must be +, -, . or ?); use "." (unknown).
                    pl.lit(".").alias("strand"),
                    pl.lit(".").alias("phase"),
                    (pl.lit("ID=") + pl.col("mge_type") + pl.lit(";")).alias("attributes"),
                ]
            )
            all_gff = pl.concat([all_gff, gff_df], how="vertical")

    stage_done("Extra annotation complete")

    from hoodini.pipeline.write_data import write_viz_outputs

    write_viz_outputs(
        output_dir=config.output,
        all_gff=all_gff,
        all_neigh=all_neigh,
        all_prots=all_prots,
        den_data=den_data,
        tree_str=tree_str,
        nt_links=nt_links,
        pairwise_aa=pairwise_aa,
        domains_data=domains_data,
        write_domains=bool(config.domains),
        ncrna_data=ncrna_data,
    )
Execute the hoodini workflow using the provided config.
Pipeline Stages and File I/O:
INITIALIZATION (initialize_inputs) Expects: config.input_path OR config.inputsheet Generates:
- {output}/ (creates output directory) Returns: records DataFrame
IPG PARSING (run_ipg) Expects: records DataFrame Generates: (no files, enriches records with IPG data) Returns: enriched records DataFrame
ASSEMBLY PARSING (run_assembly_parser) Expects: records DataFrame, assembly_summary.parquet (packaged data) Generates:
- {output}/assembly_list.txt
- {output}/assembly_folder/{GCA_*}/*.fna, *.gff
- {output}/all_neigh.tsv
- {output}/neighborhood/neighborhoods.fasta
- {output}/temp.gff Returns: all_gff, all_prots, all_neigh DataFrames + valid_uids
PROTEIN COMPARISONS (run_protein_links) [if aai_tree or prot_links] Expects: all_prots DataFrame Generates:
- {output}/pairwise_aa.tsv Returns: pairwise_aa DataFrame
NUCLEOTIDE COMPARISONS (run_pairwise_nt) [if ani_tree or nt_links] Expects: all_neigh, all_gff DataFrames Generates:
- {output}/ani_matrix.tsv (if ani_mode)
- {output}/nt_links.tsv (if nt_links) Returns: pairwise_ani, nt_links DataFrames
PROTEIN CLUSTERING (cluster_proteins) Expects: all_prots DataFrame Generates:
- {output}/target_prots.fasta
- {output}/target_prots.aln (if clust_method != 'none') Returns: all_prots with fam_cluster column
PROTEOME SIMILARITY (run_proteome_similarity) [if aai_tree] Expects: all_prots, pairwise_aa, all_neigh, all_gff Generates:
- {output}/aai_matrix.tsv Returns: pairwise_aai DataFrame
TAXONOMY & TREE (parse_taxonomy_and_build_tree) Expects: records, all_gff, all_neigh, all_prots DataFrames Generates:
- {output}/tree.nwk
- {output}/records.csv Returns: tree_str, den_data DataFrame
EXTRA ANNOTATIONS (optional tools)
- domains (run_domain): {output}/domains.tsv
- blast (run_blast): enriches all_gff
- padloc (run_padloc): enriches all_prots
- emapper (run_emapper): enriches all_prots
- deffinder (run_defensefinder): enriches all_prots
- cctyper (run_cctyper): {output}/cctyper/, enriches all_prots + all_gff
- ncrna (run_ncrna): {output}/ncrna/results.txt, results.sto, enriches all_gff
- genomad (run_genomad): {output}/genomad/, enriches all_gff
VIZ OUTPUTS (write_viz_outputs) Expects: all_gff, all_neigh, all_prots, den_data, tree_str, optional extras Generates:
- {output}/hoodini-viz/parquet/*.parquet (gff, hoods, protein_metadata, tree_metadata, nucleotide_links, protein_links, domains, domains_metadata, ncrna_metadata)
- {output}/hoodini-viz/tsv/*.txt (corresponding TSV files)
- {output}/hoodini-viz/tree.nwk
- {output}/hoodini-viz/hoodini-viz.html (standalone viewer with embedded data)