hoodini.pipeline

from hoodini.pipeline.runner import run_pipeline

__all__ = ["run_pipeline"]
def run_pipeline(config: RuntimeConfig) -> None:
    """Execute the hoodini workflow using the provided config.

    Pipeline Stages and File I/O:
    ==============================

    1. INITIALIZATION (initialize_inputs)
        Expects: config.input_path OR config.inputsheet
        Generates:
        - {output}/ (creates output directory)
        Returns: records DataFrame

    2. IPG PARSING (run_ipg)
        Expects: records DataFrame
        Generates: (no files, enriches records with IPG data)
        Returns: enriched records DataFrame

    3. ASSEMBLY PARSING (run_assembly_parser)
        Expects: records DataFrame, assembly_summary.parquet (packaged data)
        Generates:
        - {output}/assembly_list.txt
        - {output}/assembly_folder/{GCA_*}/*.fna, *.gff
        - {output}/all_neigh.tsv
        - {output}/neighborhood/neighborhoods.fasta
        - {output}/temp.gff
        Returns: all_gff, all_prots, all_neigh DataFrames + valid_uids

    4. PROTEIN COMPARISONS (run_protein_links) [if aai_tree or prot_links]
        Expects: all_prots DataFrame
        Generates:
        - {output}/pairwise_aa.tsv
        Returns: pairwise_aa DataFrame

    5. NUCLEOTIDE COMPARISONS (run_pairwise_nt) [if ani_tree or nt_links]
        Expects: all_neigh, all_gff DataFrames
        Generates:
        - {output}/ani_matrix.tsv (if ani_mode)
        - {output}/nt_links.tsv (if nt_links)
        Returns: pairwise_ani, nt_links DataFrames

    6. PROTEIN CLUSTERING (cluster_proteins)
        Expects: all_prots DataFrame
        Generates:
        - {output}/target_prots.fasta
        - {output}/target_prots.aln (if clust_method != 'none')
        Returns: all_prots with fam_cluster column

    7. PROTEOME SIMILARITY (run_proteome_similarity) [if aai_tree]
        Expects: all_prots, pairwise_aa, all_neigh, all_gff
        Generates:
        - {output}/aai_matrix.tsv
        Returns: pairwise_aai DataFrame

    8. TAXONOMY & TREE (parse_taxonomy_and_build_tree)
        Expects: records, all_gff, all_neigh, all_prots DataFrames
        Generates:
        - {output}/tree.nwk
        - {output}/records.csv
        Returns: tree_str, den_data DataFrame

    9. EXTRA ANNOTATIONS (optional tools)
        - domains (run_domain): {output}/domains.tsv
        - blast (run_blast): enriches all_gff
        - padloc (run_padloc): enriches all_prots
        - emapper (run_emapper): enriches all_prots
        - deffinder (run_defensefinder): enriches all_prots
        - cctyper (run_cctyper): {output}/cctyper/, enriches all_prots + all_gff
        - ncrna (run_ncrna): {output}/ncrna/results.txt, results.sto, enriches all_gff
        - genomad (run_genomad): {output}/genomad/, enriches all_gff

    10. VIZ OUTPUTS (write_viz_outputs)
        Expects: all_gff, all_neigh, all_prots, den_data, tree_str, optional extras
        Generates:
            - {output}/hoodini-viz/parquet/*.parquet (gff, hoods, protein_metadata,
            tree_metadata, nucleotide_links, protein_links, domains, domains_metadata,
            ncrna_metadata)
            - {output}/hoodini-viz/tsv/*.txt (corresponding TSV files)
            - {output}/hoodini-viz/tree.nwk
            - {output}/hoodini-viz/hoodini-viz.html (standalone viewer with embedded data)
    """

    stage_header("Initializing Hoodini", "🚀")

    # Stage imports are deferred so each stage's dependencies load lazily.
    from hoodini.pipeline.initialize import initialize_inputs

    records = initialize_inputs(
        input_path=config.input_path,
        inputsheet=config.inputsheet,
        output=config.output,
        force=config.force,
        remote_evalue=config.remote_evalue or 1e-5,
        remote_max_targets=config.remote_max_targets or 100,
    )

    stage_done("Initialization complete")

    stage_header("Parsing IPG data", "🔍")
    from hoodini.pipeline.parse_ipg import run_ipg

    records = run_ipg(
        records_df=records,
        cand_mode=config.cand_mode,
    )

    stage_done("IPG parsing complete")

    stage_header("Downloading and parsing assemblies", "📥")
    from hoodini.pipeline.parse_assemblies import run_assembly_parser

    result = run_assembly_parser(
        records_df=records,
        output_dir=config.output,
        assembly_folder=config.assembly_folder,
        ncrna=config.ncrna,
        cctyper=config.cctyper,
        genomad=config.genomad,
        blast=config.blast,
        apikey=config.apikey,
        max_concurrent_downloads=config.max_concurrent_downloads,
        num_threads=config.num_threads,
        mod=config.mod,
        wn=config.wn,
        sorfs=config.sorfs,
        minwin=config.minwin,
        minwin_type=config.minwin_type,
    )

    records = result["records"]
    all_gff = result["all_gff"]
    all_prots = result["all_prots"]
    all_neigh = result["all_neigh"]
    valid_uids = result["valid_uids"]

    stage_done("Assembly parsing and neighborhood extraction complete")

    # Abort early if nothing was extracted to avoid downstream errors.
    # hasattr guards against non-DataFrame placeholders; missing is_empty
    # is treated as "empty" so we fail safe.
    if (all_prots.is_empty() if hasattr(all_prots, "is_empty") else True) or (
        all_neigh.is_empty() if hasattr(all_neigh, "is_empty") else True
    ):
        from hoodini.utils.logging_utils import error

        error("No neighborhoods/proteins extracted; stopping before taxonomy/trees.")
        return

    if config.tree_mode == "aai_tree" or config.prot_links:
        stage_header("Running all-vs-all protein comparisons", "🦠")
        from hoodini.pipeline.protein_links import run_protein_links

        pairwise_aa = run_protein_links(
            output_dir=config.output,
            all_prots=all_prots,
            threads=config.num_threads,
            evalue=1e-5,
        )

        stage_done("All-vs-all protein comparisons complete")
    else:
        pairwise_aa = None

    if config.tree_mode == "ani_tree" or config.nt_links:
        stage_header("Running pairwise nucleotide comparisons", "🦠")
        from hoodini.pipeline.pairwise_nt import run_pairwise_nt

        pairwise_ani, nt_links = run_pairwise_nt(
            all_neigh=all_neigh,
            all_gff=all_gff,
            output_dir=config.output,
            nt_aln_mode=config.nt_aln_mode,
            ani_mode=config.ani_mode,
            nt_links=bool(config.nt_links),
            threads=config.num_threads,
        )

        stage_done("Pairwise nucleotide comparisons complete")
    else:
        pairwise_ani = None
        nt_links = None

    stage_header("Clustering neighbor proteins", "✨")
    from hoodini.pipeline.cluster_proteins import cluster_proteins

    all_prots = cluster_proteins(
        all_prots,
        output_dir=config.output,
        clust_method=config.clust_method,
        sorfs=config.sorfs,
    )

    if config.sorfs:
        # Drop sORF predictions that failed to cluster (fam_cluster is null),
        # removing them from both the protein table and the GFF annotations.
        # NOTE: polars DataFrames are immutable — use filter/is_in, not
        # pandas-style boolean indexing or item assignment.
        discarded_sorfs = all_prots.filter(
            pl.col("id").str.contains("sORF") & pl.col("fam_cluster").is_null()
        )
        discarded_ids = discarded_sorfs.get_column("id").unique()
        discarded_gff_ids = (
            discarded_sorfs.select((pl.lit("ID=") + pl.col("id")).alias("gff_id"))
            .get_column("gff_id")
            .unique()
        )
        all_prots = all_prots.filter(~pl.col("id").is_in(discarded_ids))
        all_gff = all_gff.filter(~pl.col("attributes").is_in(discarded_gff_ids))

    stage_done("Clustering complete")

    if config.tree_mode == "aai_tree":
        from hoodini.pipeline.proteome_similarity import run_proteome_similarity

        stage_header("Computing proteome similarity", "🔗")
        pairwise_aai = run_proteome_similarity(
            all_prots=all_prots,
            pairwise_aa=pairwise_aa,
            all_neigh=all_neigh,
            all_gff=all_gff,
            outdir=config.output,
            mode=config.aai_mode,
            pident_min=config.min_pident,
            subset_mode="target_region",
            win=config.wn,
            win_mode=(
                config.mod if hasattr(config, "mod") and config.mod is not None else "win_nts"
            ),
            num_threads=config.num_threads,
        )

        stage_done("Proteome similarity complete")
    else:
        pairwise_aai = None

    stage_header("Extracting taxonomic information", "🦠")
    from hoodini.pipeline.taxonomy import parse_taxonomy_and_build_tree

    tree_str, den_data = parse_taxonomy_and_build_tree(
        records=records,
        all_gff=all_gff,
        all_neigh=all_neigh,
        all_prots=all_prots,
        output_dir=config.output,
        tree_mode=config.tree_mode,
        tree_file=config.tree_file,
        num_threads=config.num_threads,
        valid_uids=valid_uids,
        aai_mode=config.aai_mode,
        ani_mode=config.ani_mode,
        aai_subset_mode=config.aai_subset_mode,
        nj_algorithm=config.nj_algorithm,
        pairwise_ani=pairwise_ani,
        pairwise_aai=pairwise_aai,
    )

    domains_data = None
    ncrna_data = None

    if config.domains:
        from hoodini.extra_tools.domain import run_domain

        domains_data = run_domain(all_prots, config.output, config.domains, config.num_threads)

    if config.blast:
        from hoodini.extra_tools.blast import run_blast

        blast_data = run_blast(
            all_neigh, config.output, config.blast, config.num_threads, valid_uids
        )
        if blast_data.height > 0:
            # Project blast hits into GFF rows (same shape as the cctyper /
            # ncrna / genomad blocks below).
            gff_df = blast_data.select(
                [
                    pl.col("seqid"),
                    pl.lit("hoodini").alias("source"),
                    pl.lit("region").alias("type"),
                    pl.col("start"),
                    pl.col("end"),
                    pl.lit(".").alias("score"),
                    pl.lit("+").alias("strand"),
                    pl.lit(".").alias("phase"),
                    (pl.lit("ID=") + pl.col("nc_feature") + pl.lit(";")).alias("attributes"),
                ]
            )
            all_gff = pl.concat([all_gff, gff_df], how="vertical")

    if config.padloc:
        from hoodini.extra_tools.padloc import run_padloc

        padloc_df = run_padloc(all_gff, all_prots, config.output, config.num_threads)
        if padloc_df.height > 0:
            all_prots = all_prots.join(padloc_df, on="id", how="left")

    if config.emapper:
        from hoodini.extra_tools.emapper import run_emapper

        emapper_df = run_emapper(all_prots, config.output, config.num_threads)

        if emapper_df.height > 0:
            # Backfill missing/blank products from emapper descriptions
            # before the full annotation join.
            if "description" in emapper_df.columns and "product" in all_prots.columns:
                desc_df = emapper_df.select(
                    pl.col("id"), pl.col("description").alias("_emapper_desc")
                )
                all_prots = (
                    all_prots.join(desc_df, on="id", how="left")
                    .with_columns(
                        pl.when(
                            pl.col("product").is_not_null()
                            & (pl.col("product").cast(pl.Utf8).str.strip_chars() != "")
                        )
                        .then(pl.col("product"))
                        .otherwise(pl.col("_emapper_desc"))
                        .alias("product")
                    )
                    .drop("_emapper_desc")
                )

            if "id" in emapper_df.columns and "id" in all_prots.columns:
                all_prots = all_prots.join(emapper_df, on="id", how="left")

    if config.deffinder:
        from hoodini.extra_tools.defensefinder import run_defensefinder

        deffinder_df = run_defensefinder(all_gff, all_prots, config.output)
        if deffinder_df.height > 0:
            all_prots = all_prots.join(deffinder_df, on="id", how="left")

    if config.cctyper:
        from hoodini.extra_tools.cctyper import run_cctyper

        cctyper_df, crispr_df = run_cctyper(
            all_gff, all_prots, all_neigh, config.output, config.num_threads, valid_uids
        )
        if cctyper_df.height > 0:
            all_prots = all_prots.join(cctyper_df, on="id", how="left")
        if crispr_df.height > 0:
            gff_df = crispr_df.select(
                [
                    pl.col("seqid"),
                    pl.lit("hoodini").alias("source"),
                    pl.lit("region").alias("type"),
                    pl.col("start"),
                    pl.col("end"),
                    pl.lit(".").alias("score"),
                    pl.lit(".").alias("strand"),
                    pl.lit(".").alias("phase"),
                    (pl.lit("ID=") + pl.col("nc_feature") + pl.lit(";")).alias("attributes"),
                ]
            )
            all_gff = pl.concat([all_gff, gff_df], how="vertical")

    if config.ncrna:
        from hoodini.extra_tools.ncrna import run_ncrna

        ncrna_data = run_ncrna(all_neigh, den_data, config.output, config.num_threads, valid_uids)
        if ncrna_data.height > 0:
            gff_df = ncrna_data.select(
                [
                    pl.col("nucid").alias("seqid"),
                    pl.lit("hoodini").alias("source"),
                    pl.lit("ncRNA").alias("type"),
                    # start/end may come back reversed for minus-strand hits;
                    # GFF requires start <= end.
                    pl.min_horizontal([pl.col("start"), pl.col("end")]).alias("start"),
                    pl.max_horizontal([pl.col("start"), pl.col("end")]).alias("end"),
                    pl.lit(".").alias("score"),
                    pl.col("strand_ncrna").alias("strand"),
                    pl.lit(".").alias("phase"),
                    (pl.lit("ID=") + pl.col("nc_feature") + pl.lit(";")).alias("attributes"),
                ]
            )
            all_gff = pl.concat([all_gff, gff_df], how="vertical")

    if config.genomad:
        from hoodini.extra_tools.genomad import run_genomad

        genomad_df = run_genomad(all_neigh, config.output, config.num_threads, valid_uids)
        if genomad_df.height > 0:
            gff_df = genomad_df.select(
                [
                    pl.col("seqid"),
                    pl.lit("hoodini").alias("source"),
                    pl.lit("region").alias("type"),
                    pl.min_horizontal([pl.col("start"), pl.col("end")]).alias("start"),
                    pl.max_horizontal([pl.col("start"), pl.col("end")]).alias("end"),
                    pl.lit(".").alias("score"),
                    # "." (unstranded) — GFF3 only allows +, -, . or ? here.
                    pl.lit(".").alias("strand"),
                    pl.lit(".").alias("phase"),
                    (pl.lit("ID=") + pl.col("mge_type") + pl.lit(";")).alias("attributes"),
                ]
            )
            all_gff = pl.concat([all_gff, gff_df], how="vertical")

    stage_done("Extra annotation complete")

    from hoodini.pipeline.write_data import write_viz_outputs

    write_viz_outputs(
        output_dir=config.output,
        all_gff=all_gff,
        all_neigh=all_neigh,
        all_prots=all_prots,
        den_data=den_data,
        tree_str=tree_str,
        nt_links=nt_links,
        pairwise_aa=pairwise_aa,
        domains_data=domains_data,
        write_domains=bool(config.domains),
        ncrna_data=ncrna_data,
    )

Execute the hoodini workflow using the provided config.

Pipeline Stages and File I/O:

  1. INITIALIZATION (initialize_inputs) Expects: config.input_path OR config.inputsheet Generates:

    • {output}/ (creates output directory) Returns: records DataFrame
  2. IPG PARSING (run_ipg) Expects: records DataFrame Generates: (no files, enriches records with IPG data) Returns: enriched records DataFrame

  3. ASSEMBLY PARSING (run_assembly_parser) Expects: records DataFrame, assembly_summary.parquet (packaged data) Generates:

    • {output}/assembly_list.txt
    • {output}/assembly_folder/{GCA_*}/*.fna, *.gff
    • {output}/all_neigh.tsv
    • {output}/neighborhood/neighborhoods.fasta
    • {output}/temp.gff Returns: all_gff, all_prots, all_neigh DataFrames + valid_uids
  4. PROTEIN COMPARISONS (run_protein_links) [if aai_tree or prot_links] Expects: all_prots DataFrame Generates:

    • {output}/pairwise_aa.tsv Returns: pairwise_aa DataFrame
  5. NUCLEOTIDE COMPARISONS (run_pairwise_nt) [if ani_tree or nt_links] Expects: all_neigh, all_gff DataFrames Generates:

    • {output}/ani_matrix.tsv (if ani_mode)
    • {output}/nt_links.tsv (if nt_links) Returns: pairwise_ani, nt_links DataFrames
  6. PROTEIN CLUSTERING (cluster_proteins) Expects: all_prots DataFrame Generates:

    • {output}/target_prots.fasta
    • {output}/target_prots.aln (if clust_method != 'none') Returns: all_prots with fam_cluster column
  7. PROTEOME SIMILARITY (run_proteome_similarity) [if aai_tree] Expects: all_prots, pairwise_aa, all_neigh, all_gff Generates:

    • {output}/aai_matrix.tsv Returns: pairwise_aai DataFrame
  8. TAXONOMY & TREE (parse_taxonomy_and_build_tree) Expects: records, all_gff, all_neigh, all_prots DataFrames Generates:

    • {output}/tree.nwk
    • {output}/records.csv Returns: tree_str, den_data DataFrame
  9. EXTRA ANNOTATIONS (optional tools)

    • domains (run_domain): {output}/domains.tsv
    • blast (run_blast): enriches all_gff
    • padloc (run_padloc): enriches all_prots
    • emapper (run_emapper): enriches all_prots
    • deffinder (run_defensefinder): enriches all_prots
    • cctyper (run_cctyper): {output}/cctyper/, enriches all_prots + all_gff
    • ncrna (run_ncrna): {output}/ncrna/results.txt, results.sto, enriches all_gff
    • genomad (run_genomad): {output}/genomad/, enriches all_gff
  10. VIZ OUTPUTS (write_viz_outputs) Expects: all_gff, all_neigh, all_prots, den_data, tree_str, optional extras Generates:

    • {output}/hoodini-viz/parquet/*.parquet (gff, hoods, protein_metadata, tree_metadata, nucleotide_links, protein_links, domains, domains_metadata, ncrna_metadata)
    • {output}/hoodini-viz/tsv/*.txt (corresponding TSV files)
    • {output}/hoodini-viz/tree.nwk
    • {output}/hoodini-viz/hoodini-viz.html (standalone viewer with embedded data)