Subcommands

`subcommand_predict(hypotheticals, cds_dict, output, prefix, cpu, omit_probs, model_dir, model_name, checkpoint_path, batch_size, save_per_residue_embeddings, save_per_protein_embeddings, threads, mask_threshold, has_duplicate_locus, gpus=None)`

Wrapper command for baktfold predict. Predicts embeddings using ProstT5 encoder + CNN prediction head.

Parameters:

Name	Type	Description	Default
`hypotheticals`	`Dict[str, any]`	feature dict for all Bakta hypothetical proteins	required
`cds_dict`	`Dict[str, any]`	id:aa dictionary	required
`output`	`Path`	Output directory path.	required
`prefix`	`str`	Prefix for output file names.	required
`cpu`	`bool`	Flag indicating whether to use CPU for prediction.	required
`omit_probs`	`bool`	Flag indicating whether to omit prediction probabilities from ProstT5.	required
`model_dir`	`Path`	Directory containing the ProstT5 model.	required
`model_name`	`str`	Name of the ProstT5 model.	required
`checkpoint_path`	`Path`	Path to ProstT5 CNN checkpoint.	required
`batch_size`	`int`	Batch size for prediction.	required
`proteins_flag`	`bool`	True if baktfold proteins-predict, false otherwise	required
`save_per_residue_embeddings`	`bool`	Whether to save per residue embeddings to h5 file.	required
`save_per_protein_embeddings`	`bool`	Whether to save mean per protein embeddings to h5 file.	required

Returns:

Name	Type	Description
`hypotheticals`	`Dict[str, any]`	feature dict for all Bakta hypothetical proteins. Updated with ProstT5 3Di strings (unmasked)

Source code in src/baktfold/subcommands/predict.py

def subcommand_predict(
    hypotheticals: dict,
    cds_dict: dict,
    output: Path,
    prefix: str,
    cpu: bool,
    omit_probs: bool,
    model_dir: Path,
    model_name: str,
    checkpoint_path: Path,
    batch_size: int,
    save_per_residue_embeddings: bool,
    save_per_protein_embeddings: bool,
    threads: int,
    mask_threshold: float,
    has_duplicate_locus: bool,
    gpus: Optional[str] = None,
) -> dict:
    """
    Wrapper command for baktfold predict. Predicts embeddings using ProstT5 encoder + CNN prediction head.

    Runs ``get_embeddings`` and then writes ``{prefix}_aa.fasta`` into ``output``,
    containing every sequence of ``cds_dict`` after low-confidence masking.

    Args:
        hypotheticals (Dict[str, any]): feature dict for all Bakta hypothetical proteins
        cds_dict (Dict[str, any]): id:aa dictionary
        output (Path): Output directory path.
        prefix (str): Prefix for output file names.
        cpu (bool): Flag indicating whether to use CPU for prediction. Half precision
            is requested unless this is exactly ``True``.
        omit_probs (bool): Flag indicating whether to omit prediction probabilities from ProstT5.
        model_dir (Path): Directory containing the ProstT5 model.
        model_name (str): Name of the ProstT5 model.
        checkpoint_path (Path): Path to ProstT5 CNN checkpoint.
        batch_size (int): Batch size for prediction (passed on as ``max_batch``).
        save_per_residue_embeddings (bool): Whether to save per residue embeddings to h5 file.
        save_per_protein_embeddings (bool): Whether to save mean per protein embeddings to h5 file.
        threads (int): Passed through to ``get_embeddings``.
        mask_threshold (float): Confidence threshold as a percentage. It is passed
            unchanged to ``get_embeddings`` and, divided by 100, used as the
            ``threshold`` for ``mask_low_confidence_aa``.
        has_duplicate_locus (bool): Passed through to ``get_embeddings``.
        gpus (Optional[str]): Passed through to ``get_embeddings``.

    Returns:
        hypotheticals (Dict[str, any]): the ``hypotheticals`` object that was passed in.
            This wrapper does not modify it itself; any update (e.g. with 3Di
            strings) would have to happen inside ``get_embeddings``.
    """

    logger.info('Predicting 3Di sequences using ProstT5')

    out_dir = Path(output)
    fasta_aa: Path = out_dir / f"{prefix}_aa.fasta"
    fasta_3di: Path = out_dir / f"{prefix}_3di.fasta"
    # embeddings h5 - will only be generated if the matching flag is true
    output_h5_per_residue: Path = out_dir / f"{prefix}_embeddings_per_residue.h5"
    output_h5_per_protein: Path = out_dir / f"{prefix}_embeddings_per_protein.h5"

    # half precision is switched off only when cpu is exactly True
    half_precision = cpu is not True
    output_probs = not omit_probs

    prediction_dict = get_embeddings(
        hypotheticals,
        cds_dict,
        output,
        prefix,
        model_dir,
        model_name,
        checkpoint_path,
        fasta_3di,
        output_h5_per_residue,
        output_h5_per_protein,
        half_precision=half_precision,
        max_residues=5000,
        max_seq_len=1000,
        max_batch=batch_size,
        cpu=cpu,
        output_probs=output_probs,
        save_per_residue_embeddings=save_per_residue_embeddings,
        save_per_protein_embeddings=save_per_protein_embeddings,
        threads=threads,
        mask_threshold=mask_threshold,
        has_duplicate_locus=has_duplicate_locus,
        gpus=gpus,
    )

    # mask_threshold is a percentage; the masking helper takes a 0-1 proportion
    mask_prop_threshold = mask_threshold/100

    # check all the lengths of the predictions are >0 in case of OOMs and filter out those that aren't
    prediction_dict = {
        cds_id: prediction
        for cds_id, prediction in prediction_dict.items()
        if len(prediction[0]) > 0
    }

    # write the (masked) AA CDS to file
    with open(fasta_aa, "w") as out_f:
        for cds_id, prot_seq in cds_dict.items():
            try:
                # prediction_dict[cds_id][2]: ProstT5 confidence scores 0-1
                prot_seq = mask_low_confidence_aa(prot_seq, prediction_dict[cds_id][2], threshold=mask_prop_threshold)
            except (KeyError, IndexError):
                # no usable prediction / confidence for this CDS: mask it entirely
                prot_seq = "X" * len(prot_seq)

            out_f.write(f">{cds_id}\n{prot_seq}\n")

    return hypotheticals

`subcommand_compare(hypotheticals, output, threads, evalue, sensitivity, database, prefix, predictions_dir, structures, structure_dir, logdir, proteins_flag, max_seqs, ultra_sensitive, extra_foldseek_params, custom_db, foldseek_gpu, custom_annotations, has_duplicate_locus, fast, gpus=None)`

Compare 3Di or PDB structures to the baktfold DB

Parameters:

Name	Type	Description	Default
`hypotheticals`	`Dict`	hypothetical features dictionary	required
`output`	`Path`	Path to the output directory.	required
`threads`	`int`	Number of threads to use.	required
`evalue`	`float`	E-value threshold.	required
`card_vfdb_evalue`	`float`	E-value threshold for CARD and VFDB databases.	required
`sensitivity`	`float`	Sensitivity threshold.	required
`database`	`Path`	Path to the reference database.	required
`prefix`	`str`	Prefix for output files.	required
`predictions_dir`	`Optional[Path]`	Path to the directory containing predictions.	required
`structures`	`bool`	Flag indicating whether structures files are used.	required
`structure_dir`	`Optional[Path]`	Path to the directory containing structures (.pdb or .cif) files.	required
`logdir`	`Path`	Path to the directory for log files.	required
`proteins_flag`	`bool`	Flag indicating whether proteins are used.	required
`max_seqs`	`int`	Maximum results per query sequence allowed to pass the prefilter for foldseek.	required
`ultra_sensitive`	`bool`	Whether to skip foldseek prefilter for maximum sensitivity	required
`extra_foldseek_params`	`str`	Extra foldseek search parameters	required
`custom_db`	`str`	Custom foldseek database	required
`foldseek_gpu`	`bool`	Use Foldseek-GPU acceleration and ungappedprefilter	required
`custom_annotations`	`Optional[Path]`	Path to the tsv containing the custom_db annotations, 2 columns	required
`has_duplicate_locus`	`bool`	If same locus tag has multiple annots (can happen in some euks)	required
`fast`	`bool`	If true, skips AFDB search	required

Returns:

Name	Type	Description
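`hypotheticals`	`Dict`	The hypothetical features, updated with the Foldseek top hits (Swiss-Prot, AFDB Clusters unless `fast`, PDB, CATH and any custom database) and their looked-up annotations.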

Source code in src/baktfold/subcommands/compare.py

def subcommand_compare(
    hypotheticals: Dict,
    output: Path,
    threads: int,
    evalue: float,
    sensitivity: float,
    database: Path,
    prefix: str,
    predictions_dir: Optional[Path],
    structures: bool,
    structure_dir: Optional[Path],
    logdir: Path,
    proteins_flag: bool,
    max_seqs: int,
    ultra_sensitive: bool,
    extra_foldseek_params: str,
    custom_db: str,
    foldseek_gpu: bool,
    custom_annotations: Optional[Path],
    has_duplicate_locus: bool,
    fast: bool,
    gpus: Optional[str] = None,
) -> Dict:
    """
    Compare 3Di or PDB structures to the baktfold DB

    Builds a Foldseek query database (from predicted 3Di or from structure
    files), searches it against swissprot, AFDBClusters (unless ``fast``), pdb
    and cath plus an optional custom database, writes the top-hit tables to
    ``output`` and merges the hits and their annotations into the features.

    Parameters:
        hypotheticals (Dict):  hypothetical features dictionary. It is iterated here
            as a collection of per-feature records with ``id``, ``locus`` and ``aa`` keys.
        output (Path): Path to the output directory.
        threads (int): Number of threads to use.
        evalue (float): E-value threshold.
        sensitivity (float): Sensitivity threshold.
        database (Path): Path to the reference database.
        prefix (str): Prefix for output files.
        predictions_dir (Optional[Path]): Path to the directory containing predictions.
            Only read when ``structures`` is False.
        structures (bool): Flag indicating whether structures files are used.
        structure_dir (Optional[Path]): Path to the directory containing structures (.pdb or .cif) files.
        logdir (Path): Path to the directory for log files.
        proteins_flag (bool): Flag indicating whether proteins are used. When True,
            ``~PIPE~`` placeholders in ids/loci are converted back to ``|`` and hits
            are parsed with ``has_duplicate_locus=False``.
        max_seqs (int): Maximum results per query sequence allowed to pass the prefilter for foldseek.
        ultra_sensitive (bool): Whether to skip foldseek prefilter for maximum sensitivity
        extra_foldseek_params (str): Extra foldseek search parameters
        custom_db (str): Custom foldseek database
        foldseek_gpu (bool): Use Foldseek-GPU acceleration and ungappedprefilter
        custom_annotations (Optional[Path]): Path to the tsv containing the custom_db annotations, 2 columns
        has_duplicate_locus (bool): If same locus tag has multiple annots (can happen in some euks)
        fast (bool): If true, skips AFDB search
        gpus (Optional[str]): Passed through to every ``run_foldseek_search`` call.
    Returns:
        Dict: the features passed in as ``hypotheticals`` after ``pstc.parse``,
            ``pstc.lookup_sql`` and (if ``custom_annotations`` is given)
            ``pstc.lookup_custom``. The annotation mirrors the parameter's; the
            concrete type is whatever those ``pstc`` helpers return.
    """

    # input predictions or structures
    if structures is False:
        # prostT5
        fasta_aa_input: Path = Path(predictions_dir) / f"{prefix}_aa.fasta"
        fasta_3di_input: Path = Path(predictions_dir) / f"{prefix}_3di.fasta"

    fasta_aa: Path = Path(output) / f"{prefix}_aa.fasta"
    fasta_3di: Path = Path(output) / f"{prefix}_3di.fasta"

    # Copy the AA and 3Di FASTAs from the predictions directory when working
    # from ProstT5 predictions. Files already present in the output directory
    # are left untouched.
    if structures is False:
        if fasta_3di_input.exists():
            logger.info(
                f"Checked that the 3Di CDS file {fasta_3di_input} exists from baktfold predict"
            )
            if fasta_3di.exists() is False:
                with atomic_write_path(fasta_3di) as tmp:
                    shutil.copyfile(fasta_3di_input, tmp)
        else:
            logger.error(
                f"The 3Di CDS file {fasta_3di_input} does not exist. Please run baktfold predict and/or check the prediction directory {predictions_dir}"
            )
        # copy the aa to file
        if fasta_aa_input.exists():
            logger.info(
                f"Checked that the AA CDS file {fasta_aa_input} exists from baktfold predict."
            )
            if fasta_aa.exists() is False:
                with atomic_write_path(fasta_aa) as tmp:
                    shutil.copyfile(fasta_aa_input, tmp)
        else:
            logger.error(
                f"The AA CDS file {fasta_aa_input} does not exist. Please run baktfold predict and/or check the prediction directory {predictions_dir}"
            )

    # write the AAs to file if structures is true because can't just copy from prediction_dir
    else:
        logger.info(f"Writing the AAs to file {fasta_aa}.")

        with atomic_write_path(fasta_aa) as tmp_fasta, open(tmp_fasta, "w") as out_f:
            for entry in hypotheticals:
                seq_id = entry['id'] if has_duplicate_locus else entry['locus']
                out_f.write(f">{seq_id}\n{entry['aa']}\n")

    ############
    # create foldseek db
    ############

    foldseek_query_db_path: Path = Path(output) / "foldseek_db"
    foldseek_query_db_path.mkdir(parents=True, exist_ok=True)

    if structures is True:
        logger.info("Creating a foldseek query database from structures.")

        generate_foldseek_db_from_structures(
            fasta_aa,
            foldseek_query_db_path,
            structure_dir,
            logdir,
            prefix,
            proteins_flag,
        )
    else:
        generate_foldseek_db_from_aa_3di(
            fasta_aa, fasta_3di, foldseek_query_db_path, logdir, prefix
        )

    ############
    # foldseek searches
    ############

    short_db_name = prefix
    query_db: Path = Path(foldseek_query_db_path) / short_db_name
    result_db_base: Path = Path(output) / "result_db"
    temp_db: Path = Path(output) / "temp_db"

    def _run_search(database_name: str, result_db_name: str) -> tuple:
        """Search the query DB against one bundled database; return (target_db, result_db)."""
        if short_db_name == database_name:
            logger.error(
                f"Please choose a different -p {prefix} as this conflicts with the {database_name}"
            )

        target_db: Path = Path(database) / database_name

        # (re)create result and temp dirs before every search
        result_db_base.mkdir(parents=True, exist_ok=True)
        temp_db.mkdir(parents=True, exist_ok=True)
        result_db: Path = result_db_base / result_db_name

        run_foldseek_search(
            query_db,
            target_db,
            result_db,
            temp_db,
            threads,
            logdir,
            evalue,
            sensitivity,
            max_seqs,
            ultra_sensitive,
            extra_foldseek_params,
            foldseek_gpu,
            structures,
            gpus=gpus,
        )
        return target_db, result_db

    def _search_tophit(database_name: str, result_db_name: str, tsv_name: str):
        """Search one bundled database, dump the hits to a tsv and return its top hits."""
        target_db, result_db = _run_search(database_name, result_db_name)
        result_tsv: Path = Path(output) / tsv_name
        create_result_tsv(query_db, target_db, result_db, result_tsv, logdir, foldseek_gpu, structures, threads)
        return get_tophit(result_tsv, structures, cath=False)

    # Swiss-Prot
    swissprot_df = _search_tophit("swissprot", "result_db", "foldseek_results_swissprot.tsv")

    # AFDB Clusters: searched by default, skipped when fast is set
    if not fast:
        afdbclusters_df = _search_tophit(
            "AFDBClusters", "result_afdb_db", "foldseek_results_afdb_clusters.tsv"
        )
    else:
        logger.info("Skipping AFDB Clusters search as --fast specified.")

    # PDB
    pdb_df = _search_tophit("pdb", "result_pdb_db", "foldseek_results_pdb.tsv")

    # CATH
    cath_target_db, cath_result_db = _run_search("cath", "result_cath_db")
    result_db_greedy_best_hits: Path = result_db_base / "result_cath_db_greedy_best_hits"
    cath_result_tsv: Path = Path(output) / "foldseek_results_cath.tsv"
    result_greedy_tsv: Path = Path(output) / "foldseek_results_cath_greedy_tophit"

    # this keeps the greedy best hits for cath
    # we actually don't keep the single tophit - multidomain/fold proteins should have multiple non-overlapping CATH hits
    # this is equivalent to using --greedy-best-hits with foldseek easy-search
    summarise_hits(cath_result_db, result_db_greedy_best_hits, logdir, threads)

    # saves all CATH hits first
    create_result_tsv(query_db, cath_target_db, cath_result_db, cath_result_tsv, logdir, foldseek_gpu, structures, threads)
    # save greedy CATH tophits
    create_result_tsv(query_db, cath_target_db, result_db_greedy_best_hits, result_greedy_tsv, logdir, foldseek_gpu, structures, threads)

    # this just reads it in with appropriate headers
    cath_df = get_tophit(result_greedy_tsv, structures, cath=True)

    # write tophits
    swissprot_tophit_path: Path = Path(output) / "baktfold_swissprot_tophit.tsv"
    io.write_foldseek_tophit(swissprot_df, swissprot_tophit_path)

    if not fast:
        afdb_tophit_path: Path = Path(output) / "baktfold_afdbclusters_tophit.tsv"
        io.write_foldseek_tophit(afdbclusters_df, afdb_tophit_path)

    pdb_tophit_path: Path = Path(output) / "baktfold_pdb_tophit.tsv"
    io.write_foldseek_tophit(pdb_df, pdb_tophit_path)

    cath_tophit_path: Path = Path(output) / "baktfold_cath_tophit.tsv"
    io.write_foldseek_tophit(cath_df, cath_tophit_path)
    # remove result_greedy_tsv (identical to tophit, will make it confusing)
    remove_file(result_greedy_tsv)

    #####
    # custom db
    #####

    # Stays None when no custom database is given, or when the custom search
    # fails and logger.error does not abort the run; the parse step below then
    # skips it instead of referencing an unbound name.
    custom_df = None

    if custom_db:

        try:

            logger.info(f"Foldseek will also be run against your custom database {custom_db}")
            result_db_custom: Path = Path(result_db_base) / "result_db_custom"
            result_tsv_custom: Path = Path(output) / "foldseek_results_custom.tsv"

            run_foldseek_search(
                query_db,
                Path(custom_db),
                result_db_custom,
                temp_db,
                threads,
                logdir,
                evalue,
                sensitivity,
                max_seqs,
                ultra_sensitive,
                extra_foldseek_params,
                foldseek_gpu,
                structures,
                gpus=gpus,
            )

            create_result_tsv(query_db, Path(custom_db),
                result_db_custom,
                result_tsv_custom, logdir, foldseek_gpu, structures, threads)

            custom_df = get_tophit(result_tsv_custom,structures, cath=False)

            custom_db_tophit_path: Path = Path(output) / "baktfold_custom_db_tophit.tsv"
            io.write_foldseek_tophit(custom_df, custom_db_tophit_path)

        except Exception as e:
            logger.error(f"Foldseek failed to run against your custom database {custom_db}. Please check that it is formatted correctly as a Foldseek database: {e}")

    ####
    # convert pipe back - proteins
    # in theory, from JSON, the pipe shouldn't happen as we only take the ID.
    ####

    # Header prefixes that foldseek/mmseqs parses specially, from
    # https://github.com/steineggerlab/foldseek/blob/8dc75c74ad0eddab73cfd905963d13bf74dc012b/lib/mmseqs/src/commons/Util.cpp#L142
    # const struct Databases databases[] = {
    #         { "uc",   2, 0}, // Uniclust
    #         { "cl|",   3, 1},
    #         { "sp|",   3, 1}, // Swiss prot
    #         { "tr|",   3, 1}, // trembl
    #         { "gb|",   3, 1}, // GenBank
    #         { "ref|",  4, 1}, // NCBI Reference Sequence
    #         { "pdb|",  4, 1}, // Brookhaven Protein Data Bank
    #         { "bbs|",  4, 1}, // GenInfo Backbone Id
    #         { "lcl|",  4, 1}, // Local Sequence identifier
    #         { "pir||", 5, 1}, // NBRF PIR
    #         { "prf||", 5, 1}, // Protein Research Foundation
    #         { "gnl|",  4, 2}, // General database identifier
    #         { "pat|",  4, 2}, // Patents
    #         { "gi|",   3, 3}  // NCBI GI
    # };

    if proteins_flag:

        pipe_warning_logged = False

        restored = []
        for record in hypotheticals:
            # warn once, on the first id that carries the placeholder
            if not pipe_warning_logged and "~PIPE~" in record["id"]:
                pipe_warning_logged = True
                logger.warning("At least one input protein has | in the header. Note these will be replaced with ~PIPE~ in the raw foldseek output if you chose --keep-tmp-files")
            record["id"] = record["id"].replace("~PIPE~", "|")
            record["locus"] = record["locus"].replace("~PIPE~", "|")
            restored.append(record)

        hypotheticals = restored

        # overwrite the output aa and 3dis with | if they were in the input
        if pipe_warning_logged:
            replace_pipe_in_fasta(fasta_aa)
            if not structures:
                replace_pipe_in_fasta(fasta_3di)

    ####
    # lookup
    ####

    # top-hit tables to merge, in the order they are applied
    hit_tables = [(swissprot_df, 'swissprot')]
    if not fast:
        hit_tables.append((afdbclusters_df, 'afdb'))
    hit_tables.append((pdb_df, 'pdb'))
    hit_tables.append((cath_df, 'cath'))
    if custom_df is not None:
        hit_tables.append((custom_df, 'custom_db'))

    if proteins_flag:  # baktfold proteins
        # note aas passed as hypotheticals to the overall function - so in and out as aas
        for hits_df, db_label in hit_tables:
            hypotheticals = pstc.parse(hypotheticals, hits_df, db_label, has_duplicate_locus=False)
    else:  # baktfold run
        # add the tophits to the json
        for hits_df, db_label in hit_tables:
            hypotheticals = pstc.parse(hypotheticals, hits_df, db_label, has_duplicate_locus)

    # get the lookup descriptions for each of them - this requires the DB
    hypotheticals = pstc.lookup_sql(hypotheticals, Path(database), threads)
    # add the custom annotations if provided
    if custom_annotations:
        hypotheticals = pstc.lookup_custom(hypotheticals, Path(database), custom_annotations)

    return hypotheticals