Index

`calc_md5_sum(tarball_path, buffer_size=1024 * 1024)`

Calculate the MD5 checksum of the given file.

Parameters:

Name	Type	Description	Default
`tarball_path`	`Path`	The path to the file for which the MD5 checksum needs to be calculated.	required
`buffer_size`	`int`	The buffer size for reading the file.	`1024 * 1024`

Returns:

Name	Type	Description
`str`	`str`	The MD5 checksum of the file.

Source code in src/baktfold/databases/db.py

def calc_md5_sum(tarball_path: Path, buffer_size: int = 1024 * 1024) -> str:
    """
    Compute the hexadecimal MD5 digest of a file, reading it in chunks.

    Args:
        tarball_path (Path): The file whose contents are hashed.
        buffer_size (int): Number of bytes requested per read call.

    Returns:
        str: The hex-encoded MD5 digest of the file contents.
    """

    digest = hashlib.md5()
    with tarball_path.open("rb") as handle:
        # iter() with a b"" sentinel keeps reading until end of file.
        for chunk in iter(lambda: handle.read(buffer_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

`check_db_installation(db_dir, foldseek_gpu)`

Check if the baktfold database is installed.

Parameters:

Name	Type	Description	Default
`db_dir`	`Path`	The directory where the database is installed.	required
`foldseek_gpu`	`bool`	Whether to install foldseek-gpu compatible baktfold db	required

Returns:

Name	Type	Description
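`tuple`	`tuple[bool, bool]`	`(downloaded_flag, gpu_flag)`: `downloaded_flag` is True if all required database files are present; `gpu_flag` is True if the Foldseek-GPU files are present (or if `foldseek_gpu` is False, in which case they are not checked).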

Source code in src/baktfold/databases/db.py

def check_db_installation(db_dir: Path, foldseek_gpu: bool) -> "tuple[bool, bool]":
    """
    Check whether the baktfold database files are present in ``db_dir``.

    Args:
        db_dir (Path): The directory where the database is installed.
        foldseek_gpu (bool): Whether the Foldseek-GPU compatible database
            files should be checked as well.

    Returns:
        tuple[bool, bool]: ``(downloaded_flag, gpu_flag)``.
            ``downloaded_flag`` is True when every file named in
            ``BAKTFOLD_DB_NAMES`` exists in ``db_dir``.
            ``gpu_flag`` is True when every file named in
            ``baktfold_DB_FOLDSEEK_GPU_NAMES`` exists in ``db_dir``; it is
            also True when ``foldseek_gpu`` is False, because the GPU files
            are not checked in that case.
    """
    # Build the base path once instead of once per file.
    db_root = Path(db_dir)

    downloaded_flag = True
    for file_name in BAKTFOLD_DB_NAMES:
        path = db_root / file_name
        if not path.is_file():
            logger.warning(f"baktfold Database file {path} is missing")
            downloaded_flag = False
            # Only the first missing file is reported.
            break

    gpu_flag = True
    if foldseek_gpu:
        for file_name in baktfold_DB_FOLDSEEK_GPU_NAMES:
            path = db_root / file_name
            if not path.is_file():
                logger.warning(f"baktfold Foldseek-GPU Database file {path} is missing")
                gpu_flag = False
                break

    return downloaded_flag, gpu_flag

`check_prostT5_download(model_dir, model_name)`

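Check whether the ProstT5 model files are present and intact, to decide whether ProstT5 must be downloaded.

Args: `model_dir` (Path): Directory where the model and tokenizer are stored. `model_name` (str): Name of the pre-trained T5 model.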

Returns:

Name	Type	Description
`bool`	`bool`	bool to tell baktfold whether to download ProstT5

Source code in src/baktfold/databases/db.py

def check_prostT5_download(model_dir: Path, model_name: str) -> bool:
    """
    Decide whether the ProstT5 model needs to be (re-)downloaded.

    Every file listed in ``PROSTT5_MD5_DICTIONARY`` is looked up under
    ``model_dir`` and its MD5 checksum is compared with the expected value.

    Args:
        model_dir (Path): Directory where the model and tokenizer are stored.
        model_name (str): Name of the pre-trained T5 model. Only
            ``"Rostlab/ProstT5_fp16"`` is supported.

    Returns:
        bool: True if any expected file is missing or has a wrong MD5
        checksum (i.e. a download is required), False otherwise.

    Raises:
        ValueError: If ``model_name`` is not a supported model. Previously
            this case failed later with an ``UnboundLocalError``.
    """

    if model_name != "Rostlab/ProstT5_fp16":
        raise ValueError(
            f"Unsupported ProstT5 model name '{model_name}'. "
            "Expected 'Rostlab/ProstT5_fp16'."
        )

    model_sub_dir = "models--Rostlab--ProstT5_fp16"
    md5_dictionary = PROSTT5_MD5_DICTIONARY

    # assumes already has been downloaded
    download = False

    for key, expected_md5s in md5_dictionary.items():
        for nested_key, expected_md5 in expected_md5s.items():
            file_path = Path(f"{model_dir}/{model_sub_dir}/{key}/{nested_key}")

            if not file_path.exists():
                logger.warning(f"Model file {file_path} does not exist.")
                download = True
                continue

            md5_sum = calc_md5_sum(file_path)
            if md5_sum != expected_md5:
                logger.warning(
                    f"Corrupt model file {file_path}! MD5 should be '{expected_md5}' but is '{md5_sum}'"
                )
                download = True

    return download

`download(tarball_path, cache_dir)`

Download the baktfold database from HuggingFace.

Leaves the HF cache intact (copy, don't move) so subsequent installs can reuse the cached blob. shutil.move had three problems: 1. Broke HF cache integrity — moved the blob out from under the snapshots/<rev>/ symlink, making re-installs re-download. 2. Cross-device-unsafe — on shared storage, move falls back to copy-then-delete; an interrupt left both ends corrupt. 3. Non-atomic — an interrupted move left a half-written tarball. Fix: copy into a sibling temp via atomic_write_path; renamed onto tarball_path on success, cleaned up on any failure.

Source code in src/baktfold/databases/db.py

def download(tarball_path: Path, cache_dir: Path) -> None:
    """Fetch the baktfold database tarball from HuggingFace into ``tarball_path``.

    The HuggingFace cache under ``cache_dir`` is left untouched: the cached
    blob is copied, not moved, so later installs can reuse it. An earlier
    ``shutil.move`` based approach had three problems:
    1. It broke HF cache integrity by moving the blob out from under the
       ``snapshots/<rev>/`` symlink, forcing re-installs to re-download.
    2. It was cross-device-unsafe: on shared storage a move degrades to
       copy-then-delete, and an interrupt left both ends corrupt.
    3. It was non-atomic: an interrupted move left a half-written tarball.
    The copy therefore goes to a sibling temporary path provided by
    ``atomic_write_path``, which is renamed onto ``tarball_path`` on success
    and cleaned up on any failure.

    Args:
        tarball_path (Path): Destination of the tarball; its parent directory
            is created if missing.
        cache_dir (Path): Directory handed to ``hf_hub_download`` as its cache.
    """

    cached_file = hf_hub_download(
        cache_dir=f"{cache_dir}",
        filename="baktfold_db.tar.gz",
        repo_type="dataset",
        repo_id="gbouras13/baktfold-db",
    )

    # The returned path is a symlink under snapshots/<rev>/; follow it to the blob.
    blob_path = Path(cached_file).resolve()

    destination_dir = tarball_path.parent
    destination_dir.mkdir(parents=True, exist_ok=True)

    with atomic_write_path(tarball_path) as staging_path:
        shutil.copyfile(blob_path, staging_path)

    logger.info(f"Tarball saved to {tarball_path}")

`download_requests(db_url, tarball_path)`

Downloads a file from a given URL using the requests library.

Parameters:

Name	Type	Description	Default
`db_url`	`str`	The URL of the file to download.	required
`tarball_path`	`Path`	The path to save the downloaded file.	required

Returns:

Type	Description
	None

Examples:

>>> download_requests("https://zenodo.org/records/17347516/files/baktfold_db.tar.gz", Path("baktfold_db.tar.gz"))

Source code in src/baktfold/databases/db.py

def download_requests(db_url: str, tarball_path: Path):
    """
    Stream a file from a URL to disk with the requests library.

    The response body is written in 1 MiB chunks to the temporary path
    yielded by ``atomic_write_path`` while a progress bar is updated.

    Args:
      db_url (str): The URL of the file to download.
      tarball_path (Path): The path to save the downloaded file.

    Returns:
      None

    Raises:
      requests.exceptions.RequestException: Re-raised after logging when the
        request fails.

    Examples:
      >>> download_requests("https://zenodo.org/records/17347516/files/baktfold_db.tar.gz", Path("baktfold_db.tar.gz"))
    """

    request_headers = {
        "User-Agent": f"baktfold/{CURRENT_DB_VERSION} (contact: george.bouras@adelaide.edu.au)"
    }

    try:
        with requests.get(
            db_url,
            stream=True,
            headers=request_headers,
            timeout=_DOWNLOAD_TIMEOUT,
        ) as response:
            response.raise_for_status()

            # The progress bar gets total=None when the server sends no content-length.
            content_length = response.headers.get("content-length")
            expected_bytes = None if content_length is None else int(content_length)

            with atomic_write_path(tarball_path) as partial_path:
                with partial_path.open("wb") as sink:
                    with alive_bar(total=expected_bytes, scale="SI") as progress:
                        for chunk in response.iter_content(chunk_size=1024 * 1024):
                            sink.write(chunk)
                            progress(count=len(chunk))
    except requests.exceptions.RequestException:
        logger.error(
            f"ERROR: Could not download file from Zenodo! url={db_url}, path={tarball_path}"
        )
        raise

`download_zenodo_prostT5(model_dir, logdir, threads)`

Download the ProstT5 model from Zenodo

Parameters:

Name	Type	Description	Default
`model_dir`	`Path`	Directory where the ProstT5 backup tarball is downloaded and extracted.	required
`logdir`		Not used by the function body.	required
`threads`		Not used by the function body.	required

Source code in src/baktfold/databases/db.py

def download_zenodo_prostT5(model_dir, logdir, threads) -> None:
    """
    Download the ProstT5 model backup from Zenodo and extract it into ``model_dir``.

    The URL, expected MD5 and tarball name are read from
    ``VERSION_DICTIONARY[CURRENT_DB_VERSION]``. The tarball is downloaded into
    ``model_dir``, verified against the expected MD5, extracted, and removed
    after a successful extraction.

    Args:
        model_dir: Directory where the tarball is downloaded and extracted.
            It is created if it does not exist.
        logdir: Currently unused; kept for interface compatibility.
        threads: Currently unused; kept for interface compatibility.

    Raises:
        RuntimeError: If the MD5 of the downloaded tarball does not match.
            The bad file is renamed to ``<tarball>.corrupt``, or removed if
            the rename fails.
        OSError, tarfile.TarError: Re-raised after logging when extraction
            fails; the tarball is left in place in that case.
    """

    db_url = VERSION_DICTIONARY[CURRENT_DB_VERSION]["prostt5_backup_url"]
    requiredmd5 = VERSION_DICTIONARY[CURRENT_DB_VERSION]["prostt5_backup_md5"]

    logger.info(f"Downloading ProstT5 model backup from {db_url}")

    tarball = VERSION_DICTIONARY[CURRENT_DB_VERSION]["prostt5_backup_tarball"]
    tarball_path = Path(f"{model_dir}/{tarball}")
    # Same as download(): make sure the destination directory exists before
    # anything is written next to the tarball path.
    tarball_path.parent.mkdir(parents=True, exist_ok=True)
    download_requests(db_url, tarball_path)

    md5_sum = calc_md5_sum(tarball_path)

    if md5_sum == requiredmd5:
        logger.info(f"ProstT5 model backup file download OK: {md5_sum}")
    else:
        # Keep the bad download for inspection if possible, otherwise delete it.
        corrupt_path = tarball_path.with_suffix(tarball_path.suffix + ".corrupt")
        try:
            tarball_path.replace(corrupt_path)
        except OSError:
            try:
                tarball_path.unlink()
            except FileNotFoundError:
                pass
            corrupt_path = None
        raise RuntimeError(
            f"Corrupt ProstT5 model backup tarball: MD5 should be "
            f"'{requiredmd5}' but is '{md5_sum}'. "
            + (
                f"Bad file preserved at {corrupt_path} for inspection."
                if corrupt_path is not None
                else "Bad file removed."
            )
        )

    logger.info(
        f"Extracting ProstT5 model backup tarball: file={tarball_path}, output={model_dir}"
    )

    try:
        with tarball_path.open("rb") as fh_in, tarfile.open(
            fileobj=fh_in, mode="r:gz"
        ) as tar_file:
            _safe_extractall(tar_file, model_dir)

    except (OSError, tarfile.TarError) as e:
        logger.error(
            f"Could not extract ProstT5 tarball {tarball_path} to {model_dir}: "
            f"{type(e).__name__}: {e}. The tarball has been preserved for inspection."
        )
        raise

    # Reached only after a successful extraction.
    tarball_path.unlink()

`foldseek_makepaddedseqdb(db_dir)`

Runs the Foldseek makepaddedseqdb command on a given database directory.

Parameters:

Name	Type	Description	Default
`db_dir`	`Path`	The path to the database directory.	required

Returns:

Type	Description
`None`	None

Examples:

>>> foldseek_makepaddedseqdb(Path("baktfold_db"))

Source code in src/baktfold/databases/db.py

def foldseek_makepaddedseqdb(db_dir: Path) -> None:
    """
    Run ``foldseek makepaddedseqdb`` for each baktfold sub-database in ``db_dir``.

    For each of ``AFDBClusters``, ``pdb``, ``cath`` and ``swissprot`` the
    command is given ``<db_dir>/<name>`` and ``<db_dir>/<name>_gpu`` as its
    two path arguments; tool logs go to ``<db_dir>/logdir``.

    Args:
      db_dir (Path): The path to the database directory.

    Returns:
      None

    Examples:
      >>> foldseek_makepaddedseqdb(Path("baktfold_db"))
    """

    db_root = Path(db_dir)
    logdir = db_root / "logdir"

    for db_name in ("AFDBClusters", "pdb", "cath", "swissprot"):
        source_db = db_root / db_name
        padded_db = db_root / f"{db_name}_gpu"

        padding_job = ExternalTool(
            tool="foldseek",
            input="",
            output="",
            params=f"makepaddedseqdb {source_db} {padded_db}",
            logdir=logdir,
        )

        ExternalTool.run_tool(padding_job)

`install_database(db_dir, foldseek_gpu, threads)`

Install the baktfold database.

Parameters:

Name	Type	Description	Default
`db_dir`	`Path`	The directory where the database should be installed.	required
`foldseek_gpu`	`bool`	Whether to install foldseek-gpu compatible baktfold db	required
`threads`	`int`	Number of threads available (makes downloading faster)	required

Source code in src/baktfold/databases/db.py

def install_database(db_dir: Path, foldseek_gpu: bool, threads: int) -> None:
    """
    Ensure the baktfold database is present in ``db_dir``, downloading it if needed.

    When core files are missing, the tarball is fetched from HuggingFace
    (falling back to a plain requests download of the release URL), verified
    against the accepted MD5 value(s), extracted and then deleted. When
    ``foldseek_gpu`` is set and the GPU files are missing, they are created
    with ``foldseek_makepaddedseqdb``.

    Args:
        db_dir Path: The directory where the database should be installed.
        foldseek_gpu bool: Whether to install foldseek-gpu compatible baktfold db
        threads int: Number of threads available (not referenced in this function body)

    Raises:
        RuntimeError: If the downloaded tarball fails the MD5 check.
    """

    logger.info(f"Checking baktfold database installation in {db_dir}.")
    core_present, gpu_present = check_db_installation(db_dir, foldseek_gpu)

    if not core_present:
        logger.info("Some baktfold databases files are missing")

        version_table = VERSION_DICTIONARY
        release = version_table[CURRENT_DB_VERSION]
        db_url = release["db_url"]
        logger.info(f"Downloading baktfold DB")

        accepted_md5s = release["md5"]
        tarball_name = release["tarball"]
        tarball_path = Path(f"{db_dir}/{tarball_name}")

        try:
            logger.info(f"Downloading from HuggingFace")
            download(tarball_path, db_dir)
        except Exception as err:
            # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt /
            # SystemExit still propagate.
            logger.warning(
                f"Could not download file from HuggingFace: path={tarball_path} ({type(err).__name__}: {err})"
            )
            logger.warning(f"Trying now with requests")
            download_requests(db_url, tarball_path)

        observed_md5 = calc_md5_sum(tarball_path)

        if observed_md5 not in accepted_md5s:
            # Set the bad download aside for inspection; delete it if that fails.
            quarantine_path = tarball_path.with_suffix(tarball_path.suffix + ".corrupt")
            try:
                tarball_path.replace(quarantine_path)
            except OSError:
                try:
                    tarball_path.unlink()
                except FileNotFoundError:
                    pass
                quarantine_path = None

            if quarantine_path is not None:
                outcome = f"Bad file preserved at {quarantine_path} for inspection."
            else:
                outcome = "Bad file removed."
            raise RuntimeError(
                f"Corrupt baktfold database tarball: MD5 should be "
                f"'{accepted_md5s}' but is '{observed_md5}'. " + outcome
            )

        logger.info(f"baktfold database file download OK: {observed_md5}")

        logger.info(
            f"Extracting baktfold database tarball: file={tarball_path}, output={db_dir}"
        )
        # untar raises when extraction fails, so the tarball is only deleted on success.
        untar(tarball_path, db_dir, version_table)
        tarball_path.unlink()
    else:
        logger.info("All baktfold databases files are present")

    if foldseek_gpu and gpu_present:
        logger.info("All baktfold database files compatible with Foldseek-GPU are present")
    elif foldseek_gpu:
        logger.info("Some baktfold database files compatible with Foldseek-GPU are missing")
        logger.info("Creating them")
        foldseek_makepaddedseqdb(db_dir)

    logger.info("Database download and processing complete")

`untar(tarball_path, output_path, DICT)`

Extract the tarball to the output path.

Parameters:

Name	Type	Description	Default
`tarball_path`	`Path`	The path to the tarball file.	required
`output_path`	`Path`	The path where the contents of the tarball should be extracted.	required
`DICT`	`dict`	version dictionary	required

Source code in src/baktfold/databases/db.py

def untar(tarball_path: Path, output_path: Path, DICT: dict) -> None:
    """
    Extract the tarball and flatten its top-level directory into ``output_path``.

    After extraction, every regular file directly inside
    ``output_path / DICT[CURRENT_DB_VERSION]["dir_name"]`` is moved up into
    ``output_path`` and that directory is then removed.

    Args:
        tarball_path (Path): The path to the tarball file.
        output_path (Path): The path where the contents of the tarball should
            be extracted. A plain string is accepted as well.
        DICT (dict): version dictionary

    Raises:
        OSError, tarfile.TarError: Re-raised after logging when extraction or
            the file moves fail; the tarball itself is not deleted here.
    """
    # Normalise once so the path joins below also work when the caller passes
    # a string; previously ``output_path / name`` raised TypeError for str.
    output_dir = Path(output_path)

    try:
        with tarball_path.open("rb") as fh_in, tarfile.open(
            fileobj=fh_in, mode="r:gz"
        ) as tar_file:
            _safe_extractall(tar_file, output_path)

        tarpath = output_dir / DICT[CURRENT_DB_VERSION]["dir_name"]

        # Only regular files directly inside the extracted directory are moved.
        files_to_move = [f for f in tarpath.iterdir() if f.is_file()]

        for file_path in files_to_move:
            shutil.move(file_path, output_dir / file_path.name)

        # remove the now-redundant extracted directory
        remove_directory(tarpath)

    except (OSError, tarfile.TarError) as e:
        logger.error(
            f"Could not extract {tarball_path} to {output_path}: "
            f"{type(e).__name__}: {e}. The tarball has been preserved for inspection."
        )
        raise

`validate_db(database, default_dir, foldseek_gpu)`

Validates the baktfold database is installed.

Parameters:

Name	Type	Description	Default
`database`	`str`	The directory where the database is installed.	required
`default_dir`	`str`	Default DB location	required
`foldseek_gpu`	`bool`	Whether to install foldseek-gpu compatible baktfold db	required

Returns:

Name	Type	Description
`Path`	`Path`	The database directory that was checked: `database` if given, otherwise `default_dir`.

Source code in src/baktfold/databases/db.py

def validate_db(database: str, default_dir: str, foldseek_gpu: bool) -> Path:
    """
    Resolve the baktfold database directory and report whether it is installed.

    Missing files are reported through ``logger.error``; this function itself
    does not raise for a missing database.

    Args:
        database str: The directory where the database is installed, or None
            to use ``default_dir``.
        default_dir str: Default DB location
        foldseek_gpu bool: Whether the foldseek-gpu compatible baktfold db
            files should be checked as well

    Returns:
        Path: The database directory that was checked (``database`` if given,
        otherwise ``default_dir``).
    """
    # fall back to the default DB location if none was specified
    db_path = Path(database) if database is not None else Path(default_dir)

    # check the database is installed
    logger.info(f"Checking baktfold database installation in {db_path}")
    downloaded_flag, gpu_flag = check_db_installation(db_path, foldseek_gpu)

    if downloaded_flag:
        logger.info("All baktfold databases files are present")
    elif db_path == Path(default_dir):  # default location: no -d needed in the hint
        logger.error(
            "baktfold database not found. Please run baktfold install to download and install the baktfold database"
        )
    else:  # user-specified location
        logger.error(
            f"baktfold database not found. Please run baktfold install -d {db_path} to download and install the baktfold database"
        )

    if foldseek_gpu:
        if gpu_flag:
            logger.info("All baktfold database files compatible with Foldseek-GPU are present")
        else:
            logger.error(
                f"baktfold database files compatible with Foldseek-GPU not found. Please run baktfold install -d {db_path} --foldseek-gpu"
            )

    return db_path

`subcommand_predict(hypotheticals, cds_dict, output, prefix, cpu, omit_probs, model_dir, model_name, checkpoint_path, batch_size, save_per_residue_embeddings, save_per_protein_embeddings, threads, mask_threshold, has_duplicate_locus, gpus=None)`

Wrapper command for baktfold predict. Predicts embeddings using ProstT5 encoder + CNN prediction head.

Parameters:

Name	Type	Description	Default
`hypotheticals`	`Dict[str, any]`	feature dict for all Bakta hypothetical proteins	required
`cds_dict`	`Dict[str, any]`	id:aa dictionary	required
`output`	`str`	Output directory path.	required
`prefix`	`str`	Prefix for output file names.	required
`cpu`	`bool`	Flag indicating whether to use CPU for prediction.	required
`omit_probs`	`bool`	Flag indicating whether to omit prediction probabilities from ProstT5.	required
`model_dir`	`str`	Directory containing the ProstT5 model.	required
`model_name`	`str`	Name of the ProstT5 model.	required
`checkpoint_path`	`Path`	Path to ProstT5 CNN checkpoint.	required
`batch_size`	`int`	Batch size for prediction.	required
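`proteins_flag`	`bool`	Not a parameter of this function (stale docstring entry).	n/a
`save_per_residue_embeddings`	`bool`	Whether to save per residue embeddings to h5 file.	required
`save_per_protein_embeddings`	`bool`	Whether to save mean per protein embeddings to h5 file.	required
`threads`	`int`	Passed through to `get_embeddings`.	required
`mask_threshold`	`float`	Confidence threshold given as a percentage; it is divided by 100 and passed to `mask_low_confidence_aa` when the amino-acid FASTA is written.	required
`has_duplicate_locus`	`bool`	Passed through to `get_embeddings`.	required
`gpus`	`Optional[str]`	Passed through to `get_embeddings`.	`None`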

Returns:

Name	Type	Description
`hypotheticals`	`Dict[str, any]`	feature dict for all Bakta hypothetical proteins. Updated with ProstT5 3Di strings (unmasked)

Source code in src/baktfold/subcommands/predict.py

def subcommand_predict(
    hypotheticals: dict,
    cds_dict: dict,
    output: Path,
    prefix: str,
    cpu: bool,
    omit_probs: bool,
    model_dir: Path,
    model_name: str,
    checkpoint_path: Path,
    batch_size: int,
    save_per_residue_embeddings: bool,
    save_per_protein_embeddings: bool,
    threads: int,
    mask_threshold: float,
    has_duplicate_locus: bool,
    gpus: Optional[str] = None,
) -> dict:
    """
    Wrapper command for baktfold predict. Predicts embeddings using ProstT5 encoder + CNN prediction head.

    Calls ``get_embeddings`` and then writes ``<output>/<prefix>_aa.fasta``
    with one record per entry of ``cds_dict``, masked via
    ``mask_low_confidence_aa``. Entries without a usable prediction are
    written as all-``X`` sequences of the same length.

    Args:
        hypotheticals (Dict[str, any]): feature dict for all Bakta hypothetical proteins
        cds_dict (Dict[str, any]): id:aa dictionary
        output (Path): Output directory path.
        prefix (str): Prefix for output file names.
        cpu (bool): Flag indicating whether to use CPU for prediction. Half precision is disabled only when this is exactly ``True``.
        omit_probs (bool): Flag indicating whether to omit prediction probabilities from ProstT5.
        model_dir (Path): Directory containing the ProstT5 model.
        model_name (str): Name of the ProstT5 model.
        checkpoint_path (Path): Path to ProstT5 CNN checkpoint.
        batch_size (int): Batch size for prediction (passed as ``max_batch``).
        save_per_residue_embeddings (bool): Whether to save per residue embeddings to h5 file.
        save_per_protein_embeddings (bool): Whether to save mean per protein embeddings to h5 file.
        threads (int): Passed through to ``get_embeddings``.
        mask_threshold (float): Confidence threshold as a percentage; divided by 100 before being used for masking.
        has_duplicate_locus (bool): Passed through to ``get_embeddings``.
        gpus (Optional[str]): Passed through to ``get_embeddings``.

    Returns:
        hypotheticals (Dict[str, any]): the ``hypotheticals`` object that was passed in. This function does not modify it itself; presumably ``get_embeddings`` adds the ProstT5 3Di strings in place (verify against its implementation).
    """

    logger.info('Predicting 3Di sequences using ProstT5')

    output_dir = Path(output)
    fasta_aa: Path = output_dir / f"{prefix}_aa.fasta"

    ############
    # prostt5
    ############

    fasta_3di: Path = output_dir / f"{prefix}_3di.fasta"
    # embeddings h5 - will only be generated if flag is true
    output_h5_per_residue: Path = output_dir / f"{prefix}_embeddings_per_residue.h5"
    output_h5_per_protein: Path = output_dir / f"{prefix}_embeddings_per_protein.h5"

    # Identity check kept on purpose: only a literal True disables half precision.
    half_precision = cpu is not True
    output_probs = not omit_probs

    prediction_dict = get_embeddings(
        hypotheticals,
        cds_dict,
        output,
        prefix,
        model_dir,
        model_name,
        checkpoint_path,
        fasta_3di,
        output_h5_per_residue,
        output_h5_per_protein,
        half_precision=half_precision,
        max_residues=5000,
        max_seq_len=1000,
        max_batch=batch_size,
        cpu=cpu,
        output_probs=output_probs,
        save_per_residue_embeddings=save_per_residue_embeddings,
        save_per_protein_embeddings=save_per_protein_embeddings,
        threads=threads,
        mask_threshold=mask_threshold,
        has_duplicate_locus=has_duplicate_locus,
        gpus=gpus,
    )

    # mask_threshold is given as a percentage; the masking helper takes a 0-1 proportion
    mask_prop_threshold = mask_threshold / 100

    # drop empty predictions (e.g. after OOMs) so those CDS fall through to the all-X branch below
    prediction_dict = {k: v for k, v in prediction_dict.items() if len(v[0]) > 0}

    ########
    ## write the AA CDS to file
    ######

    with open(fasta_aa, "w") as out_f:
        for cds_id, prot_seq in cds_dict.items():
            try:
                # prediction_dict[cds_id][2]: ProstT5 confidence scores 0-1
                prot_seq = mask_low_confidence_aa(
                    prot_seq, prediction_dict[cds_id][2], threshold=mask_prop_threshold
                )
            except (KeyError, IndexError):
                # no usable prediction for this CDS: mask the whole sequence
                prot_seq = "X" * len(prot_seq)

            out_f.write(f">{cds_id}\n{prot_seq}\n")

    return hypotheticals

`subcommand_compare(hypotheticals, output, threads, evalue, sensitivity, database, prefix, predictions_dir, structures, structure_dir, logdir, proteins_flag, max_seqs, ultra_sensitive, extra_foldseek_params, custom_db, foldseek_gpu, custom_annotations, has_duplicate_locus, fast, gpus=None)`

Compare 3Di or PDB structures to the baktfold DB

Parameters:

Name	Type	Description	Default
`hypotheticals`	`Dict`	hypothetical features dictionary	required
`output`	`Path`	Path to the output directory.	required
`threads`	`int`	Number of threads to use.	required
`evalue`	`float`	E-value threshold.	required
`card_vfdb_evalue`	`float`	E-value threshold for CARD and VFDB databases.	required
`sensitivity`	`float`	Sensitivity threshold.	required
`database`	`Path`	Path to the reference database.	required
`prefix`	`str`	Prefix for output files.	required
`predictions_dir`	`Optional[Path]`	Path to the directory containing predictions.	required
`structures`	`bool`	Flag indicating whether structures files are used.	required
`structure_dir`	`Optional[Path]`	Path to the directory containing structures (.pdb or .cif) files.	required
`logdir`	`Path`	Path to the directory for log files.	required
`proteins_flag`	`bool`	Flag indicating whether proteins are used.	required
`max_seqs`	`int`	Maximum results per query sequence allowed to pass the prefilter for foldseek.	required
`ultra_sensitive`	`bool`	Whether to skip foldseek prefilter for maximum sensitivity	required
`extra_foldseek_params`	`str`	Extra foldseek search parameters	required
`custom_db`	`str`	Custom foldseek database	required
`foldseek_gpu`	`bool`	Use Foldseek-GPU acceleration and ungappedprefilter	required
`custom_annotations`	`Optional[Path]`	Path to the tsv containing the custom_db annotations, 2 columns	required
`has_duplicate_locus`	`bool`	If same locus tag has multiple annots (can happen in some euks)	required
`fast`	`bool`	If true, skips AFDB search	required

Returns:

Name	Type	Description
`bool`	`bool`	True if sub-databases are created successfully, False otherwise.

Source code in src/baktfold/subcommands/compare.py

def subcommand_compare(
    hypotheticals: Dict,
    output: Path,
    threads: int,
    evalue: float,
    sensitivity: float,
    database: Path,
    prefix: str,
    predictions_dir: Optional[Path],
    structures: bool,
    structure_dir: Optional[Path],
    logdir: Path,
    proteins_flag: bool,
    max_seqs: int,
    ultra_sensitive: bool,
    extra_foldseek_params: str,
    custom_db: str,
    foldseek_gpu: bool,
    custom_annotations: Optional[Path],
    has_duplicate_locus: bool,
    fast: bool,
    gpus: Optional[str] = None,
) -> bool:
    """
    Compare 3Di or PDB structures to the baktfold DB

    Builds a Foldseek query database (from the AA/3Di FASTA files in
    ``predictions_dir`` or from structure files), searches it against the
    ``swissprot``, ``AFDBClusters`` (skipped when ``fast``), ``pdb`` and ``cath``
    target databases under ``database`` plus an optional custom database, hands
    each set of top hits to ``io.write_foldseek_tophit`` and finally attaches the
    hits and their looked-up descriptions to the input records.

    Parameters:
        hypotheticals (Dict): hypothetical features. Annotated as Dict, but iterated here as a sequence of records that are indexed with 'id', 'locus' and 'aa'.
        output (Path): Path to the output directory.
        threads (int): Number of threads to use.
        evalue (float): E-value threshold.
        sensitivity (float): Sensitivity threshold.
        database (Path): Path to the reference database.
        prefix (str): Prefix for output files; also used as the query database name.
        predictions_dir (Optional[Path]): Path to the directory containing predictions. Only read when structures is False.
        structures (bool): Flag indicating whether structures files are used.
        structure_dir (Optional[Path]): Path to the directory containing structures (.pdb or .cif) files.
        logdir (Path): Path to the directory for log files.
        proteins_flag (bool): Flag indicating whether proteins are used.
        max_seqs (int): Maximum results per query sequence allowed to pass the prefilter for foldseek.
        ultra_sensitive (bool): Whether to skip foldseek prefilter for maximum sensitivity
        extra_foldseek_params (str): Extra foldseek search parameters
        custom_db (str): Custom foldseek database
        foldseek_gpu (bool): Use Foldseek-GPU acceleration and ungappedprefilter
        custom_annotations (Optional[Path]): Path to the tsv containing the custom_db annotations, 2 columns
        has_duplicate_locus (bool): If same locus tag has multiple annots (can happen in some euks)
        fast (bool): If true, skips AFDB search
        gpus (Optional[str]): Forwarded unchanged to every run_foldseek_search call.
    Returns:
        The input records after pstc.parse / pstc.lookup_sql (and pstc.lookup_custom
        when custom_annotations is given).
        NOTE(review): the signature is annotated ``-> bool`` but no boolean is returned.
    """


    # input predictions or structures
    # (the *_input paths are only defined, and only used, when structures is False)
    if structures is False:
        # prostT5
        fasta_aa_input: Path = Path(predictions_dir) / f"{prefix}_aa.fasta"
        fasta_3di_input: Path = Path(predictions_dir) / f"{prefix}_3di.fasta"

    fasta_aa: Path = Path(output) / f"{prefix}_aa.fasta"
    fasta_3di: Path = Path(output) / f"{prefix}_3di.fasta"

    ## copy the AA and 3Di from predictions directory
    # if structures is false and baktfold compare is the command
    # Otherwise it will just copy itself

    if structures is False:
        if fasta_3di_input.exists():
            logger.info(
                f"Checked that the 3Di CDS file {fasta_3di_input} exists from baktfold predict"
            )
            # an already-present copy in the output directory is left untouched
            if fasta_3di.exists() is False:
                with atomic_write_path(fasta_3di) as tmp:
                    shutil.copyfile(fasta_3di_input, tmp)
        else:
            # NOTE(review): nothing here stops execution after the error is logged;
            # confirm the logger is configured to abort on ERROR.
            logger.error(
                f"The 3Di CDS file {fasta_3di_input} does not exist. Please run baktfold predict and/or check the prediction directory {predictions_dir}"
            )
        # copy the aa to file
        if fasta_aa_input.exists():
            logger.info(
                f"Checked that the AA CDS file {fasta_aa_input} exists from baktfold predict."
            )
            if fasta_aa.exists() is False:
                with atomic_write_path(fasta_aa) as tmp:
                    shutil.copyfile(fasta_aa_input, tmp)
        else:
            logger.error(
                f"The AA CDS file {fasta_aa_input} does not exist. Please run baktfold predict and/or check the prediction directory {predictions_dir}"
                )

    ## write the AAs to file if structures is true because can't just copy from prediction_dir
    else:
        ## write the CDS to file
        logger.info(f"Writing the AAs to file {fasta_aa}.")

        with atomic_write_path(fasta_aa) as tmp_fasta, open(tmp_fasta, "w") as out_f:
            for entry in hypotheticals:
                # the FASTA header is the record 'id' when locus tags are duplicated, else the 'locus'
                seq_id = entry['id'] if has_duplicate_locus else entry['locus']
                out_f.write(f">{seq_id}\n{entry['aa']}\n")


    ############
    # create foldseek db
    ############

    foldseek_query_db_path: Path = Path(output) / "foldseek_db"
    foldseek_query_db_path.mkdir(parents=True, exist_ok=True)

    if structures is True:
        logger.info("Creating a foldseek query database from structures.")

        generate_foldseek_db_from_structures(
            fasta_aa,
            foldseek_query_db_path,
            structure_dir,
            logdir,
            prefix,
            proteins_flag,
        )
    else:
        generate_foldseek_db_from_aa_3di(
            fasta_aa, fasta_3di, foldseek_query_db_path, logdir, prefix
        )

    # the query database is named after the output prefix
    short_db_name = prefix

    # db search

    database_name = "swissprot"

    # the prefix must differ from the target database name
    if short_db_name == database_name:
        logger.error(
            f"Please choose a different -p {prefix} as this conflicts with the {database_name}"
        )

    #####
    # foldseek search
    #####

    query_db: Path = Path(foldseek_query_db_path) / short_db_name
    target_db: Path = Path(database) / database_name

    # make result and temp dirs
    result_db_base: Path = Path(output) / "result_db"
    result_db_base.mkdir(parents=True, exist_ok=True)
    result_db: Path = Path(result_db_base) / "result_db"

    temp_db: Path = Path(output) / "temp_db"
    temp_db.mkdir(parents=True, exist_ok=True)

    # make result tsv
    result_tsv: Path = Path(output) / "foldseek_results_swissprot.tsv"

    # run foldseek search
    run_foldseek_search(
        query_db,
        target_db,
        result_db,
        temp_db,
        threads,
        logdir,
        evalue,
        sensitivity,
        max_seqs,
        ultra_sensitive,
        extra_foldseek_params,
        foldseek_gpu,
        structures,
        gpus=gpus,
    )


    create_result_tsv(query_db, target_db, result_db, result_tsv, logdir, foldseek_gpu, structures, threads)

    swissprot_df = get_tophit(result_tsv, structures, cath=False)




    #####
    # foldseek search AFDB Clusters
    # run by default; skipped when fast is set
    #####

    if not fast:

        database_name = "AFDBClusters"

        if short_db_name == database_name:
            logger.error(
                f"Please choose a different -p {prefix} as this conflicts with the {database_name}"
            )

        query_db: Path = Path(foldseek_query_db_path) / short_db_name
        target_db: Path = Path(database) / database_name

        # make result and temp dirs
        result_db_base: Path = Path(output) / "result_db"
        result_db_base.mkdir(parents=True, exist_ok=True)
        result_db: Path = Path(result_db_base) / "result_afdb_db"

        temp_db: Path = Path(output) / "temp_db"
        temp_db.mkdir(parents=True, exist_ok=True)

        # make result tsv
        result_tsv: Path = Path(output) / "foldseek_results_afdb_clusters.tsv"

        # run foldseek search
        run_foldseek_search(
            query_db,
            target_db,
            result_db,
            temp_db,
            threads,
            logdir,
            evalue,
            sensitivity,
            max_seqs,
            ultra_sensitive,
            extra_foldseek_params,
            foldseek_gpu,
            structures,
            gpus=gpus,
        )


        create_result_tsv(query_db, target_db, result_db, result_tsv, logdir, foldseek_gpu, structures, threads)

        # afdbclusters_df only exists when fast is False; every later use is guarded by `if not fast`
        afdbclusters_df = get_tophit(result_tsv,structures, cath=False)

    else:
        logger.info("Skipping AFDB Clusters search as --fast specified.")

    #####
    # foldseek search pdb
    #####


    database_name = "pdb"

    if short_db_name == database_name:
        logger.error(
            f"Please choose a different -p {prefix} as this conflicts with the {database_name}"
        )

    query_db: Path = Path(foldseek_query_db_path) / short_db_name
    target_db: Path = Path(database) / database_name

    # make result and temp dirs
    result_db_base: Path = Path(output) / "result_db"
    result_db_base.mkdir(parents=True, exist_ok=True)
    result_db: Path = Path(result_db_base) / "result_pdb_db"

    temp_db: Path = Path(output) / "temp_db"
    temp_db.mkdir(parents=True, exist_ok=True)

    # make result tsv
    result_tsv: Path = Path(output) / "foldseek_results_pdb.tsv"

    # run foldseek search
    run_foldseek_search(
        query_db,
        target_db,
        result_db,
        temp_db,
        threads,
        logdir,
        evalue,
        sensitivity,
        max_seqs,
        ultra_sensitive,
        extra_foldseek_params,
        foldseek_gpu,
        structures,
        gpus=gpus,
    )


    create_result_tsv(query_db, target_db, result_db, result_tsv, logdir, foldseek_gpu, structures, threads)

    pdb_df = get_tophit(result_tsv,structures, cath=False)


    #####
    # foldseek search cath
    #####


    database_name = "cath"

    if short_db_name == database_name:
        logger.error(
            f"Please choose a different -p {prefix} as this conflicts with the {database_name}"
        )

    query_db: Path = Path(foldseek_query_db_path) / short_db_name
    target_db: Path = Path(database) / database_name

    # make result and temp dirs
    result_db_base: Path = Path(output) / "result_db"
    result_db_base.mkdir(parents=True, exist_ok=True)
    result_db: Path = Path(result_db_base) / "result_cath_db"
    result_db_greedy_best_hits: Path = Path(result_db_base) / "result_cath_db_greedy_best_hits"

    temp_db: Path = Path(output) / "temp_db"
    temp_db.mkdir(parents=True, exist_ok=True)

    # make result tsv
    result_tsv: Path = Path(output) / "foldseek_results_cath.tsv"
    result_greedy_tsv: Path = Path(output) /  "foldseek_results_cath_greedy_tophit"

    # run foldseek search
    run_foldseek_search(
        query_db,
        target_db,
        result_db,
        temp_db,
        threads,
        logdir,
        evalue,
        sensitivity,
        max_seqs,
        ultra_sensitive,
        extra_foldseek_params,
        foldseek_gpu,
        structures,
        gpus=gpus,
    )

    # this keeps the greedy best hits for cath
    # we actually don't keep the single tophit - multidomain/fold proteins should have multiple non-overlapping CATH hits
    # this is equivalent to using --greedy-best-hits with foldseek easy-search
    summarise_hits(result_db, result_db_greedy_best_hits, logdir, threads)

    # saves all CATH hits first
    create_result_tsv(query_db, target_db, result_db, result_tsv, logdir, foldseek_gpu, structures, threads)
    # save greedy CATH tophits
    create_result_tsv(query_db, target_db, result_db_greedy_best_hits, result_greedy_tsv, logdir, foldseek_gpu, structures, threads)

    # this just reads it in with appropriate headers
    cath_df = get_tophit(result_greedy_tsv, structures, cath=True)

    # write tophits
    swissprot_tophit_path: Path = Path(output) / "baktfold_swissprot_tophit.tsv"
    io.write_foldseek_tophit(swissprot_df, swissprot_tophit_path)

    if not fast:
        afdb_tophit_path: Path = Path(output) / "baktfold_afdbclusters_tophit.tsv"
        io.write_foldseek_tophit(afdbclusters_df, afdb_tophit_path)

    pdb_tophit_path: Path = Path(output) / "baktfold_pdb_tophit.tsv"
    io.write_foldseek_tophit(pdb_df, pdb_tophit_path)

    cath_tophit_path: Path = Path(output) / "baktfold_cath_tophit.tsv"
    io.write_foldseek_tophit(cath_df, cath_tophit_path)
    # remove result_greedy_tsv (identical to tophit, will make it confusing)
    remove_file(result_greedy_tsv)

    # custom db output

    #####
    # custom db
    #####


    if custom_db:

        # NOTE(review): if anything in this try block raises, custom_df is never bound,
        # yet it is read further down whenever custom_db is set. Confirm that
        # logger.error aborts the run; otherwise that later read raises NameError.
        try:

            logger.info(f"Foldseek will also be run against your custom database {custom_db}")
            # make result and temp dirs
            result_db_custom: Path = Path(result_db_base) / "result_db_custom"
            result_tsv_custom: Path = Path(output) / "foldseek_results_custom.tsv"

            # reuses query_db and temp_db left over from the cath search above
            run_foldseek_search(
            query_db,
            Path(custom_db),
            result_db_custom,
            temp_db,
            threads,
            logdir,
            evalue,
            sensitivity,
            max_seqs,
            ultra_sensitive,
            extra_foldseek_params,
            foldseek_gpu,
            structures,
            gpus=gpus,
        )

            create_result_tsv(query_db, Path(custom_db),
                result_db_custom,
                result_tsv_custom, logdir, foldseek_gpu, structures, threads)

            custom_df = get_tophit(result_tsv_custom,structures, cath=False)

            custom_db_tophit_path: Path = Path(output) / "baktfold_custom_db_tophit.tsv"
            io.write_foldseek_tophit(custom_df, custom_db_tophit_path)

        except Exception as e:
            logger.error(f"Foldseek failed to run against your custom database {custom_db}. Please check that it is formatted correctly as a Foldseek database: {e}")


    ####
    # convert pipe back - proteins
    # in theory, from JSON, the pipe shouldn't happen as we only take the ID.

    ####

    """
    https://github.com/steineggerlab/foldseek/blob/8dc75c74ad0eddab73cfd905963d13bf74dc012b/lib/mmseqs/src/commons/Util.cpp#L142
    const struct Databases databases[] = {
            { "uc",   2, 0}, // Uniclust
            { "cl|",   3, 1},
            { "sp|",   3, 1}, // Swiss prot
            { "tr|",   3, 1}, // trembl
            { "gb|",   3, 1}, // GenBank
            { "ref|",  4, 1}, // NCBI Reference Sequence
            { "pdb|",  4, 1}, // Brookhaven Protein Data Bank
            { "bbs|",  4, 1}, // GenInfo Backbone Id
            { "lcl|",  4, 1}, // Local Sequence identifier
            { "pir||", 5, 1}, // NBRF PIR
            { "prf||", 5, 1}, // Protein Research Foundation
            { "gnl|",  4, 2}, // General database identifier
            { "pat|",  4, 2}, // Patents
            { "gi|",   3, 3}  // NCBI GI
    };
    """


    if proteins_flag:


        # set once the first record id containing the ~PIPE~ placeholder is seen,
        # so the warning is emitted a single time
        pipe_warning_logged = False

        new_hypotheticals = []
        for record in hypotheticals:
            rid = record["id"]
            if not pipe_warning_logged:
                if "~PIPE~" in rid:
                    pipe_warning_logged = True
                    logger.warning("At least one input protein has | in the header. Note these will be replaced with ~PIPE~ in the raw foldseek output if you chose --keep-tmp-files")
            # the records are modified in place; new_hypotheticals holds the same objects
            record["id"] = record["id"].replace("~PIPE~", "|")
            record["locus"] = record["locus"].replace("~PIPE~", "|")
            new_hypotheticals.append(record)

        hypotheticals = new_hypotheticals
        del new_hypotheticals

        ###
        # overwrite the output aa and 3dis with | if they were in the input

        if pipe_warning_logged:
            replace_pipe_in_fasta(fasta_aa)
            # the 3Di FASTA is only produced when structures is False
            if not structures:
                replace_pipe_in_fasta(fasta_3di)


    ####
    # lookup
    ####

    if proteins_flag: # baktfold proteins

        # note aas passed as hypotheticals to the overall function - so in and out as aas

        # has_duplicate_locus is passed as False here regardless of the argument value
        aas = pstc.parse(hypotheticals, swissprot_df, 'swissprot', has_duplicate_locus=False)
        if not fast:
            aas = pstc.parse(aas, afdbclusters_df, 'afdb', has_duplicate_locus=False)
        aas = pstc.parse(aas, pdb_df, 'pdb', has_duplicate_locus=False)
        aas = pstc.parse(aas, cath_df, 'cath', has_duplicate_locus=False)
        if custom_db:
            aas = pstc.parse(aas, custom_df, 'custom_db', has_duplicate_locus=False)

        # get the lookup descriptions for each of them
        # this requires the DB

        #aas = pstc.lookup(aas, Path(database), custom_annotations)
        aas = pstc.lookup_sql(aas, Path(database), threads)
        # add the custom annotations if it is provided
        if custom_annotations:
            aas = pstc.lookup_custom(aas, Path(database), custom_annotations)

        return aas

    else: # baktfold run

        # add the Swissprot and AFDB and PDB tophits to the json
        hypotheticals = pstc.parse(hypotheticals, swissprot_df, 'swissprot', has_duplicate_locus)
        if not fast:
            hypotheticals = pstc.parse(hypotheticals, afdbclusters_df, 'afdb', has_duplicate_locus)
        hypotheticals = pstc.parse(hypotheticals, pdb_df, 'pdb', has_duplicate_locus)
        hypotheticals = pstc.parse(hypotheticals, cath_df, 'cath', has_duplicate_locus)
        if custom_db:
            hypotheticals = pstc.parse(hypotheticals, custom_df, 'custom_db', has_duplicate_locus)

        # get the lookup descriptions for each of them
        # hypotheticals = pstc.lookup(hypotheticals, Path(database), custom_annotations)
        hypotheticals = pstc.lookup_sql(hypotheticals, Path(database), threads)
        if custom_annotations:
            hypotheticals = pstc.lookup_custom(hypotheticals, Path(database), custom_annotations)

        return hypotheticals

`move_product_to_note_if_exists(qualifiers)`

If a 'product' qualifier exists, append it to 'note' and remove 'product'.

Designed for the eukaryotic entries

Parameters

`qualifiers` (dict): Feature qualifiers dictionary (values are usually lists).

Returns

None. Modifies `qualifiers` in place.

Source code in src/baktfold/io/insdc.py

def move_product_to_note_if_exists(qualifiers):
    """
    If a 'product' qualifier exists, append it to 'note' and remove 'product'.

    Designed for the eukaryotic entries

    Parameters
    ----------
    qualifiers : dict
        Feature qualifiers dictionary (values are usually lists).
        A missing or falsy 'product' (None, '', []) leaves the dictionary
        untouched.

    Returns
    -------
    None
        Modifies qualifiers in place.
    """
    product = qualifiers.get("product")
    if not product:
        return

    # Ensure note exists and really is a list: a scalar note (e.g. a plain
    # string) would otherwise fail on extend/append below.
    note = qualifiers.get("note")
    if note is None:
        note = []
    elif isinstance(note, tuple):
        note = list(note)
    elif not isinstance(note, list):
        note = [note]
    qualifiers["note"] = note

    if isinstance(product, list):
        note.extend(product)
    else:
        note.append(product)

    qualifiers.pop("product", None)

`revise_dbxref_insdc(dbxrefs)`

Remove INSDC non-compliant DbXrefs.

Source code in src/baktfold/io/insdc.py

def revise_dbxref_insdc(dbxrefs: Sequence[str]) -> Tuple[Sequence[str], Sequence[str]]:
    """Split DbXrefs into INSDC-compliant and non-compliant entries.

    A DbXref is kept when the text before its first ':' is one of the
    accepted database prefixes (UniProtKB, GO, Pfam, Rfam constants from
    ``bc``). Returns ``(valid, invalid)``, each preserving input order.
    """
    accepted_prefixes = [bc.DB_XREF_UNIPROTKB, bc.DB_XREF_GO, bc.DB_XREF_PFAM, bc.DB_XREF_RFAM]
    compliant = []
    rejected = []
    for xref in dbxrefs:
        prefix = xref.split(':')[0]
        destination = compliant if prefix in accepted_prefixes else rejected
        destination.append(xref)
    return compliant, rejected

`revise_product_insdc(product)`

Revise product name for INSDC compliant submissions

Source code in src/baktfold/io/insdc.py

def revise_product_insdc(product: str):
    """Return *product* rewritten for INSDC compliant submissions.

    Three fixes are applied in order, each logged when it changes the name:
    'uncharacterised'/'uncharacterized' (any case) becomes 'putative'; all
    parentheses are dropped when the '(' and ')' counts differ; all square
    brackets are dropped when the '[' and ']' counts differ.
    """
    # replace putative synonyms; subn reports whether anything matched
    renamed, hits = re.subn(r'(uncharacteri[sz]ed)', 'putative', product, flags=re.IGNORECASE)
    if hits:
        logger.info('fix product: replace putative synonyms. new=%s, old=%s', renamed, product)
        product = renamed

    # remove unbalanced parentheses (ToDo: find and replace only legend parentheses)
    if product.count('(') != product.count(')'):
        stripped = product.replace('(', '').replace(')', '')
        logger.info('fix product: remove unbalanced parantheses. new=%s, old=%s', stripped, product)
        product = stripped

    # remove unbalanced brackets (ToDo: find and replace only legend bracket)
    if product.count('[') != product.count(']'):
        stripped = product.replace('[', '').replace(']', '')
        logger.info('fix product: remove unbalanced brackets. new=%s, old=%s', stripped, product)
        product = stripped

    return product

`parse_baktfold_json_for_reconstruction(input_path, euk_override=None, custom_db_override=None, fast_override=None, program_overrides=None)`

Parse a baktfold (or bakta) JSON and reconstitute every argument needed to re-run the output writers (genome -> write_bakta_outputs, proteins -> write_bakta_proteins_outputs). No database or Foldseek run is required.

Resolution order for the runtime flags that are not otherwise recoverable from feature data is: CLI override > JSON baktfold_run provenance block > inference from features > hard default.

Returns:

Name	Type	Description
`dict`		keyed by 'mode' ('genome' \| 'proteins') plus everything the
		matching writer needs.

Source code in src/baktfold/io/json_in.py

def parse_baktfold_json_for_reconstruction(
    input_path,
    euk_override=None,
    custom_db_override=None,
    fast_override=None,
    program_overrides=None,
):
    """Load a baktfold (or bakta) JSON and rebuild the arguments the output
    writers need (genome -> write_bakta_outputs, proteins ->
    write_bakta_proteins_outputs), without any database or Foldseek run.

    Runtime flags that cannot be read from the feature data are resolved in
    this order: CLI override > JSON ``baktfold_run`` provenance block >
    inference from features > hard default.

    The process exits with status 1 when the file is not usable, when the JSON
    has no 'features' key, or when a genome-mode JSON has no 'sequences' block.

    Returns:
        dict: keyed by 'mode' ('genome' | 'proteins') plus everything the
        matching writer needs.
    """
    if not program_overrides:
        program_overrides = {}

    # ---- validate the path, then load the JSON ----------------------------
    try:
        if input_path == '':
            raise ValueError('File path argument must be non-empty')
        annotation_path = Path(input_path).resolve()
        cfg.check_readability('annotation', annotation_path)
        cfg.check_content_size('annotation', annotation_path)
    except Exception as err:
        logger.error(f'ERROR: annotation file {input_path} not valid! {err}')
        sys.exit(1)

    logger.info(f'Parsing baktfold JSON for reconstruction: {annotation_path}')
    with xopen(str(annotation_path), threads=0) as handle:
        data = json.load(handle)

    if 'features' not in data:
        logger.error("Input JSON has no 'features' key - not a valid baktfold/bakta JSON.")
        sys.exit(1)

    features = data['features']
    version = data.get('version', {})
    provenance = data.get('baktfold_run', {})

    # ---- put version strings and run timing back into cfg -----------------
    cfg.version = version.get('baktfold', cfg.version)
    cfg.db_version = version.get('baktfold_db', cfg.db_version)
    _restore_run_timing(data)

    # ---- genome or proteins? trust provenance, else look for 'sequences' --
    declared_mode = provenance.get('mode')
    if declared_mode in ('genome', 'proteins'):
        mode = declared_mode
    elif 'sequences' in data:
        mode = 'genome'
    else:
        mode = 'proteins'

    # ---- flags shared by both modes ----------------------------------------
    custom_db = _first_set(custom_db_override, provenance.get('custom_db'), _infer_custom_db(features))
    custom_db = False if custom_db is None else custom_db
    # ``fast`` cannot be inferred reliably (missing AFDB hits are ambiguous),
    # so only the override and the provenance block are consulted.
    fast = _first_set(fast_override, provenance.get('fast'))
    fast = False if fast is None else fast

    if mode == 'proteins':
        logger.info('Proteins-mode JSON detected.')
        proteins_result = {'mode': 'proteins', 'data': data}
        proteins_result['aas'] = features
        proteins_result['features'] = features
        proteins_result['custom_db'] = bool(custom_db)
        proteins_result['fast'] = bool(fast)
        proteins_result['bakta_version'] = version
        return proteins_result

    # ---- genome mode ---------------------------------------------------------
    if 'sequences' not in data:
        logger.error("Genome-mode JSON is missing the 'sequences' block; cannot reconstruct.")
        sys.exit(1)

    sequences = data['sequences']

    prokka = provenance.get('prokka')
    if prokka is None:
        prokka = 'prokka' in version

    other_genbank = provenance.get('other_genbank')
    if other_genbank is None:
        other_genbank = not ('prokka' in version or 'bakta' in version)

    euk = _first_set(euk_override, provenance.get('euk'), _infer_euk(features))
    euk = False if euk is None else euk

    translation_table = provenance.get('translation_table')
    if translation_table is None:
        genome_block = data.get('genome') or {}
        translation_table = _coerce_translation_table(genome_block.get('translation_table'))

    has_duplicate_locus = provenance.get('has_duplicate_locus')
    if has_duplicate_locus is None:
        has_duplicate_locus = _detect_duplicate_locus(features)

    programs = {
        name: _first_set(program_overrides.get(name), provenance.get(name), default)
        for name, default in _DEFAULT_PROGRAMS.items()
    }

    # ---- group features per sequence in JSON order (already start-sorted per
    #      sequence), then flatten in sequence order. Mirrors run/compare:
    #      discarded features are skipped and 'contig' is the <1.10.0 fallback.
    features_by_sequence = {}
    for seq in sequences:
        features_by_sequence[seq['id']] = []

    for feature in features:
        if 'discarded' in feature:
            continue
        if 'sequence' in feature:
            seq_id = feature['sequence']
        else:
            seq_id = feature.get('contig')  # <1.10.0 compat
        if seq_id is None:
            logger.warning(f"Feature missing 'sequence', skipping: id={feature.get('id')}")
            continue
        bucket = features_by_sequence.get(seq_id)
        if bucket is None:
            logger.warning(f"Feature references unknown sequence '{seq_id}', skipping: id={feature.get('id')}")
            continue
        bucket.append(feature)

    flattened = [
        feature
        for seq in sequences
        for feature in features_by_sequence[seq['id']]
    ]

    if other_genbank:
        log_for_other_genbank_tools(
            programs['cds_program'],
            programs['trna_program'],
            programs['rrna_program'],
            programs['tmrna_program'],
            programs['ncrna_program'],
        )

    logger.info(
        f"Reconstruction settings: euk={euk}, custom_db={bool(custom_db)}, fast={bool(fast)}, "
        f"prokka={prokka}, other_genbank={other_genbank}, translation_table={translation_table}, "
        f"has_duplicate_locus={has_duplicate_locus}"
    )

    genome_result = {
        'mode': 'genome',
        'data': data,
        'features': flattened,
        'features_by_sequence': features_by_sequence,
        'has_duplicate_locus': bool(has_duplicate_locus),
        'translation_table': translation_table,
        'prokka': bool(prokka),
        'other_genbank': bool(other_genbank),
        'euk': bool(euk),
        'custom_db': bool(custom_db),
        'fast': bool(fast),
    }
    for program_key in ('cds_program', 'trna_program', 'rrna_program', 'tmrna_program', 'ncrna_program'):
        genome_result[program_key] = programs[program_key]
    genome_result['bakta_version'] = version
    return genome_result

`parse_json_input(input_path, faa_path, all_proteins, protein_json_flag)`

Parses genome annotations from input JSON file.

Parameters:

Name	Type	Description	Default
`input_path`	`str`	Path to input JSON file.	required
`faa_path`	`str`	Path to output file for hypothetical proteins.	required
`all_proteins`	`bool`	Whether to keep all proteins or only hypothetical ones.	required
`protein_json_flag`	`bool`	Whether input is protein JSON	required

Returns:

Name	Type	Description
`tuple`		`(features, hypotheticals, version)` when `protein_json_flag` is true; otherwise `(data, features, has_duplicate_locus, translation_table, prokka, other_genbank, version)`.

Examples:

>>> parse_json_input('input.json', 'hypotheticals.faa', False, False)
(data, features, False, False)

Source code in src/baktfold/io/json_in.py

def parse_json_input(input_path, faa_path, all_proteins, protein_json_flag):
    """
    Parses genome annotations from input JSON file.

    Args:
      input_path (str): Path to input JSON file.
      faa_path (Path): Output FASTA file for the selected proteins. It is opened
        with ``faa_path.open`` and is not written when protein_json_flag is set.
      all_proteins (bool): Whether to keep all proteins or only hypothetical ones.
      protein_json_flag (bool): Whether input is protein JSON

    Returns:
      tuple: ``(features, hypotheticals, version)`` when protein_json_flag is
      true; otherwise ``(data, features, has_duplicate_locus, translation_table,
      prokka, other_genbank, version)``. ``translation_table`` is an int, a str
      (when the JSON value is not an integer) or None (when it is missing or
      could not be parsed).

    Examples:
      >>> parse_json_input('input.json', Path('hypotheticals.faa'), False, False)
      (data, features, has_duplicate_locus, translation_table, prokka, other_genbank, version)
    """

    ############################################################################
    # Checks and configurations
    # - check that the annotation file is given, readable and non-empty
    ############################################################################

    # Bound before the try block so the error message below can always be
    # formatted, even when validation fails before the path is resolved.
    annotation_path = input_path
    try:
        if input_path == '':
            raise ValueError('File path argument must be non-empty')
        annotation_path = Path(input_path).resolve()
        cfg.check_readability('annotation', annotation_path)
        cfg.check_content_size('annotation', annotation_path)
    except Exception as e:
        logger.error(f'ERROR: annotation file {annotation_path} not valid! {e}')
        sys.exit(1)

    logger.info(f'Parsing annotations from input: {annotation_path}')
    with xopen(str(annotation_path), threads=0) as fh:
        data = json.load(fh)

    features = data['features']

    # keep all CDS, or only those flagged as hypothetical
    if all_proteins:
        hypotheticals = [feat for feat in features if feat['type'] == bc.FEATURE_CDS]
    else:
        hypotheticals = [feat for feat in features if feat['type'] == bc.FEATURE_CDS and 'hypothetical' in feat]

    version = data.get("version", {})

    if protein_json_flag:  # this will also be only hypotheticals if protein mode (or else why not just run with the FASTA)
        return features, hypotheticals, version

    # check if dupe locus tags (euks can have multiple CDS same locus tag e.g. Cladocopium goreaui CAMXCT020000001.1)
    seen_loci = set()
    has_duplicate_locus = False

    for feat in hypotheticals:
        locus = feat['locus']
        if locus in seen_loci:
            has_duplicate_locus = True
            logger.warning("Multiple CDS per locus tag were detected in your input JSON.")
            logger.warning("CDS id (which is unique) rather than locus tag will be used for ProstT5+Foldseek searches.")
            break
        seen_loci.add(locus)

    # write the selected proteins to file: headers use the locus tag (almost
    # always), or the id when locus tags are duplicated, as the id is
    # guaranteed to exist and be unique
    header_key = 'id' if has_duplicate_locus else 'locus'
    with faa_path.open('wt') as fh:
        for feat in hypotheticals:
            fh.write(f">{feat[header_key]}\n{feat['aa']}\n")

    # none of this is relevant for proteins
    # Default to None so that every path below, including a genome block
    # without a 'translation_table' key, leaves the name bound for the return.
    translation_table = None
    try:
        genome_block = data.get("genome")

        if genome_block is None:
            logger.error("No 'genome' block found in input JSON. Please check.")
        elif "translation_table" not in genome_block:
            logger.error("No translation table found in input JSON. Please check your input.")
        else:
            raw_value = genome_block["translation_table"]

            try:
                translation_table = int(raw_value)
            except (ValueError, TypeError):
                translation_table = str(raw_value)
                logger.warning(
                    f"Translation table '{raw_value}' is not an integer. "
                    f"Parsing it as a string."
                )
            else:
                logger.info(
                    f"Translation table {translation_table} detected from input JSON"
                )

    except Exception as e:
        logger.exception(
            f"Unexpected error while parsing translation table: {e}"
        )
        translation_table = None

    # input detection: Prokka and Bakta inputs are recognised by their key in
    # the 'version' block; anything else is treated as another GenBank source
    prokka = "prokka" in version
    if prokka:
        logger.info("Prokka input detected")
    other_genbank = not prokka and "bakta" not in version

    logger.info('Parsing complete')

    return data, features, has_duplicate_locus, translation_table, prokka, other_genbank, version

Module for manipulating GenBank files; some of the code is taken from Phynteny (https://github.com/susiegriggo/Phynteny).

`get_genbank(genbank)`

Convert a GenBank file to a dictionary.

This function reads a GenBank file and converts it into a dictionary.

Parameters:

Name	Type	Description	Default
`genbank`	`Path`	Path to the GenBank file.	required

Returns:

Name	Type	Description
`dict`	`dict`	A dictionary representation of the GenBank file.

Raises:

Type	Description
`ValueError`	If the provided file is not a GenBank file.

Source code in src/baktfold/io/handle_genbank.py

def get_genbank(genbank: Path) -> tuple:
    """
    Parse a GenBank file (plain or gzipped) and detect its annotation style.

    The file is read with Biopython, the records are keyed by record ID, and
    the first CDS feature of the first record is inspected to guess which tool
    produced the file.

    Args:
        genbank (Path): Path to the GenBank file. A plain ``str`` is accepted
            as well; surrounding whitespace is stripped before opening.

    Returns:
        tuple: ``(gb_dict, method)``. ``gb_dict`` maps each record ID to its
        SeqRecord and is empty when the file could not be opened or parsed.
        ``method`` is ``"Bakta"``, ``"NCBI"``, ``"Pharokka"``, or ``None`` when
        no CDS was found or the style could not be detected.

    Note:
        No exception propagates to the caller: any failure while opening or
        parsing is logged as a warning and ``({}, None)`` is returned.
    """

    logger.info(f"Checking if input {genbank} is a Genbank format file")
    logger.info("If so, also detecting the likely input style out of Pharokka, Bakta and NCBI Refseq style.")

    def parse_records(handle):
        """
        Parse an open GenBank text handle and detect the input style.

        Args:
            handle: An open, text-mode file handle positioned at the start of
                the GenBank data.

        Returns:
            tuple: ``(gb_dict, method)`` as described for ``get_genbank``.
        """
        try:
            records = list(SeqIO.parse(handle, "gb"))
            if not records:
                return {}, None
            gb_dict = {record.id: record for record in records}

            # Style detection only looks at the first record and its first CDS.
            record = records[0]
            comment = record.annotations.get("comment", "")
            cds_feature = next((f for f in record.features if f.type == "CDS"), None)

            if cds_feature is None:
                logger.error(f"{genbank} appears to be a Genbank formatted file but no CDS was found. Please check your input.")
                return gb_dict, None

            cds_qualifiers = cds_feature.qualifiers

            # Check if 'Bakta' appears in the Comment - will appear there
            if "Bakta" in comment and "locus_tag" in cds_qualifiers:
                logger.info("Detected Bakta style input Genbank. Using locus_tag qualifier from Bakta as the CDS IDs for Phold.")
                method = "Bakta"
            elif "phrog" not in cds_qualifiers and "protein_id" in cds_qualifiers:
                logger.info("Detected NCBI Refseq style input Genbank. Using protein_id qualifier as the CDS IDs for Phold.")
                method = "NCBI"
            elif "phrog" in cds_qualifiers and "ID" in cds_qualifiers:
                logger.info("Detected Pharokka style input Genbank. Using ID qualifier from Pharokka as the CDS IDs for Phold.")
                method = "Pharokka"
            else:
                logger.error(
                    f"Feature {cds_feature} could not be parsed. Therefore, the input style format for {genbank} could not be detected. Please check your input."
                )
                # Bind ``method`` so the return below never raises
                # ``UnboundLocalError`` (which the broad except would
                # mislabel as "not a genbank file").
                method = None
            return identify_long_ids(gb_dict), method
        except Exception as e:
            logger.warning(f"{genbank} is not a genbank file: {e}")
            return {}, None

    try:
        # str() first: pathlib.Path has no .strip(), so passing the annotated
        # type used to raise AttributeError and be reported as a bad file.
        genbank_path = str(genbank).strip()
        opener = gzip.open if is_gzip_file(genbank_path) else open
        with opener(genbank_path, "rt") as handle:
            return parse_records(handle)
    except Exception as e:
        logger.warning(f"{genbank} is not a genbank file: {e}")
        return {}, None

`get_proteins(fasta)`

Convert an Amino Acid FASTA file to a dictionary.

This function reads a AA FASTA file and converts it into a dictionary.

Parameters:

Name	Type	Description	Default
`fasta`	`Path`	Path to the FASTA file.	required

Returns:

Name	Type	Description
`dict`	`dict`	A dictionary representation of the FASTA file.

Raises:

Type	Description
`ValueError`	If the provided file is not a FASTA file.

Source code in src/baktfold/io/handle_genbank.py

def get_proteins(fasta: Path) -> dict:
    """
    Convert an Amino Acid FASTA file to a dictionary.

    The file may be plain text or gzip-compressed (detected from its content
    via ``is_gzip_file``). Each header line (without the leading ``>``) becomes
    a key and the concatenated, whitespace-stripped sequence lines that follow
    become its value. Lines before the first header, and sequences under an
    empty header, are dropped. A repeated header overwrites the earlier entry.

    Args:
        fasta (Path): Path to the FASTA file. A plain ``str`` is accepted as
            well; surrounding whitespace is stripped before opening.

    Returns:
        dict: Mapping of header line to sequence string.

    Raises:
        ValueError: Re-raised (after logging) if reading the file raises a
            ValueError.
    """

    # str() first: pathlib.Path has no .strip(), so passing the annotated type
    # used to crash with AttributeError.
    fasta_path = str(fasta).strip()
    compressed = is_gzip_file(fasta_path)

    fasta_dict = {}
    try:
        if compressed:
            handle = gzip.open(fasta_path, "rt")
        else:
            # Plain files are read leniently: undecodable bytes are skipped.
            handle = open(fasta_path, "rt", errors="ignore")

        with handle:
            sequence_id = ""
            chunks = []
            for line in handle:
                line = line.strip()
                if line.startswith(">"):
                    if sequence_id:
                        fasta_dict[sequence_id] = "".join(chunks)
                    sequence_id = line[1:]
                    chunks = []
                else:
                    chunks.append(line)
            # Flush the final record.
            if sequence_id:
                fasta_dict[sequence_id] = "".join(chunks)
    except ValueError:
        logger.error(f"{fasta_path} is not a FASTA file!")
        raise

    return fasta_dict

`identify_long_ids(gb_dict)`

Checks all feature IDs in gb_dict. If longer than 54 chars (line break from Pharokka/biopython reading GBK files), removes the space

Parameters:

Name	Type	Description	Default
`gb_dict`	`dict`	A dictionary representation of the GenBank file.	required

Returns:

Name	Type	Description
`dict`	`dict`	A dictionary representation of the GenBank file.

Source code in src/baktfold/io/handle_genbank.py

def identify_long_ids(gb_dict: dict) -> dict:
    """
    Strip spaces from long ``ID`` qualifiers of every feature in ``gb_dict``.

    For each feature whose first ``ID`` qualifier value is 54 characters or
    longer, a warning is logged and all spaces are removed from that value in
    place. Features for which this lookup fails for any reason are skipped.

    Args:
        gb_dict (dict): A dictionary representation of the GenBank file
            (record ID -> record with a ``features`` list).

    Returns:
        dict: The same dictionary object, with the affected IDs rewritten.
    """

    for record in gb_dict.values():
        for feature in record.features:
            try:
                # Very long Pharokka CDS IDs wrap onto a second line in the
                # GenBank file, and the parser then joins the pieces with a
                # space (first noticed on Mgnify contigs annotated with
                # Pharokka).
                current_id = feature.qualifiers["ID"][0]
                if len(current_id) < 54:
                    continue
                logger.warning(
                    f"The CDS ID is {current_id} is longer than 54 characters. It is recommended that you use short contig headers (which will therefore lead to shorter CDS ids)."
                )
                id_values = feature.qualifiers["ID"]
                id_values[0] = feature.qualifiers["ID"][0].replace(" ", "")
            except Exception:
                # Features without an ID field (GenBank/NCBI formatted input)
                # are left untouched.
                continue

    return gb_dict

`is_gzip_file(f)`

Method copied from Phispy see https://github.com/linsalrob/PhiSpy/blob/master/PhiSpyModules/helper_functions.py

This is an elegant solution to test whether a file is gzipped by reading the first two characters. I also use a version of this in fastq_pair if you want a C version :) See https://stackoverflow.com/questions/3703276/how-to-tell-if-a-file-is-gzip-compressed for inspiration

Parameters:

Name	Type	Description	Default
`f`	`Path`	The file to test.	required

Returns:

Name	Type	Description
`bool`	`bool`	True if the file is gzip compressed, otherwise False.

Source code in src/baktfold/io/handle_genbank.py

def is_gzip_file(f: Path) -> bool:
    """
    Tell whether a file is gzip-compressed by inspecting its first two bytes.

    Method copied from PhiSpy, see
    https://github.com/linsalrob/PhiSpy/blob/master/PhiSpyModules/helper_functions.py
    and https://stackoverflow.com/questions/3703276/how-to-tell-if-a-file-is-gzip-compressed
    for inspiration.

    Args:
        f (Path): The file to test.

    Returns:
        bool: True if the file starts with the bytes 0x1f 0x8b, otherwise False.
    """
    gzip_magic = b"\x1f\x8b"
    with open(f, "rb") as stream:
        leading_bytes = stream.read(2)
    return leading_bytes == gzip_magic

`open_protein_fasta_file(input_file)`

Open a fasta file, whether it is gzipped or plain text.

input_file (str): The path to the fasta file, either gzipped or plain.

Union[IO[str], gzip.GzipFile]: A file handle to the opened fasta file.

Source code in src/baktfold/io/handle_genbank.py

def open_protein_fasta_file(input_file: str) -> Union[IO[str], gzip.GzipFile]:
    """
    Open a fasta file for text reading, choosing the opener from its suffix.

    Parameters:
    input_file (str): The path to the fasta file, either gzipped or plain.

    Returns:
    Union[IO[str], gzip.GzipFile]: A file handle to the opened fasta file.
        Paths ending in ``.gz`` are opened through ``gzip``; everything else
        is opened as a regular text file.
    """
    fasta_path = Path(input_file)
    looks_gzipped = fasta_path.suffix == ".gz"

    if not looks_gzipped:
        return open(fasta_path, "r")
    return gzip.open(fasta_path, "rt")

`add_optional_qualifiers(entry, qualifiers, single_valued=None, multi_valued=None)`

Add optional INSDC qualifiers to a feature entry dict in Bakta style.

Parameters:

Name	Type	Description	Default
`entry`	`dict`	The feature dictionary being built.	required
`qualifiers`	`dict`	The qualifiers dictionary from Bio.SeqFeature.	required
`single_valued`	`set` or `list`	Qualifiers expected to be single-valued (take the first if multiple).	`None`
`multi_valued`	`set` or `list`	Qualifiers that can have multiple values (keep as list if >1, else single value).	`None`

Source code in src/baktfold/io/eukaryotic_to_json.py

def add_optional_qualifiers(entry, qualifiers, single_valued=None, multi_valued=None):
    """
    Copy optional INSDC qualifiers into a Bakta-style feature entry, in place.

    Parameters
    ----------
    entry : dict
        The feature dictionary being built; it is modified in place.
    qualifiers : dict
        The qualifiers dictionary from Bio.SeqFeature.
    single_valued : set or list
        Qualifiers stored as their first value only. ``locus_tag`` is stored
        under the key ``locus``.
    multi_valued : set or list
        Qualifiers stored as the whole list when it has more than one value,
        otherwise as the single value.
    """

    multi_keys = multi_valued if multi_valued else set()
    single_keys = single_valued if single_valued else set()

    # Multi-valued pass runs first, so a key listed in both groups ends up
    # with its single-valued form.
    for name in multi_keys:
        values = qualifiers.get(name)
        if not values:
            continue
        entry[name] = values[0] if len(values) <= 1 else values

    for name in single_keys:
        values = qualifiers.get(name)
        if not values:
            continue
        # Bakta expects the locus tag under "locus".
        target_key = "locus" if name == "locus_tag" else name
        entry[target_key] = values[0]

`build_bakta_sequence_entry(rec)`

Convert a SeqRecord into a Bakta-style sequence entry. Missing fields are filled with None.

Source code in src/baktfold/io/eukaryotic_to_json.py

def build_bakta_sequence_entry(rec):
    """
    Convert a SeqRecord into a Bakta-style sequence entry.

    The core keys (id, description, nt, length, complete, type, topology,
    simple_id, orig_id, orig_description) are always present. The optional
    source qualifiers organism, mol_type, strain, db_xref and note are added
    only when the record's ``source`` feature provides them.

    Parameters:
        rec: Record providing ``id``, ``seq``, ``features`` and ``annotations``.

    Returns:
        dict: Bakta-style sequence entry.
    """

    seq = str(rec.seq)

    # -----------------------------------------
    # Extract source feature qualifiers
    # -----------------------------------------
    source_feat = next((f for f in rec.features if f.type == "source"), None)
    q = source_feat.qualifiers if source_feat else {}

    def first(key):
        """Return the first value of a source qualifier, or None if absent/empty."""
        values = q.get(key)
        return values[0] if values else None

    mol_type = first("mol_type")
    organism = first("organism")
    strain = first("strain")
    note = first("note")
    plasmid = first("plasmid")
    chromosome = first("chromosome")
    completeness_hint = first("completeness")

    # A single db_xref is stored as a scalar, several as the full list.
    db_xref = None
    if "db_xref" in q:
        val = q["db_xref"]
        db_xref = val[0] if len(val) == 1 else val

    # -----------------------------------------
    # Infer topology (anything unrecognised is treated as linear)
    # -----------------------------------------
    topology = rec.annotations.get("topology")
    if topology not in {"linear", "circular"}:
        topology = "linear"

    # -----------------------------------------
    # Infer type
    # -----------------------------------------
    if plasmid is not None or "plasmid" in rec.annotations:
        seq_type = "plasmid"
    elif chromosome is not None or "chromosome" in rec.annotations:
        seq_type = "chromosome"
    else:
        seq_type = "contig"

    # -----------------------------------------
    # Infer completeness (conservative)
    # -----------------------------------------
    complete = (
        topology == "circular"
        or (completeness_hint is not None and completeness_hint.lower() == "complete")
        or bool(note and "complete genome" in note.lower())
    )

    # -----------------------------------------
    # Infer genetic code for description
    # -----------------------------------------
    if "genetic_code" in rec.annotations:
        gcode = rec.annotations["genetic_code"]
    elif "gcode" in rec.annotations:
        gcode = rec.annotations["gcode"]
    else:
        gcode = first("transl_table")

    # Conservative fallback to 1 for euks
    if gcode is None:
        gcode = 1

    description = " ".join([f"[gcode={gcode}]", f"[topology={topology}]"])

    # -----------------------------------------
    # Build entry
    # -----------------------------------------
    # Layout mirrors a Bakta sequence entry, e.g.
    #   "id": "contig_1",
    #   "description": "[gcode=11] [topology=linear]",
    #   "nt": "AT"
    #   "length": 5165988,
    #   "complete": false,
    #   "type": "contig",
    #   "topology": "linear",
    #   "simple_id": "contig_1",
    #   "orig_id": "GCF_002368115_000000000001",
    #   "orig_description": ""
    entry = {
        "id": rec.id,
        "description": description,
        "nt": seq,
        "length": len(seq),
        "complete": complete,
        "type": seq_type,
        "topology": topology,
        "simple_id": rec.id,
        "orig_id": rec.id,
        "orig_description": None,
    }

    # -----------------------------------------
    # Add source qualifiers only if they exist
    # -----------------------------------------
    optional_fields = (
        ("organism", organism),
        ("mol_type", mol_type),
        ("strain", strain),
        ("db_xref", db_xref),
        ("note", note),
    )
    for key, value in optional_fields:
        if value is not None:
            entry[key] = value

    return entry

`calc_genome_stats(records)`

Compute correct genome stats (size, GC, N-ratio, N50, N90) for records from a multi-contig GenBank file.

Source code in src/baktfold/io/eukaryotic_to_json.py

def calc_genome_stats(records):
    """
    Compute genome stats (size, GC, N-ratio, N50, N90) for records from a
    multi-contig GenBank file.

    Parameters:
        records: Non-empty sequence of records, each exposing ``seq``.

    Returns:
        dict: Keys ``size`` (total length), ``gc`` (result of ``gc_fraction``
        over the concatenated sequence), ``n_ratio`` (count of ``"N"`` divided
        by total length, ``0.0`` when the total length is zero), ``n50``,
        ``n90`` and ``coding_ratio`` (always ``None`` here).

    Raises:
        ValueError: If ``records`` is empty.
    """

    if not records:
        raise ValueError("No GenBank records found.")

    # lengths of all contigs
    contig_lengths = [len(r.seq) for r in records]
    total_length = sum(contig_lengths)

    # concatenate sequences for global GC + N calculation
    full_seq = "".join(str(r.seq) for r in records)

    # GC as fraction (Bakta wants 0–1)
    gc_perc = gc_fraction(full_seq)

    # N-ratio; guard against records that all carry empty sequences, which
    # would otherwise raise ZeroDivisionError.
    n_ratio = full_seq.count("N") / total_length if total_length else 0.0

    # ---------- N50 / N90 ----------
    sorted_lengths = sorted(contig_lengths, reverse=True)

    def nx_metric(sorted_lens, total, threshold):
        """
        Generic N{threshold} function.
        threshold: 0.5 for N50, 0.9 for N90
        """
        cutoff = total * threshold
        running = 0
        for contig_len in sorted_lens:
            running += contig_len
            if running >= cutoff:
                return contig_len
        return sorted_lens[-1]  # fallback (should not happen)

    n50 = nx_metric(sorted_lengths, total_length, 0.5)
    n90 = nx_metric(sorted_lengths, total_length, 0.9)

    return {
        "size": total_length,
        "gc": gc_perc,
        "n_ratio": n_ratio,
        "n50": n50,
        "n90": n90,
        "coding_ratio": None,
    }

`convert_assembly_gap_feature(feature, rec, id)`

Convert a GenBank assembly_gap feature to a simplified Bakta-style 'gap' feature.

Parameters:

Name	Type	Description	Default
`feature`		Bio.SeqFeature The assembly_gap feature from the GBK.	required
`rec`		Bio.SeqRecord The full GenBank record containing the sequence.	required

Returns:

Name	Type	Description
`dict`		Simplified Bakta-style gap feature.

Source code in src/baktfold/io/eukaryotic_to_json.py

def convert_assembly_gap_feature(feature, rec, id):
    """
    Convert a GenBank assembly_gap feature to a simplified Bakta-style 'gap' feature.

    Parameters:
        feature: Bio.SeqFeature
            The assembly_gap feature from the GBK.
        rec: Bio.SeqRecord
            The full GenBank record containing the sequence; only ``rec.id`` is used.
        id: str
            Feature ID stored under ``"id"``.

    Returns:
        dict: Simplified Bakta-style gap feature. ``length`` is the integer
        ``/estimated_length`` when one is given, otherwise the span implied by
        the coordinates (this includes ``/estimated_length=unknown``).
    """

    # Coordinates (1-based)
    strand = "."  # bakta uses "." for strand on gaps
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop = int(feature.location.end)          # already inclusive after conversion

    # ``or [None]`` also covers a present-but-empty qualifier list.
    est_values = feature.qualifiers.get("estimated_length") or [None]
    est_len = est_values[0]

    # INSDC allows /estimated_length=unknown, which int() cannot parse; fall
    # back to the exact span given by the coordinates in that case.
    try:
        length = int(est_len)
    except (TypeError, ValueError):
        length = stop - start + 1

    # no need to add estimated length separately - it is covered by length in the json
    gap_entry = {
        "type": "gap",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "length": length,
        "id": id,
    }

    return gap_entry

`convert_cds_feature(feature, seq_record, translation_table, id)`

Convert a Prokka CDS Biopython SeqFeature to a Bakta CDS JSON entry.

Source code in src/baktfold/io/eukaryotic_to_json.py

def convert_cds_feature(feature, seq_record, translation_table, id):
    """
    Convert a CDS Biopython SeqFeature to a Bakta CDS JSON entry.

    Parameters:
        feature: Bio.SeqFeature
            The CDS feature. Its qualifiers are read but not modified.
        seq_record: Bio.SeqRecord
            The record containing the feature; provides ``id`` and ``seq``.
        translation_table:
            Genetic code passed to ``translate`` when the feature carries no
            ``/translation`` qualifier.
        id: str
            Feature ID stored under ``"id"``.

    Returns:
        dict: Bakta-style CDS entry. ``aa`` is an empty string when no
        translation is available and translating the nucleotides fails.
    """

    # ----------- Location info -----------
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop = int(feature.location.end)          # already inclusive after conversion

    # Handle CompoundLocation (join): record every part's coordinates
    starts = None
    stops = None
    if feature.location.__class__.__name__ == "CompoundLocation":
        starts = [int(part.start) + 1 for part in feature.location.parts]
        stops = [int(part.end) for part in feature.location.parts]

    qualifiers = feature.qualifiers

    # frame: Bakta uses 1/2/3; codon_start is ["1","2","3"]
    frame = int(qualifiers.get("codon_start", ["1"])[0])

    # ----------- Basic qualifiers -----------
    gene = qualifiers.get("gene", [None])[0]
    product = qualifiers.get("product", [None])[0]

    # fall back to a generated start/stop based tag if there is no locus tag
    if 'locus_tag' in qualifiers and qualifiers['locus_tag']:
        locus_tag = qualifiers['locus_tag'][0]
    else:
        logger.warning(f"No locus_tag found for feature {id}")
        locus_tag = f"{GENOME_RANDOM_BACKUP_LOCUSTAG_STR}_{start}_{stop}"
        logger.warning(f"Generating a locus_tag: {locus_tag}")

    note = qualifiers.get("note", [None])[0]
    protein_id = qualifiers.get("protein_id", [None])[0]

    # ----------- Extract nucleotides -----------
    nt_seq = feature.extract(seq_record.seq)
    nt = str(nt_seq)

    # ----------- Extract amino acids -----------
    aa = qualifiers.get("translation", [""])[0]

    # Compute translation if the input didn't provide it (best effort)
    if not aa:
        try:
            aa = str(nt_seq.translate(table=translation_table, cds=True))
        except Exception:
            aa = ""

    # ----------- aa MD5 hexdigest -----------
    aa_hexdigest = hashlib.md5(aa.encode()).hexdigest()

    # ----------- Hypothetical? -----------
    hypothetical = product is None or "hypothetical protein" in product.lower()

    # ----------- Compute protein stats -----------
    seq_stats = None
    if aa:
        try:
            analysed = ProteinAnalysis(aa)
            seq_stats = {
                "molecular_weight": analysed.molecular_weight(),
                "isoelectric_point": analysed.isoelectric_point()
            }
        except Exception:
            seq_stats = None

    # Work on a copy: appending to the qualifier list itself would silently
    # add the SO id to the caller's feature as well.
    db_xref = list(qualifiers.get("db_xref", []))
    if so.SO_CDS.id not in db_xref:
        db_xref.append(so.SO_CDS.id)

    # ----------- Make Bakta-format dict -----------
    bakta_cds = {
        "type": "cds",
        "sequence": seq_record.id,
        "start": start,
        "stop": stop,
        "starts": starts,
        "stops": stops,
        "strand": strand,
        "frame": frame,
        "gene": gene,
        "product": product,
        "db_xrefs": db_xref,
        "nt": nt,
        "aa": aa,
        "aa_hexdigest": aa_hexdigest,
        "start_type": None,
        "rbs_motif": None,
        "genes": [],
        "note": note,
        "seq_stats": seq_stats,
        "id": id,
        "locus": locus_tag,
        "protein_id": protein_id
    }

    # INSDC feature key "CDS": coding sequence; sequence of nucleotides that
    # corresponds with the sequence of amino acids in a protein (location
    # includes stop codon); feature includes amino acid conceptual translation.
    #
    # Optional qualifiers:
    #   /allele, /artificial_location, /circular_RNA, /codon_start, /db_xref,
    #   /EC_number, /exception, /experiment, /function, /gene, /gene_synonym,
    #   /inference, /locus_tag, /map, /note, /number, /old_locus_tag, /operon,
    #   /product, /protein_id, /pseudo, /pseudogene, /ribosomal_slippage,
    #   /standard_name, /translation, /transl_except, /transl_table,
    #   /trans_splicing
    multi_valued = {"EC_number", "exception", "experiment", "function",  "gene_synonym",  "inference", }
    single_valued = {"allele", "artificial_location",  "map", "number",  "old_locus_tag", "operon", "phenotype", "pseudogene", "standard_name", "transl_except", "transl_table"}

    add_optional_qualifiers(bakta_cds, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers): presence alone means True
    for flag in ["circular_RNA", "pseudo", "ribosomal_slippage", "trans_splicing"]:
        if flag in qualifiers:
            bakta_cds[flag] = True

    if hypothetical:
        bakta_cds["hypothetical"] = True

    return bakta_cds

`convert_exon_feature(feature, rec, id)`

Convert a GenBank exon feature to a simplified Bakta-style 'exon' feature.

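Parameters:

Name	Type	Description	Default
`feature`	`Bio.SeqFeature`	The exon feature from the GenBank record.	required
`rec`	`Bio.SeqRecord`	The full GenBank record.	required
`id`	`str`	Unique feature ID.	required

Returns:

Name	Type	Description
`dict`	`dict`	Bakta-style exon feature.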

Source code in src/baktfold/io/eukaryotic_to_json.py

def convert_exon_feature(feature, rec, id):
    """
    Build a simplified Bakta-style 'exon' entry from a GenBank exon feature.

    Parameters
    ----------
    feature : Bio.SeqFeature
        The exon feature from the GenBank record.
    rec : Bio.SeqRecord
        The full GenBank record; only ``rec.id`` is used.
    id : str
        Unique feature ID.

    Returns
    -------
    dict
        Bakta-style exon feature.
    """

    location = feature.location
    quals = feature.qualifiers

    exon_strand = "+" if location.strand == 1 else "-"
    first_base = int(location.start) + 1   # Bakta uses 1-based inclusive
    last_base = int(location.end)          # already inclusive after conversion

    exon_entry = {
        "type": "exon",
        "sequence": rec.id,
        "start": first_base,
        "stop": last_base,
        "strand": exon_strand,
        "id": id,
        "db_xrefs": quals.get("db_xref", []),
    }

    # INSDC optional qualifiers for exon:
    #   /allele, /db_xref, /EC_number, /experiment, /function, /gene,
    #   /gene_synonym, /inference, /locus_tag, /map, /note, /number,
    #   /old_locus_tag, /product, /pseudo, /pseudogene, /standard_name,
    #   /trans_splicing
    list_like = {"EC_number", "experiment", "function", "gene_synonym", "inference", "note"}
    scalar_like = {"allele", "gene", "locus_tag", "map", "number", "old_locus_tag", "operon", "pseudogene", "standard_name"}

    add_optional_qualifiers(exon_entry, quals, scalar_like, list_like)

    # Flags (boolean-like qualifiers): presence alone means True
    exon_entry.update(
        (flag, True) for flag in ("pseudo", "trans_splicing") if flag in quals
    )

    return exon_entry

`convert_gene_feature(feature, rec, id)`

Convert a Funannotate GenBank gene feature to Bakta-style JSON.

Parameters:

Name	Type	Description	Default
`feature`		Bio.SeqFeature The gene feature from the GBK.	required
`rec`		Bio.SeqRecord The record from the GBK.	required

Returns:

Name	Type	Description
`dict`		Bakta-style gene feature

Source code in src/baktfold/io/eukaryotic_to_json.py

def convert_gene_feature(feature, rec, id):
    """
    Convert a GenBank gene feature to Bakta-style JSON.

    Parameters:
        feature: Bio.SeqFeature
            The gene feature from the GBK.
        rec:
            The record from the GBK; only ``rec.id`` is used.
        id:
            Feature ID stored under ``"id"``.
    Returns:
        dict: Bakta-style gene feature
    """

    location = feature.location
    quals = feature.qualifiers

    # Extract location
    gene_strand = "+" if location.strand == 1 else "-"
    first_base = int(location.start) + 1   # Bakta uses 1-based inclusive
    last_base = int(location.end)          # already inclusive after conversion

    # Use the annotated locus tag when there is one; otherwise generate a
    # backup tag from the coordinates.
    existing_tags = quals.get("locus_tag")
    if existing_tags:
        locus_tag = existing_tags[0]
    else:
        logger.warning(f"No locus_tag found for feature {id}")
        locus_tag = f"{GENOME_RANDOM_BACKUP_LOCUSTAG_STR}_{first_base}_{last_base}"
        logger.warning(f"Generating a locus_tag: {locus_tag}")

    gene_entry = {
        "type": "gene",
        "sequence": rec.id,
        "start": first_base,
        "stop": last_base,
        "strand": gene_strand,
        "db_xrefs": [so.SO_GENE.id],
        "id": id,
        "locus": locus_tag,
    }

    # INSDC feature key "gene": region of biological interest identified as a
    # gene and for which a name has been assigned. The feature describes the
    # interval of DNA that corresponds to a genetic trait or phenotype; it is,
    # by definition, not strictly bound to its positions at the ends and is
    # meant to represent a region where the gene is located.
    #
    # Optional qualifiers:
    #   /allele, /db_xref, /experiment, /function, /gene, /gene_synonym,
    #   /inference, /locus_tag, /map, /note, /old_locus_tag, /operon,
    #   /product, /pseudo, /pseudogene, /phenotype, /standard_name,
    #   /trans_splicing
    list_like = {"experiment", "function", "gene_synonym", "inference", "note"}
    scalar_like = {"allele", "map", "old_locus_tag", "operon", "phenotype", "standard_name"}

    add_optional_qualifiers(gene_entry, quals, scalar_like, list_like)

    # Flags (boolean-like qualifiers): presence alone means True
    for flag in ("pseudo", "trans_splicing"):
        if flag in quals:
            gene_entry[flag] = True

    return gene_entry

`convert_mat_peptide_feature(feature, rec, id)`

Convert a mat_peptide feature to a Bakta-style feature.

mus musculus chrom 1 NC_000067

Parameters:

Name	Type	Description	Default
`feature`		Bio.SeqFeature The mat_peptide feature from the GBK.	required
`rec`		Bio.SeqRecord The record containing the sequence.	required

Returns:

Name	Type	Description
`dict`		Bakta-style mat_peptide feature.

Source code in src/baktfold/io/eukaryotic_to_json.py

def convert_mat_peptide_feature(feature, rec, id):
    """
    Convert a GenBank mat_peptide feature to a Bakta-style feature.

    Example source record: Mus musculus chromosome 1 (NC_000067).

    Parameters:
        feature: Bio.SeqFeature
            The mat_peptide feature from the GBK. Its qualifiers are read
            but never modified.
        rec: Bio.SeqRecord
            The record containing the feature; only ``rec.id`` is used.
        id:
            Identifier stored under the "id" key of the returned entry.

    Returns:
        dict: Bakta-style mat_peptide feature.
    """
    location = feature.location
    qualifiers = feature.qualifiers

    # Any strand value other than 1 (including None) is reported as "-".
    strand = "+" if location.strand == 1 else "-"
    start = int(location.start) + 1   # Bakta uses 1-based inclusive
    stop = int(location.end)          # end is used as-is

    # Join support: per-part coordinates only for a CompoundLocation,
    # otherwise both stay None.
    starts = None
    stops = None
    if location.__class__.__name__ == "CompoundLocation":
        starts = [int(part.start) + 1 for part in location.parts]
        stops = [int(part.end) for part in location.parts]

    so_code = so.SO_MAT_PEPTIDE.id

    # Work on a copy so that adding the SO term never mutates the list
    # owned by feature.qualifiers.
    db_xrefs = list(qualifiers.get("db_xref", []))
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)

    mat_peptide_entry = {
        "type": "mat_peptide",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        # Join support
        "starts": starts,
        "stops": stops,
        "strand": strand,
        "id": id,
        "db_xrefs": db_xrefs,
    }

    # INSDC feature table reference
    #
    # Feature Key           mat_peptide
    #
    # Definition            mature peptide or protein coding sequence; coding
    #                       sequence for the mature or final peptide or protein
    #                       product following post-translational modification; the
    #                       location does not include the stop codon (unlike the
    #                       corresponding CDS);
    #
    # Optional qualifiers   /allele="text"
    #                       /db_xref="<database>:<identifier>"
    #                       /EC_number="text"
    #                       /experiment="[CATEGORY:]text"
    #                       /function="text"
    #                       /gene="text"
    #                       /gene_synonym="text"
    #                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
    #                       /locus_tag="text" (single token)
    #                       /map="text"
    #                       /note="text"
    #                       /old_locus_tag="text" (single token)
    #                       /product="text"
    #                       /pseudo
    #                       /pseudogene="TYPE"
    #                       /standard_name="text"

    multi_valued = {"EC_number", "experiment", "function", "gene_synonym", "inference", "note"}
    # "product" is listed by INSDC for mat_peptide (and appears in the example
    # below); it was previously missing from both sets and therefore dropped.
    single_valued = {
        "allele", "gene", "locus_tag", "map", "number", "old_locus_tag",
        "operon", "product", "pseudogene", "standard_name",
    }

    add_optional_qualifiers(mat_peptide_entry, qualifiers, single_valued, multi_valued)

    # No boolean flag qualifiers are exported for mat_peptide
    # (/pseudo is not handled here).

    # Example:
    #  mat_peptide     complement(join(194724303..194724321,194744661..194744721,
    #                  194746996..194747031,194750435..194750476,
    #                  194757818..194757865,194759962..194760144,
    #                  194764890..194765087,194765856..194765944,
    #                  194767641..194767743,194768400..194768583))
    #                  /gene="Cd46"
    #                  /gene_synonym="Mcp"
    #                  /product="Membrane cofactor protein. /id=PRO_0000238971"
    #                  /note="propagated from UniProtKB/Swiss-Prot (O88174.1)"

    return mat_peptide_entry

`convert_misc_feature(feature, rec, id)`

Convert a misc feature to a Bakta-style feature.

Parameters:

Name	Type	Description	Default
`feature`		Bio.SeqFeature The misc_feature feature from the GBK.	required
`rec`		Bio.SeqRecord The record containing the sequence.	required

Returns:

Name	Type	Description
`dict`		Bakta-style misc_feature feature.

Source code in src/baktfold/io/eukaryotic_to_json.py

def convert_misc_feature(feature, rec, id):
    """
    Convert a GenBank misc_feature feature to a Bakta-style feature.

    Parameters:
        feature: Bio.SeqFeature
            The misc_feature feature from the GBK. Its qualifiers are read
            but never modified.
        rec: Bio.SeqRecord
            The record containing the feature; only ``rec.id`` is used.
        id:
            Identifier stored under the "id" key of the returned entry.

    Returns:
        dict: Bakta-style misc_feature feature.
    """
    location = feature.location
    qualifiers = feature.qualifiers

    # Any strand value other than 1 (including None) is reported as "-".
    strand = "+" if location.strand == 1 else "-"
    start = int(location.start) + 1   # Bakta uses 1-based inclusive
    stop = int(location.end)          # end is used as-is

    # Join support: per-part coordinates only for a CompoundLocation,
    # otherwise both stay None.
    starts = None
    stops = None
    if location.__class__.__name__ == "CompoundLocation":
        starts = [int(part.start) + 1 for part in location.parts]
        stops = [int(part.end) for part in location.parts]

    so_code = so.SO_MISC_REGION.id

    # Work on a copy so that adding the SO term never mutates the list
    # owned by feature.qualifiers.
    db_xrefs = list(qualifiers.get("db_xref", []))
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)

    misc_feature_entry = {
        "type": "misc_feature",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "id": id,

        # Join support
        "starts": starts,
        "stops": stops,

        # Multi-valued
        "db_xrefs": db_xrefs,
    }

    # NOTE(review): INSDC also lists /number for misc_feature (see below) but
    # it is not in either set, so it is not exported - confirm whether that
    # is intended.
    multi_valued = {"experiment", "function", "gene_synonym", "inference", "note", "phenotype"}
    single_valued = {
        "allele", "gene", "locus_tag", "map", "old_locus_tag", "operon",
        "product", "standard_name", "pseudogene",
    }

    add_optional_qualifiers(misc_feature_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers): presence of the key means True.
    if "pseudo" in qualifiers:
        misc_feature_entry["pseudo"] = True

    # INSDC feature table reference
    #
    # Feature Key           misc_feature
    #
    # Definition            region of biological interest which cannot be described
    #                       by any other feature key; a new or rare feature;
    #
    # Optional qualifiers   /allele="text"
    #                       /db_xref="<database>:<identifier>"
    #                       /experiment="[CATEGORY:]text"
    #                       /function="text"
    #                       /gene="text"
    #                       /gene_synonym="text"
    #                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
    #                       /locus_tag="text" (single token)
    #                       /map="text"
    #                       /note="text"
    #                       /number=unquoted text (single token)
    #                       /old_locus_tag="text" (single token)
    #                       /phenotype="text"
    #                       /product="text"
    #                       /pseudo
    #                       /pseudogene="TYPE"
    #                       /standard_name="text"
    #
    # Comment               this key should not be used when the need is merely to
    #                       mark a region in order to comment on it or to use it in
    #                       another feature's location

    # Examples:
    #  misc_feature    join(78488668..78488692,78499322..78499359)
    #                  /gene="Mogat1"
    #                  /gene_synonym="0610030A14Rik; 1110064N14Rik; Dgat2l;
    #                  Dgat2l1; mDC2; MGAT1; WI1-2612I11.1"
    #                  /note="propagated from UniProtKB/Swiss-Prot (Q91ZV4.2);
    #                  transmembrane region"
    #
    #  misc_feature    78179419..78180585
    #                  /standard_name="Pax3 upstream hypaxial enhancer"
    #                  /note="Region: biological region; Derived by automated
    #                  computational analysis using gene prediction method:
    #                  RefSeqFE."
    #                  /function="regulatory_interactions: LOC107980439 | Pax3"
    #                  /db_xref="GeneID:107980442"

    return misc_feature_entry

`convert_misc_rna_feature(feature, rec, id)`

Convert a GenBank misc_rna feature to a simplified Bakta-style 'misc_rna' feature.

Parameters:

Name	Type	Description	Default
`feature`		Bio.SeqFeature The misc_RNA feature from the GBK.	required
`rec`		Bio.SeqRecord The full GenBank record containing the sequence.	required

Returns:

Name	Type	Description
`dict`		Simplified Bakta-style misc_RNA feature.

Source code in src/baktfold/io/eukaryotic_to_json.py

def convert_misc_rna_feature(feature, rec, id):
    """
    Convert a GenBank misc_RNA feature to a simplified Bakta-style 'misc_RNA' feature.

    Parameters:
        feature: Bio.SeqFeature
            The misc_RNA feature from the GBK.
        rec: Bio.SeqRecord
            The GenBank record containing the feature; only ``rec.id`` is used.
        id:
            Identifier stored under the "id" key of the returned entry.

    Returns:
        dict: Simplified Bakta-style misc_RNA feature. The "gene" key is always
        present and is None when the feature has no /gene qualifier.
    """

    # Example (from Ensembl Genomes):
    #   misc_RNA        complement(437333..442742)
    #               /gene="YPL060C-A"
    #               /note="transposable_element"
    #               /standard_name="YPL060C-A"

    # INSDC feature table reference
    #
    # Feature Key           misc_RNA
    #
    # Definition            any transcript or RNA product that cannot be defined by
    #                       other RNA keys (prim_transcript, precursor_RNA, mRNA,
    #                       5'UTR, 3'UTR, exon, CDS, sig_peptide, transit_peptide,
    #                       mat_peptide, intron, polyA_site, ncRNA, rRNA and tRNA);
    #
    # Optional qualifiers   /allele="text"
    #                       /db_xref="<database>:<identifier>"
    #                       /experiment="[CATEGORY:]text"
    #                       /function="text"
    #                       /gene="text"
    #                       /gene_synonym="text"
    #                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
    #                       /locus_tag="text" (single token)
    #                       /map="text"
    #                       /note="text"
    #                       /old_locus_tag="text" (single token)
    #                       /operon="text"
    #                       /product="text"
    #                       /pseudo
    #                       /pseudogene="TYPE"
    #                       /standard_name="text"
    #                       /trans_splicing

    loc = feature.location

    # Any strand value other than 1 (including None) is reported as "-".
    strand = "-" if loc.strand != 1 else "+"
    start = int(loc.start) + 1   # Bakta uses 1-based inclusive
    stop = int(loc.end)          # end is used as-is

    quals = feature.qualifiers
    gene_values = quals.get("gene", [None])

    # NOTE(review): an earlier comment on "type" said a lowercase value is
    # expected, yet "misc_RNA" is what is emitted - confirm with consumers.
    misc_rna_entry = {
        "type": "misc_RNA",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,  # matches Bakta and is required
        "gene": gene_values[0],
        "id": id,
    }

    repeatable = {"experiment", "function", "gene_synonym", "inference", "note"}
    scalar = {
        "allele", "gene", "locus_tag", "map", "old_locus_tag", "operon",
        "product", "phenotype", "standard_name",
    }
    add_optional_qualifiers(misc_rna_entry, quals, scalar, repeatable)

    # Flags (boolean-like qualifiers): presence of the key means True.
    misc_rna_entry.update(
        {flag: True for flag in ("pseudo", "trans_splicing") if flag in quals}
    )

    return misc_rna_entry

`convert_mobile_element_feature(feature, rec, id)`

Convert a GenBank mobile_element feature to a Bakta-style feature.

Source code in src/baktfold/io/eukaryotic_to_json.py

def convert_mobile_element_feature(feature, rec, id):
    """
    Convert a GenBank mobile_element feature to a Bakta-style feature.

    Parameters:
        feature: Bio.SeqFeature
            The mobile_element feature from the GBK. Its qualifiers are read
            but never modified.
        rec: Bio.SeqRecord
            The record containing the feature; only ``rec.id`` is used.
        id:
            Identifier stored under the "id" key of the returned entry and
            quoted in the error message.

    Returns:
        dict: Bakta-style mobile_element feature.

    Raises:
        ValueError: If the mandatory /mobile_element_type qualifier is missing.
    """
    location = feature.location
    qualifiers = feature.qualifiers

    # Any strand value other than 1 (including None) is reported as "-".
    strand = "+" if location.strand == 1 else "-"
    start = int(location.start) + 1   # Bakta uses 1-based inclusive
    stop = int(location.end)          # end is used as-is

    # Mandatory qualifier check (INSDC requirement)
    mobile_element_type = qualifiers.get("mobile_element_type", [None])[0]
    if mobile_element_type is None:
        raise ValueError(
            f"mobile_element feature {id} is missing mandatory "
            "/mobile_element_type qualifier"
        )

    so_code = so.SO_MOBILE_ELEMENT.id

    # Work on a copy so that adding the SO term never mutates the list
    # owned by feature.qualifiers.
    db_xrefs = list(qualifiers.get("db_xref", []))
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)

    # INSDC feature table reference
    #
    # Feature Key           mobile_element
    #
    # Definition            region of genome containing mobile elements;
    #
    # Mandatory qualifiers  /mobile_element_type="<mobile_element_type>
    #                       [:<mobile_element_name>]"
    #
    # Optional qualifiers   /allele="text"
    #                       /db_xref="<database>:<identifier>"
    #                       /experiment="[CATEGORY:]text"
    #                       /function="text"
    #                       /gene="text"
    #                       /gene_synonym="text"
    #                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
    #                       /locus_tag="text" (single token)
    #                       /map="text"
    #                       /note="text"
    #                       /old_locus_tag="text" (single token)
    #                       /rpt_family="text"
    #                       /rpt_type=<repeat_type>
    #                       /standard_name="text"

    mobile_element_entry = {
        "type": "mobile_element",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "id": id,
        "db_xrefs": db_xrefs,
        # Mandatory
        "mobile_element_type": mobile_element_type,
    }

    multi_valued = {"experiment", "function", "gene_synonym", "inference", "note"}
    single_valued = {
        "allele", "gene", "locus_tag", "map", "old_locus_tag",
        "standard_name", "rpt_family", "rpt_type",
    }

    add_optional_qualifiers(mobile_element_entry, qualifiers, single_valued, multi_valued)

    # No boolean flag qualifiers are exported for mobile_element.

    # Example:
    #  mobile_element  57369551..57369723
    #                  /note="Derived by automated computational analysis using
    #                  gene prediction method: RefSeqFE."
    #                  /mobile_element_type="SINE:AmnSINE1"
    #                  /db_xref="GeneID:106707176"

    return mobile_element_entry

`convert_mrna_feature(feature, rec, id)`

Convert a funannotate mrna feature to a Bakta-style feature.

Parameters:

Name	Type	Description	Default
`feature`		Bio.SeqFeature The mRNA feature from the GBK.	required
`rec`		Bio.SeqRecord The record containing the sequence.	required

Returns:

Name	Type	Description
`dict`		Bakta-style mRNA feature.

Source code in src/baktfold/io/eukaryotic_to_json.py

def convert_mrna_feature(feature, rec, id):
    """
    Convert a funannotate mRNA feature to a Bakta-style feature.

    Parameters:
        feature: Bio.SeqFeature
            The mRNA feature from the GBK.
        rec: Bio.SeqRecord
            The record containing the feature; only ``rec.id`` is used.
        id:
            Identifier stored under the "id" key of the returned entry and
            quoted in the warning emitted when no locus_tag is present.

    Returns:
        dict: Bakta-style mRNA feature. Its "locus" key holds the first
        /locus_tag value, or a generated
        ``<GENOME_RANDOM_BACKUP_LOCUSTAG_STR>_<start>_<stop>`` tag when the
        qualifier is absent or empty.
    """
    loc = feature.location

    # Any strand value other than 1 (including None) is reported as "-".
    strand = "-" if loc.strand != 1 else "+"
    start = int(loc.start) + 1   # Bakta uses 1-based inclusive
    stop = int(loc.end)          # end is used as-is

    # Join support: per-part coordinates only for a CompoundLocation,
    # otherwise both stay None.
    is_join = loc.__class__.__name__ == "CompoundLocation"
    starts = [int(segment.start) + 1 for segment in loc.parts] if is_join else None
    stops = [int(segment.end) for segment in loc.parts] if is_join else None

    qualifiers = feature.qualifiers

    # Fall back to a start/stop based tag if there is no usable locus tag.
    locus_tags = qualifiers.get("locus_tag")
    if not locus_tags:
        logger.warning(f"No locus_tag found for feature {id}")
        locus_tag = f"{GENOME_RANDOM_BACKUP_LOCUSTAG_STR}_{start}_{stop}"
        logger.warning(f"Generating a locus_tag: {locus_tag}")
    else:
        locus_tag = locus_tags[0]

    mrna_entry = {
        "type": "mRNA",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "starts": starts,
        "stops": stops,
        "strand": strand,
        "db_xrefs": [so.SO_MRNA.id],
        "id": id,
        "locus": locus_tag,
    }

    # INSDC feature table reference
    #
    # Feature Key           mRNA
    #
    # Definition            messenger RNA; includes 5'untranslated region (5'UTR),
    #                       coding sequences (CDS, exon) and 3'untranslated region
    #                       (3'UTR);
    #
    # Optional qualifiers   /allele="text"
    #                       /artificial_location="[artificial_location_value]"
    #                       /circular_RNA
    #                       /db_xref="<database>:<identifier>"
    #                       /experiment="[CATEGORY:]text"
    #                       /function="text"
    #                       /gene="text"
    #                       /gene_synonym="text"
    #                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
    #                       /locus_tag="text" (single token)
    #                       /map="text"
    #                       /note="text"
    #                       /old_locus_tag="text" (single token)
    #                       /operon="text"
    #                       /product="text"
    #                       /pseudo
    #                       /pseudogene="TYPE"
    #                       /standard_name="text"
    #                       /trans_splicing

    repeatable = {"experiment", "function", "gene_synonym", "inference", "note"}
    scalar = {
        "allele", "artificial_location", "gene", "locus_tag", "map",
        "old_locus_tag", "operon", "phenotype", "product", "standard_name",
    }
    add_optional_qualifiers(mrna_entry, qualifiers, scalar, repeatable)

    # Flags (boolean-like qualifiers): presence of the key means True.
    mrna_entry.update(
        {
            flag: True
            for flag in ("circular_RNA", "pseudo", "trans_splicing")
            if flag in qualifiers
        }
    )

    return mrna_entry

`convert_ncrna_feature(feature, rec, id)`

Convert an ncRNA feature to a Bakta-style feature.

Parameters:

Name	Type	Description	Default
`feature`		Bio.SeqFeature The ncRNA feature from the GBK.	required
`rec`		Bio.SeqRecord The record containing the sequence.	required

Returns:

Name	Type	Description
`dict`		Bakta-style ncRNA feature.

Source code in src/baktfold/io/eukaryotic_to_json.py

def convert_ncrna_feature(feature, rec, id):
    """
    Convert a GenBank ncRNA feature to a Bakta-style feature.

    Parameters:
        feature: Bio.SeqFeature
            The ncRNA feature from the GBK. Its qualifiers are read but
            never modified.
        rec: Bio.SeqRecord
            The record containing the feature; only ``rec.id`` is used.
        id:
            Identifier stored under the "id" key of the returned entry and
            quoted in the error message.

    Returns:
        dict: Bakta-style ncRNA feature.

    Raises:
        ValueError: If the mandatory /ncRNA_class qualifier is missing.
    """
    location = feature.location
    qualifiers = feature.qualifiers

    # Mandatory qualifier (INSDC requirement); checked before any other work
    # so that a rejected feature is left untouched.
    ncrna_class = qualifiers.get("ncRNA_class", [None])[0]
    if ncrna_class is None:
        raise ValueError(
            f"ncRNA feature {id} is missing mandatory /ncRNA_class qualifier"
        )

    # Any strand value other than 1 (including None) is reported as "-".
    strand = "+" if location.strand == 1 else "-"
    start = int(location.start) + 1   # Bakta uses 1-based inclusive
    stop = int(location.end)          # end is used as-is

    # Join support: per-part coordinates only for a CompoundLocation,
    # otherwise both stay None.
    starts = None
    stops = None
    if location.__class__.__name__ == "CompoundLocation":
        starts = [int(part.start) + 1 for part in location.parts]
        stops = [int(part.end) for part in location.parts]

    so_code = so.SO_NCRNA_GENE.id

    # Work on a copy so that adding the SO term never mutates the list
    # owned by feature.qualifiers.
    db_xrefs = list(qualifiers.get("db_xref", []))
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)

    ncrna_entry = {
        "type": "ncRNA",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "id": id,

        # Join support
        "starts": starts,
        "stops": stops,

        # Mandatory
        "ncRNA_class": ncrna_class,

        # Multi-valued qualifiers
        "db_xrefs": db_xrefs,
    }

    # INSDC feature table reference
    #
    # Feature Key           ncRNA
    #
    # Definition            a non-protein-coding gene, other than ribosomal RNA and
    #                       transfer RNA, the functional molecule of which is the RNA
    #                       transcript;
    #
    # Mandatory qualifiers  /ncRNA_class="TYPE"
    #
    # Optional qualifiers   /allele="text"
    #                       /db_xref="<database>:<identifier>"
    #                       /experiment="[CATEGORY:]text"
    #                       /function="text"
    #                       /gene="text"
    #                       /gene_synonym="text"
    #                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
    #                       /locus_tag="text" (single token)
    #                       /map="text"
    #                       /note="text"
    #                       /old_locus_tag="text" (single token)
    #                       /operon="text"
    #                       /product="text"
    #                       /pseudo
    #                       /pseudogene="TYPE"
    #                       /standard_name="text"
    #                       /trans_splicing
    #
    # Example               /ncRNA_class="miRNA"
    #                       /ncRNA_class="siRNA"
    #                       /ncRNA_class="scRNA"
    #
    # Comment               the ncRNA feature is not used for ribosomal and transfer
    #                       RNA annotation, for which the rRNA and tRNA feature keys
    #                       should be used, respectively;

    multi_valued = {"experiment", "function", "gene_synonym", "inference", "note"}
    single_valued = {
        "allele", "gene", "locus_tag", "map", "old_locus_tag", "operon",
        "product", "standard_name", "pseudogene",
    }

    add_optional_qualifiers(ncrna_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers): presence of the key means True.
    for flag in ("pseudo", "trans_splicing"):
        if flag in qualifiers:
            ncrna_entry[flag] = True

    # Example:
    #  ncRNA           join(189791085..189791793,189798997..189799081,
    #                  189819873..189820364,189821703..189822337)
    #                  /ncRNA_class="lncRNA"
    #                  /gene="Gm30446"
    #                  /product="predicted gene, 30446, transcript variant X6"
    #                  /note="Derived by automated computational analysis using
    #                  gene prediction method: Gnomon. Supporting evidence
    #                  includes similarity to: 100% coverage of the annotated
    #                  genomic feature by RNAseq alignments, including 2 samples
    #                  with support for all annotated introns"
    #                  /transcript_id="XR_001779629.1"
    #                  /db_xref="GeneID:102632350"
    #                  /db_xref="MGI:MGI:5589605"

    return ncrna_entry

`convert_precursor_rna_feature(feature, rec, id)`

Convert a GenBank precursor_RNA feature to a Bakta-style feature.

Source code in src/baktfold/io/eukaryotic_to_json.py

def convert_precursor_rna_feature(feature, rec, id):
    """
    Convert a GenBank precursor_RNA feature to a Bakta-style feature.

    Parameters:
        feature: Bio.SeqFeature
            The precursor_RNA feature from the GBK. Its qualifiers are read
            but never modified.
        rec: Bio.SeqRecord
            The record containing the feature; only ``rec.id`` is used.
        id:
            Identifier stored under the "id" key of the returned entry.

    Returns:
        dict: Bakta-style precursor_RNA feature.
    """
    location = feature.location
    qualifiers = feature.qualifiers

    # Any strand value other than 1 (including None) is reported as "-".
    strand = "+" if location.strand == 1 else "-"
    start = int(location.start) + 1   # Bakta uses 1-based inclusive
    stop = int(location.end)          # end is used as-is

    so_code = so.SO_PRECURSOR_RNA.id

    # Work on a copy so that adding the SO term never mutates the list
    # owned by feature.qualifiers.
    db_xrefs = list(qualifiers.get("db_xref", []))
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)

    precursor_rna_entry = {
        "type": "precursor_RNA",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "db_xrefs": db_xrefs,
        "id": id,
    }

    multi_valued = {"experiment", "function", "gene_synonym", "inference", "note"}
    single_valued = {
        "allele", "gene", "locus_tag", "map", "operon", "old_locus_tag",
        "product", "standard_name",
    }

    add_optional_qualifiers(precursor_rna_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers): presence of the key means True.
    if "trans_splicing" in qualifiers:
        precursor_rna_entry["trans_splicing"] = True

    # INSDC feature table reference
    #
    # Feature Key           precursor_RNA
    #
    # Definition            any RNA species that is not yet the mature RNA product;
    #                       may include ncRNA, rRNA, tRNA, 5' untranslated region
    #                       (5'UTR), coding sequences (CDS, exon), intervening
    #                       sequences (intron) and 3' untranslated region (3'UTR);
    #
    # Optional qualifiers   /allele="text"
    #                       /db_xref="<database>:<identifier>"
    #                       /experiment="[CATEGORY:]text"
    #                       /function="text"
    #                       /gene="text"
    #                       /gene_synonym="text"
    #                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
    #                       /locus_tag="text" (single token)
    #                       /map="text"
    #                       /note="text"
    #                       /old_locus_tag="text" (single token)
    #                       /operon="text"
    #                       /product="text"
    #                       /standard_name="text"
    #                       /trans_splicing

    # Example:
    #  precursor_RNA   194719348..194719428
    #                  /gene="Mir29b-2"
    #                  /gene_synonym="mir-29b-2; Mirn29b-2"
    #                  /product="microRNA 29b-2"
    #                  /note="Derived by automated computational analysis using
    #                  gene prediction method: BestRefSeq."
    #                  /transcript_id="NR_029809.1"
    #                  /db_xref="GeneID:723963"
    #                  /db_xref="MGI:MGI:3619047"
    #                  /db_xref="miRBase:MI0000712"

    return precursor_rna_entry

`convert_proprotein_propeptide_feature(feature, rec, id)`

Convert a proprotein or propeptide feature to a Bakta-style feature.

Parameters:

Name	Type	Description	Default
`feature`		Bio.SeqFeature The proprotein or propeptide feature from the GBK.	required
`rec`		Bio.SeqRecord The record containing the sequence.	required

Returns:

Name	Type	Description
`dict`		Bakta-style proprotein feature.

Source code in src/baktfold/io/eukaryotic_to_json.py

def convert_proprotein_propeptide_feature(feature, rec, id):
    """
    Convert a proprotein or propeptide feature to a Bakta-style feature.

    Parameters:
        feature: Bio.SeqFeature
            The proprotein/propeptide feature from the GBK. Its qualifiers
            are read but never modified.
        rec: Bio.SeqRecord
            The record containing the feature; only ``rec.id`` is used.
        id:
            Identifier stored under the "id" key of the returned entry.

    Returns:
        dict: Bakta-style propeptide feature (its "type" is always
        "propeptide"). "db_xrefs" always contains the propeptide SO term.
    """
    location = feature.location
    qualifiers = feature.qualifiers

    # Any strand value other than 1 (including None) is reported as "-".
    strand = "+" if location.strand == 1 else "-"
    start = int(location.start) + 1   # Bakta uses 1-based inclusive
    stop = int(location.end)          # end is used as-is

    # Join support: per-part coordinates only for a CompoundLocation,
    # otherwise both stay None.
    starts = None
    stops = None
    if location.__class__.__name__ == "CompoundLocation":
        starts = [int(part.start) + 1 for part in location.parts]
        stops = [int(part.end) for part in location.parts]

    so_code = so.SO_PROPEPTIDE.id

    # Work on a copy so that adding the SO term never mutates the list
    # owned by feature.qualifiers.
    db_xrefs = list(qualifiers.get("db_xref", []))
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)

    propeptide_entry = {
        "type": "propeptide",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "id": id,

        # Join support
        "starts": starts,
        "stops": stops,

        # Multi-valued. Store the list built above: re-reading
        # qualifiers["db_xref"] here used to drop the SO term whenever the
        # feature carried no /db_xref at all.
        "db_xrefs": db_xrefs,
    }

    # INSDC feature table reference
    #
    # Feature Key           propeptide
    #
    # Definition            propeptide coding sequence; coding sequence for the domain of a
    #                       proprotein that is cleaved to form the mature protein product.
    #
    # Optional qualifiers   /allele="text"
    #                       /db_xref="<database>:<identifier>"
    #                       /experiment="[CATEGORY:]text"
    #                       /function="text"
    #                       /gene="text"
    #                       /gene_synonym="text"
    #                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
    #                       /locus_tag="text" (single token)
    #                       /map="text"
    #                       /note="text"
    #                       /old_locus_tag="text" (single token)
    #                       /product="text"
    #                       /pseudo
    #                       /pseudogene="TYPE"
    #                       /standard_name="text"

    multi_valued = {"experiment", "function", "gene_synonym", "inference", "note"}
    single_valued = {
        "allele", "gene", "locus_tag", "map", "old_locus_tag", "product",
        "standard_name", "pseudogene",
    }

    add_optional_qualifiers(propeptide_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers): presence of the key means True.
    if "pseudo" in qualifiers:
        propeptide_entry["pseudo"] = True

    # Example:
    #  proprotein      join(171053237..171053367,171053712..171053832)
    #                  /gene="Apoa2"
    #                  /gene_synonym="Alp-2; Apo-AII; Apoa-2; ApoA-II; ApoAII;
    #                  Hdl-1"
    #                  /product="apolipoprotein A-II proprotein"

    return propeptide_entry

`convert_protein_bind_feature(feature, rec, id)`

Convert a GenBank protein_bind feature to a Bakta-style feature.

Source code in src/baktfold/io/eukaryotic_to_json.py

def convert_protein_bind_feature(feature, rec, id):
    """
    Convert a GenBank protein_bind feature to a Bakta-style feature.

    INSDC defines protein_bind as a non-covalent protein binding site on
    nucleic acid. Ribosome binding sites should use the regulatory key with
    /regulatory_class="ribosome_binding_site" instead.

    Parameters:
        feature: Bio.SeqFeature
            The protein_bind feature; its location and qualifiers are read.
            The feature's own qualifier lists are left unmodified.
        rec: Bio.SeqRecord
            The record the feature belongs to; only rec.id is used.
        id:
            Identifier stored under the "id" key of the returned entry.

    Returns:
        dict: Bakta-style protein_bind feature.

    Raises:
        ValueError: If the mandatory /bound_moiety qualifier is missing.
    """

    # Extract location
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop = int(feature.location.end)          # already inclusive after conversion

    qualifiers = feature.qualifiers

    # Mandatory qualifier
    bound_moiety = qualifiers.get("bound_moiety", [None])[0]
    if bound_moiety is None:
        raise ValueError(
            f"protein_bind feature {id} is missing mandatory /bound_moiety qualifier"
        )

    so_code = so.SO_PROTEINBIND.id

    # Work on a copy of the existing db_xref values so that adding the
    # protein_bind SO code does not modify the list stored on the input feature.
    db_xrefs = list(qualifiers.get("db_xref", []))
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)

    protein_bind_entry = {
        "type": "protein_bind",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "bound_moiety": bound_moiety,
        "db_xrefs": db_xrefs,
        "id": id,
    }

    # INSDC feature key protein_bind
    #   Mandatory qualifiers: /bound_moiety
    #   Optional qualifiers:  /allele /db_xref /experiment /function /gene
    #                         /gene_synonym /inference /locus_tag /map /note
    #                         /old_locus_tag /operon /standard_name
    # protein_bind has no flag-style (value-less) qualifiers, so none are set.
    multi_valued = {"experiment", "function", "gene_synonym", "inference", "note"}
    single_valued = {"allele", "gene", "locus_tag", "map", "operon", "old_locus_tag", "product", "standard_name"}

    # add_optional_qualifiers is defined elsewhere in this module; presumably it
    # copies the listed qualifiers onto the entry when present - TODO confirm.
    add_optional_qualifiers(protein_bind_entry, qualifiers, single_valued, multi_valued)

    return protein_bind_entry

`convert_regulatory_feature(feature, rec, id)`

Convert a GenBank regulatory feature to a Bakta-style feature.

Source code in src/baktfold/io/eukaryotic_to_json.py

def convert_regulatory_feature(feature, rec, id):
    """
    Convert a GenBank regulatory feature to a Bakta-style feature.

    INSDC defines regulatory as any region of sequence that functions in the
    regulation of transcription, translation, replication, recombination, or
    chromatin structure. On 15-DEC-2014 it replaced the keys enhancer,
    promoter, CAAT_signal, TATA_signal, -35_signal, -10_signal, RBS,
    GC_signal, polyA_signal, attenuator, terminator and misc_signal.

    Parameters:
        feature: Bio.SeqFeature
            The regulatory feature; its location and qualifiers are read.
            The feature's own qualifier lists are left unmodified.
        rec: Bio.SeqRecord
            The record the feature belongs to; only rec.id is used.
        id:
            Identifier stored under the "id" key of the returned entry.

    Returns:
        dict: Bakta-style regulatory feature.

    Raises:
        ValueError: If the mandatory /regulatory_class qualifier is missing.
    """

    # Extract location
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop = int(feature.location.end)          # already inclusive after conversion

    qualifiers = feature.qualifiers

    # Mandatory qualifier
    regulatory_class = qualifiers.get("regulatory_class", [None])[0]
    if regulatory_class is None:
        raise ValueError(
            f"regulatory feature {id} is missing mandatory /regulatory_class qualifier"
        )

    so_code = so.SO_REGULATORY_REGION.id

    # Work on a copy of the existing db_xref values so that adding the
    # regulatory-region SO code does not modify the list on the input feature.
    db_xrefs = list(qualifiers.get("db_xref", []))
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)

    regulatory_entry = {
        "type": "regulatory",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "regulatory_class": regulatory_class,
        "db_xrefs": db_xrefs,
        "id": id,
    }

    # INSDC feature key regulatory
    #   Mandatory qualifiers: /regulatory_class
    #   Optional qualifiers:  /allele /bound_moiety /db_xref /experiment
    #                         /function /gene /gene_synonym /inference
    #                         /locus_tag /map /note /old_locus_tag /operon
    #                         /phenotype /pseudo /pseudogene /standard_name
    multi_valued = {"experiment", "function", "gene_synonym", "inference", "note"}
    single_valued = {"allele", "bound_moiety", "gene", "locus_tag", "map", "operon", "old_locus_tag", "phenotype", "product", "pseudogene", "standard_name"}

    # add_optional_qualifiers is defined elsewhere in this module; presumably it
    # copies the listed qualifiers onto the entry when present - TODO confirm.
    add_optional_qualifiers(regulatory_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers): presence of the key is what matters
    for flag in ["pseudo"]:
        if flag in qualifiers:
            regulatory_entry[flag] = True

    # Example input:
    #  regulatory      195030925..195032349
    #                  /regulatory_class="enhancer"
    #                  /experiment="EXISTENCE:reporter gene assay evidence
    #                  [ECO:0000049][PMID:32912294]"
    #                  /note="C2 STARR-seq-only enhancer starr_03508"
    #                  /db_xref="GeneID:131296982"

    return regulatory_entry

`convert_repeat_region_feature(feature, rec, id)`

Convert a Prokka GenBank repeat_region (CRISPR) feature to a simplified Bakta-style feature.

Parameters:

Name	Type	Description	Default
`feature`		Bio.SeqFeature The repeat_region feature (crispr) from the Prokka GBK.	required
`rec`		Bio.SeqRecord The full GenBank record containing the sequence.	required

Returns:

Name	Type	Description
`dict`		Simplified Bakta-style CRISPR feature.

Source code in src/baktfold/io/eukaryotic_to_json.py

def convert_repeat_region_feature(feature, rec, id):
    """
    Convert a GenBank repeat_region feature (e.g. a Prokka CRISPR) to a simplified Bakta-style feature.

    INSDC defines repeat_region as a region of genome containing repeating units.

    Parameters:
        feature: Bio.SeqFeature
            The repeat_region feature from the GBK. The feature's own
            qualifier lists are left unmodified.
        rec: Bio.SeqRecord
            The full GenBank record containing the sequence.
        id:
            Identifier stored under the "id" key of the returned entry.

    Returns:
        dict: Simplified Bakta-style repeat_region feature.
    """

    # Coordinates (Bakta uses 1-based); repeat regions are reported unstranded
    strand = "."
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop = int(feature.location.end)          # already inclusive after conversion

    qualifiers = feature.qualifiers
    note = qualifiers.get("note", [None])[0]
    rpt_family = qualifiers.get("rpt_family", [None])[0]
    rpt_type = qualifiers.get("rpt_type", [None])[0]
    rpt_unit_seq = qualifiers.get("rpt_unit_seq", [None])[0]

    # Always take the positive strand to get the NT seq. Slice before
    # converting to str so the whole record sequence is not stringified for
    # every feature.
    nt_seq = str(rec.seq[start - 1:stop])

    # INSDC feature key repeat_region
    #   Optional qualifiers: /allele /db_xref /experiment /function /gene
    #                        /gene_synonym /inference /locus_tag /map /note
    #                        /old_locus_tag /rpt_family /rpt_type
    #                        /rpt_unit_range /rpt_unit_seq /satellite
    #                        /standard_name

    so_code = so.SO_REPEAT.id

    # Work on a copy of the existing db_xref values so that adding the repeat
    # SO code does not modify the list stored on the input feature.
    db_xrefs = list(qualifiers.get("db_xref", []))
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)

    # Minimal Bakta-like CRISPR structure
    repeat_region_entry = {
        "type": "repeat_region",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,  # matches Bakta and is required
        "family": rpt_family,  # e.g., "LINE1"
        "rpt_type": rpt_type,
        "repeat_unit": rpt_unit_seq,  # the actual consensus repeat if crispr
        "product": note,  # won't be the same as Bakta as different lookup method used - but needed for the gff writing
        "nt": nt_seq,  # needed for bakta .ffn writeout
        "id": id,  # bakta_id needed
        # no "locus" key: there is no locus tag like Bakta
        "db_xrefs": db_xrefs,
    }

    multi_valued = {"experiment", "function", "gene_synonym", "inference", "note"}
    single_valued = {"satellite", "gene", "locus_tag", "map", "old_locus_tag", "operon", "phenotype", "product", "standard_name"}

    # add_optional_qualifiers is defined elsewhere in this module; presumably it
    # copies the listed qualifiers onto the entry when present - TODO confirm.
    add_optional_qualifiers(repeat_region_entry, qualifiers, single_valued, multi_valued)

    return repeat_region_entry

`convert_rrna_feature(feature, rec, id)`

Convert a GenBank rRNA feature to a Bakta-style feature.

Source code in src/baktfold/io/eukaryotic_to_json.py

def convert_rrna_feature(feature, rec, id):
    """
    Convert a GenBank rRNA feature to a Bakta-style feature.

    INSDC defines rRNA as mature ribosomal RNA; rRNA sizes should be
    annotated with the /product qualifier.

    Parameters:
        feature: Bio.SeqFeature
            The rRNA feature; its location and qualifiers are read. The
            feature's own qualifier lists are left unmodified.
        rec: Bio.SeqRecord
            The record the feature belongs to; only rec.id is used.
        id:
            Identifier stored under the "id" key of the returned entry.

    Returns:
        dict: Bakta-style rRNA feature.
    """

    # Extract location
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop = int(feature.location.end)          # already inclusive after conversion

    qualifiers = feature.qualifiers

    so_code = so.SO_RRNA.id

    # Work on a copy of the existing db_xref values so that adding the rRNA
    # SO code does not modify the list stored on the input feature.
    db_xrefs = list(qualifiers.get("db_xref", []))
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)

    # INSDC feature key rRNA
    #   Optional qualifiers: /allele /db_xref /experiment /function /gene
    #                        /gene_synonym /inference /locus_tag /map /note
    #                        /old_locus_tag /operon /product /pseudo
    #                        /pseudogene /standard_name
    rrna_entry = {
        "type": "rRNA",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "db_xrefs": db_xrefs,
        "id": id,
    }

    multi_valued = {"experiment", "function", "gene_synonym", "inference", "note"}
    single_valued = {"allele", "gene", "locus_tag", "map", "operon", "old_locus_tag", "product", "pseudogene", "standard_name"}

    # add_optional_qualifiers is defined elsewhere in this module; presumably it
    # copies the listed qualifiers onto the entry when present - TODO confirm.
    add_optional_qualifiers(rrna_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers): presence of the key is what matters
    for flag in ["pseudo"]:
        if flag in qualifiers:
            rrna_entry[flag] = True

    # Example input:
    #  rRNA            46413357..46413475
    #                  /gene="n-R5s211"
    #                  /product="5S ribosomal RNA"
    #                  /inference="COORDINATES: profile:INFERNAL:1.1.1"
    #                  /db_xref="GeneID:115487577"
    #                  /db_xref="RFAM:RF00001"

    return rrna_entry

`convert_sig_peptide_feature(feature, rec, id)`

Convert a sig_peptide feature to a Bakta-style feature.

mus musculus chrom 1 NC_000067

Parameters:

Name	Type	Description	Default
`feature`		Bio.SeqFeature The sig_peptide feature from the GBK.	required
`rec`		Bio.SeqRecord The record containing the sequence.	required

Returns:

Name	Type	Description
`dict`		Bakta-style sig_peptide feature.

Source code in src/baktfold/io/eukaryotic_to_json.py

def convert_sig_peptide_feature(feature, rec, id):
    """
    Convert a sig_peptide feature to a Bakta-style feature.

    INSDC defines sig_peptide as the signal peptide coding sequence: the
    coding sequence for an N-terminal domain of a secreted protein.
    Example data: mus musculus chrom 1 NC_000067.

    Parameters:
        feature: Bio.SeqFeature
            The sig_peptide feature from the GBK. The feature's own
            qualifier lists are left unmodified.
        rec: Bio.SeqRecord
            The record containing the sequence; only rec.id is used.
        id:
            Identifier stored under the "id" key of the returned entry.

    Returns:
        dict: Bakta-style sig_peptide feature. "starts"/"stops" hold the
        per-part coordinates for joined locations and are None otherwise.
    """

    # Extract location
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop = int(feature.location.end)          # already inclusive after conversion

    # Handle CompoundLocation (join): record every part's 1-based coordinates
    starts = None
    stops = None
    if feature.location.__class__.__name__ == "CompoundLocation":
        starts = [int(part.start) + 1 for part in feature.location.parts]
        stops = [int(part.end) for part in feature.location.parts]

    qualifiers = feature.qualifiers

    so_code = so.SO_SIGNAL_PEPTIDE.id

    # Work on a copy of the existing db_xref values so that adding the signal
    # peptide SO code does not modify the list stored on the input feature.
    db_xrefs = list(qualifiers.get("db_xref", []))
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)

    sig_peptide_entry = {
        "type": "sig_peptide",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "id": id,

        # Join support
        "starts": starts,
        "stops": stops,

        # Multi-valued. Use the list built above so the SO code is present
        # even when the feature carries no /db_xref qualifier.
        "db_xrefs": db_xrefs,
    }

    # INSDC feature key sig_peptide
    #   Optional qualifiers: /allele /db_xref /experiment /function /gene
    #                        /gene_synonym /inference /locus_tag /map /note
    #                        /old_locus_tag /product /pseudo /pseudogene
    #                        /standard_name
    multi_valued = {"experiment", "function", "gene_synonym", "inference", "note"}
    single_valued = {"allele", "gene", "locus_tag", "map", "operon", "old_locus_tag", "phenotype", "product", "pseudogene", "standard_name"}

    # add_optional_qualifiers is defined elsewhere in this module; presumably it
    # copies the listed qualifiers onto the entry when present - TODO confirm.
    add_optional_qualifiers(sig_peptide_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers): presence of the key is what matters
    for flag in ["pseudo"]:
        if flag in qualifiers:
            sig_peptide_entry[flag] = True

    # Example input:
    #  sig_peptide     complement(join(194768584..194768588,
    #                  194774407..194774533))
    #                  /gene="Cd46"
    #                  /gene_synonym="Mcp"
    #                  /inference="COORDINATES: ab initio prediction:SignalP:6.0"

    return sig_peptide_entry

`convert_transit_peptide_feature(feature, rec, id)`

Convert a transit_peptide feature to a Bakta-style feature.

mus musculus chrom 1 NC_000067

Parameters:

Name	Type	Description	Default
`feature`		Bio.SeqFeature The transit_peptide feature from the GBK.	required
`rec`		Bio.SeqRecord The record containing the sequence.	required

Returns:

Name	Type	Description
`dict`		Bakta-style transit_peptide feature.

Source code in src/baktfold/io/eukaryotic_to_json.py

def convert_transit_peptide_feature(feature, rec, id):
    """
    Convert a transit_peptide feature to a Bakta-style feature.

    INSDC defines transit_peptide as the transit peptide coding sequence: the
    coding sequence for an N-terminal domain of a nuclear-encoded organellar
    protein, involved in post-translational import into the organelle.
    Example data: mus musculus chrom 1 NC_000067.

    Parameters:
        feature: Bio.SeqFeature
            The transit_peptide feature from the GBK. The feature's own
            qualifier lists are left unmodified.
        rec: Bio.SeqRecord
            The record containing the sequence; only rec.id is used.
        id:
            Identifier stored under the "id" key of the returned entry.

    Returns:
        dict: Bakta-style transit_peptide feature. "starts"/"stops" hold the
        per-part coordinates for joined locations and are None otherwise.
    """

    # Extract location
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop = int(feature.location.end)          # already inclusive after conversion

    # Handle CompoundLocation (join): record every part's 1-based coordinates
    starts = None
    stops = None
    if feature.location.__class__.__name__ == "CompoundLocation":
        starts = [int(part.start) + 1 for part in feature.location.parts]
        stops = [int(part.end) for part in feature.location.parts]

    qualifiers = feature.qualifiers

    so_code = so.SO_TRANSIT_PEPTIDE.id

    # Work on a copy of the existing db_xref values so that adding the transit
    # peptide SO code does not modify the list stored on the input feature.
    db_xrefs = list(qualifiers.get("db_xref", []))
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)

    transit_peptide_entry = {
        "type": "transit_peptide",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "id": id,

        # Join support
        "starts": starts,
        "stops": stops,

        # Multi-valued. Use the list built above so the SO code is present
        # even when the feature carries no /db_xref qualifier.
        "db_xrefs": db_xrefs,
    }

    # INSDC feature key transit_peptide
    #   Optional qualifiers: /allele /db_xref /experiment /function /gene
    #                        /gene_synonym /inference /locus_tag /map /note
    #                        /old_locus_tag /product /pseudo /pseudogene
    #                        /standard_name
    multi_valued = {"experiment", "function", "gene_synonym", "inference", "note"}
    single_valued = {"allele", "gene", "locus_tag", "map", "operon", "old_locus_tag", "phenotype", "product", "pseudogene", "standard_name"}

    # add_optional_qualifiers is defined elsewhere in this module; presumably it
    # copies the listed qualifiers onto the entry when present - TODO confirm.
    add_optional_qualifiers(transit_peptide_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers): presence of the key is what matters
    for flag in ["pseudo"]:
        if flag in qualifiers:
            transit_peptide_entry[flag] = True

    # Example input:
    #  transit_peptide complement(join(180006550..180006849,
    #                  180009627..180009803))
    #                  /gene="Coq8a"
    #                  /gene_synonym="4632432J16Rik; Adck3; Cabc1; mKIAA0451"
    #                  /note="Mitochondrion.
    #                  /evidence=ECO:0000250|UniProtKB:Q8NI60; propagated from
    #                  UniProtKB/Swiss-Prot (Q60936.2)"

    return transit_peptide_entry

`convert_trna_feature(feature, seq_record, id)`

Convert a funannotate tRNA SeqFeature to a Bakta tRNA JSON entry.

Source code in src/baktfold/io/eukaryotic_to_json.py

def convert_trna_feature(feature, seq_record, id):
    """
    Convert a funannotate tRNA SeqFeature to a Bakta tRNA JSON entry.

    The amino acid is read from a /product of the form "tRNA-Xxx" and may be
    overridden by a /note of the form "tRNA-Xxx(anticodon)", which also
    supplies the anticodon. When no amino acid can be determined the entry is
    still produced, with "amino_acid" set to None and gene "trn?".

    Parameters:
        feature: Bio.SeqFeature
            The tRNA feature. The feature's own qualifier lists are left
            unmodified.
        seq_record: Bio.SeqRecord
            The record containing the sequence the feature is extracted from.
        id:
            Identifier stored under the "id" key of the returned entry.

    Returns:
        dict: Bakta-style tRNA feature.
    """

    qualifiers = feature.qualifiers

    # ------------ Location ------------
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop = int(feature.location.end)          # already inclusive after conversion

    # ------------ Extract nt sequence ------------
    nt = str(feature.extract(seq_record.seq))

    # ------------ Basic qualifiers ------------
    product = qualifiers.get("product", [None])[0]

    # fall back to start_stop_strand if there is no locus tag
    if 'locus_tag' in qualifiers and qualifiers['locus_tag']:
        locus_tag = qualifiers['locus_tag'][0]
    else:
        logger.warning(f"No locus_tag found for feature {id}")
        locus_tag = f"{GENOME_RANDOM_BACKUP_LOCUSTAG_STR}_{start}_{stop}"
        logger.warning(f"Generating a locus_tag: {locus_tag}")

    # ------------ amino acid ------------
    # Product examples: "tRNA-Trp", "tRNA-Leu"
    amino_acid = None
    if product and product.startswith("tRNA-"):
        amino_acid = product.split("-")[1]

    # ------------ anticodon ------------
    # Anticodons are in notes; expect a note like: "tRNA-Ser(gga)"
    anti_codon = None
    for note in qualifiers.get("note", []):
        # Remove spaces for safety
        n = note.replace(" ", "")

        # Extract part inside parentheses (anticodon)
        if "(" in n and ")" in n:
            anti_codon = n.split("(")[1].split(")")[0].lower()

        # tRNA-Ser(gga) -> "Ser(gga)" -> "Ser"
        if "tRNA-" in n:
            amino_acid = n.split("tRNA-")[1].split("(")[0]

    # ------------ db_xrefs ------------
    # Work on a copy so the SO term is not appended to the input feature's list.
    db_xrefs = list(qualifiers.get("db_xref", []))

    # Look up the SO term only when an amino acid was found; calling .lower()
    # on None would otherwise raise AttributeError.
    so_term = None
    if amino_acid:
        so_term = AMINO_ACID_DICT.get(amino_acid.lower(), ('', None))[1]
    if so_term and so_term.id not in db_xrefs:
        db_xrefs.append(so_term.id)

    # ------------ final Bakta-form dict ------------
    bakta_trna_entry = {
        "type": "tRNA",
        "sequence": seq_record.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "gene": "trn" + (amino_acid[0].lower() if amino_acid else "?"),
        "product": product,
        "amino_acid": amino_acid,
        "anti_codon": anti_codon,
        "score": None,  # no score available from the source annotation
        "nt": nt,
        "db_xrefs": db_xrefs,
        # "anti_codon_pos" is deliberately not included, not in output
        "locus": locus_tag,
        "id": id,
    }

    # INSDC feature key tRNA
    #   Optional qualifiers: /allele /anticodon /circular_RNA /db_xref
    #                        /experiment /function /gene /gene_synonym
    #                        /inference /locus_tag /map /note /old_locus_tag
    #                        /operon /product /pseudo /pseudogene
    #                        /standard_name /trans_splicing
    multi_valued = {"experiment", "function", "gene_synonym", "inference", "note"}
    single_valued = {"allele", "map", "old_locus_tag", "standard_name"}

    # add_optional_qualifiers is defined elsewhere in this module; presumably it
    # copies the listed qualifiers onto the entry when present - TODO confirm.
    add_optional_qualifiers(bakta_trna_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers): presence of the key is what matters
    for flag in ["circular_RNA", "pseudo", "trans_splicing"]:
        if flag in qualifiers:
            bakta_trna_entry[flag] = True

    return bakta_trna_entry

`convert_utr_region_feature(feature, rec, id, three)`

Convert a UTR GenBank feature to a simplified Bakta-style feature.

Parameters:

Name	Type	Description	Default
`feature`		Bio.SeqFeature The UTR feature from the GBK.	required
`rec`		Bio.SeqRecord The full GenBank record containing the sequence.	required

Returns:

Name	Type	Description
`dict`		Simplified Bakta-style feature.

Source code in src/baktfold/io/eukaryotic_to_json.py

def convert_utr_region_feature(feature, rec, id, three):
    """
    Convert a UTR GenBank feature to a simplified Bakta-style feature.

    INSDC defines 3'UTR / 5'UTR as the region at the 3' / 5' end of a mature
    transcript (or RNA virus genome) that is not translated into a protein.

    Parameters:
        feature: Bio.SeqFeature
            The UTR feature from the GBK. The feature's own qualifier lists
            are left unmodified.
        rec: Bio.SeqRecord
            The full GenBank record containing the sequence.
        id:
            Identifier stored under the "id" key of the returned entry.
        three:
            Truthy for a 3'UTR feature, falsy for a 5'UTR feature. Selects
            both the "type" value and the SO code added to db_xrefs.

    Returns:
        dict: Simplified Bakta-style feature.
    """

    if three:
        feature_type = "3'UTR"
        so_code = so.SO_3UTR.id
    else:
        feature_type = "5'UTR"
        so_code = so.SO_5UTR.id

    # Extract location
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop = int(feature.location.end)          # already inclusive after conversion

    qualifiers = feature.qualifiers
    note = qualifiers.get("note", [None])[0]

    # fall back to start_stop_strand if there is no locus tag
    if 'locus_tag' in qualifiers and qualifiers['locus_tag']:
        locus_tag = qualifiers['locus_tag'][0]
    else:
        logger.warning(f"No locus_tag found for feature {id}")
        locus_tag = f"{GENOME_RANDOM_BACKUP_LOCUSTAG_STR}_{start}_{stop}"
        logger.warning(f"Generating a locus_tag: {locus_tag}")

    # Always take the positive strand to get the NT seq (UTR region). Slice
    # before converting to str so the whole record sequence is not stringified
    # for every feature.
    nt_seq = str(rec.seq[start - 1:stop])

    # INSDC feature keys 3'UTR and 5'UTR share the same optional qualifiers:
    #   /allele /db_xref /experiment /function /gene /gene_synonym /inference
    #   /locus_tag /map /note /old_locus_tag /standard_name /trans_splicing

    # Keep the UTR-specific SO code selected above (it must not be replaced by
    # the repeat-region code) and add it to a copy of the existing db_xrefs so
    # the input feature's list is not modified.
    db_xrefs = list(qualifiers.get("db_xref", []))
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)

    # Minimal Bakta-like structure
    utr_entry = {
        "type": feature_type,
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,  # matches Bakta and is required
        "product": note,
        "nt": nt_seq,  # needed for bakta .ffn writeout
        "id": id,  # bakta_id needed
        "db_xrefs": db_xrefs,
        "locus": locus_tag,
    }

    multi_valued = {"experiment", "function", "gene_synonym", "inference", "note"}
    single_valued = {"allele", "gene", "map", "old_locus_tag", "operon", "phenotype", "standard_name"}

    # add_optional_qualifiers is defined elsewhere in this module; presumably it
    # copies the listed qualifiers onto the entry when present - TODO confirm.
    add_optional_qualifiers(utr_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers): presence of the key is what matters
    for flag in ["pseudo", "trans_splicing"]:
        if flag in qualifiers:
            utr_entry[flag] = True

    return utr_entry

`get_bakta_style_id_from_locus_tag(records)`

Gets 10 char bakta-style ID tag based off the 8 char locus tag in first CDS on the first record + 2 random chars

Assumes all records will have the same locus tag prefix

Will always add 2 chars to make ID unique vs locus tag

Source code in src/baktfold/io/eukaryotic_to_json.py

def get_bakta_style_id_from_locus_tag(records):
    """
    Build a bakta-style ID tag from the locus tag of the first CDS feature.

    Records are scanned in order and the first CDS feature found decides the
    result. If that CDS has a locus tag longer than 7 characters, its last 7
    characters (e.g. ``_000001``) are dropped and 2 random letters are
    appended, so an 8 char locus tag prefix yields a 10 char ID that always
    differs from the locus tag prefix itself.

    In every other case (no CDS at all, CDS without a locus tag, or a locus
    tag of 7 characters or fewer) a fully random 10 letter ID is returned.

    Assumes all records will have the same locus tag prefix.

    Raises:
        ValueError: if ``records`` is empty.
    """
    if not records:
        raise ValueError("No GenBank records found.")

    # Only the very first CDS matters, whatever its qualifiers look like
    first_cds = next(
        (
            candidate
            for rec in records
            for candidate in rec.features
            if candidate.type == "CDS"
        ),
        None,
    )

    # No CDS feature found at all (shouldn't happen)
    if first_cds is None:
        return random_n_letter_id(10)

    tags = first_cds.qualifiers.get("locus_tag")  # None if the qualifier is absent

    # fallback if locus_tag missing or too short to carry a prefix
    if not tags or len(tags[0]) <= 7:
        return random_n_letter_id(10)

    prefix = tags[0][:-7]  # trims off _000001 from CDS
    suffix = random_n_letter_id(2)
    return f"{prefix}{suffix}"

`random_n_letter_id(n=4)`

generates a n letter id prefix

Use n=2 to append to the locus tag so that the Bakta ID differs from it. Use n=10 if the locus tag is somehow missing (should never happen).

Source code in src/baktfold/io/eukaryotic_to_json.py

def random_n_letter_id(n=4):
    """
    Return a random ID made of ``n`` uppercase ASCII letters.

    n=2 is used to extend the locus tag prefix so the bakta id differs from it
    n=10 is used when the locus tag is somehow missing (should never happen)
    """
    alphabet = string.ascii_uppercase
    letters = random.choices(alphabet, k=n)
    return ''.join(letters)

`write_bakta_outputs(data, features, features_by_sequence, output, prefix, custom_db, euk, has_duplicate_locus, fast, translation_table, prokka, other_genbank, cds_program, trna_program, rrna_program, tmrna_program, ncrna_program, bakta_version)`

Writes the bakta outputs to a given path.

Parameters:

Name	Type	Description	Default
`data`	`dict`	The dictionary containing the bakta outputs.	required
`features`	`Sequence[dict]`	The sequence of dictionaries containing the features.	required
`features_by_sequence`	`Sequence[dict]`	The sequence of dictionaries containing the features by sequence.	required
`output`	`Path`	The path to save the bakta outputs to.	required
`prefix`	`str`	The prefix to use for the bakta outputs.	required
`custom_db`	`bool`	A boolean indicating whether a custom database is used.	required
`euk`	`bool`	A boolean indicating whether the sequences are eukaryotic.	required
`has_duplicate_locus`	`bool`	A boolean indicating whether there are duplicate loci.	required
`fast`	`bool`	If True, skips AFDB step	required
`translation_table`	`int`	Translation table inferred from input JSON	required
`prokka`	`bool`	boolean indicating if prokka was used to do initial annotation	required
`other_genbank`	`bool`	boolean indicating if other genbank (prokaryotic, genbank_to) was used to do initial annotation	required
`bakta_version`	`dict`	Dictionary of Bakta (or whatever other program) was used for the initial annotation	required

Returns:

Type	Description
	None.

Examples:

>>> write_bakta_outputs(data, features, features_by_sequence, output, prefix, custom_db, euk, has_duplicate_locus, fast, translation_table, prokka, other_genbank, cds_program, trna_program, rrna_program, tmrna_program, ncrna_program, bakta_version)

Source code in src/baktfold/io/io.py

def write_bakta_outputs(data: dict, features: Sequence[dict], features_by_sequence: dict,
                        output: Path, prefix: str, custom_db: bool, euk: bool, has_duplicate_locus: bool,
                        fast: bool, translation_table: int, prokka: bool, other_genbank: bool,
                        cds_program: str, trna_program: str, rrna_program: str, tmrna_program: str, ncrna_program: str, bakta_version: dict):
    """
    Writes the bakta outputs to a given path.

    Files written into ``output`` (all named after ``prefix``): ``.tsv``, ``.gff3``,
    ``.gbff``, ``.embl``, ``.fna``, ``.ffn``, ``.faa``, ``.inference.tsv``, the summary
    file produced by ``write_summary_txt_file`` and ``.json``.

    Args:
      data (dict): The dictionary containing the bakta outputs (``data['sequences']`` is read here).
      features (Sequence[dict]): The sequence of dictionaries containing the features.
      features_by_sequence (dict): Mapping of sequence id to the list of feature dictionaries of that sequence.
      output (Path): The path to save the bakta outputs to.
      prefix (str): The prefix to use for the bakta outputs.
      custom_db (bool): A boolean indicating whether a custom database is used.
      euk (bool): A boolean indicating whether the sequences are eukaryotic.
      has_duplicate_locus (bool): A boolean indicating whether there are duplicate loci.
      fast (bool): If True, skips AFDB step
      translation_table (int): Translation table inferred from input JSON
      prokka (bool): boolean indicating if prokka was used to do initial annotation
      other_genbank (bool): boolean indicating if other genbank (prokaryotic, genbank_to) was used to do initial annotation
      cds_program (str): Program recorded for CDS features (forwarded to the GFF3/INSDC writers and the JSON provenance block).
      trna_program (str): Program recorded for tRNA features.
      rrna_program (str): Program recorded for rRNA features.
      tmrna_program (str): Program recorded for tmRNA features.
      ncrna_program (str): Program recorded for ncRNA features.
      bakta_version (dict): Dictionary of Bakta (or whatever other program) was used for the initial annotation

    Returns:
      None.

    Examples:
      >>> write_bakta_outputs(data, features, features_by_sequence, output, prefix, custom_db, euk, has_duplicate_locus, fast, translation_table, prokka, other_genbank, cds_program, trna_program, rrna_program, tmrna_program, ncrna_program, bakta_version)
    """

    #logger.info(f'selected features={len(features)}')

    logger.info('writing human readable TSV...')
    tsv_path: Path = Path(output) / f"{prefix}.tsv"
    tsv.write_features(data['sequences'], features_by_sequence, tsv_path)

    logger.info('writing GFF3...')
    gff3_path: Path = Path(output) / f"{prefix}.gff3"
    # fix later prokka
    gff.write_features(data, features_by_sequence, gff3_path, prokka, euk, other_genbank, cds_program, trna_program, tmrna_program, rrna_program, ncrna_program)

    logger.info('writing INSDC GenBank & EMBL...')
    genbank_path: Path = Path(output) / f"{prefix}.gbff"
    embl_path: Path = Path(output) / f"{prefix}.embl"
    # arg order must match write_features(...): translation_table BEFORE other_genbank.
    insdc.write_features(data, features, genbank_path, embl_path, prokka, euk, translation_table, other_genbank, cds_program, trna_program, tmrna_program, rrna_program, ncrna_program)

    logger.info('writing genome sequences...')
    fna_path: Path = Path(output) / f"{prefix}.fna"
    fasta.export_sequences(data['sequences'], fna_path, description=True, wrap=True)

    logger.info('writing feature nucleotide sequences...')
    ffn_path: Path = Path(output) / f"{prefix}.ffn"
    fasta.write_ffn(features, ffn_path)

    logger.info('writing translated CDS sequences...')
    faa_path: Path = Path(output) / f"{prefix}.faa"
    fasta.write_faa(features, faa_path)

    # inference here is the different databases?
    annotations_path: Path = Path(output) / f"{prefix}.inference.tsv"

    # The header must line up with the rows built by tsv.map_aa_columns, which
    # always emit the length and insert the ID right after the locus when there
    # are duplicate loci (previously 'Length' was dropped from the header in
    # that case, shifting every following column by one).
    header_columns = ['Locus']
    if has_duplicate_locus:
        header_columns.append('ID')
    # annotation confidence column goes right after Product
    header_columns += ['Length', 'Product', 'Annotation_Confidence', 'Swissprot']
    # AFDBClusters is not searched (and therefore not reported) in fast mode
    if not fast:
        header_columns.append('AFDBClusters')
    header_columns += ['PDB', 'CATH']
    if custom_db:
        header_columns.append('Custom_DB')

    # flatten all features across sequences
    all_features = [
        feat
        for seq_features in features_by_sequence.values()
        for feat in seq_features
    ]

    logger.info(f'Exporting annotations (TSV) to: {annotations_path}')

    # Only features flagged as hypothetical or touched by baktfold are reported.
    # NOTE: this must not rebind the ``features`` parameter, which is still
    # needed for the JSON export below.
    selected_features = [
        feat for feat in all_features
        if 'hypothetical' in feat or 'baktfold' in feat
    ]

    # structure input carries Foldseek TM-score/LDDT on its hits -> extra columns
    structures = any('tmscore' in feat for feat in selected_features)
    if structures:
        header_columns = header_columns + ['TMscore', 'LDDT']

    tsv.write_protein_features(selected_features, header_columns, annotations_path, custom_db, has_duplicate_locus, fast=fast, structures=structures)

    # write summary file

    write_summary_txt_file(output, prefix, all_features)

    logger.info('write machine readable JSON...')
    json_path: Path = Path(output) / f"{prefix}.json"
    # Provenance block: lets ``baktfold json`` reconstitute these outputs later
    # without re-supplying runtime flags. See io/json.py:write_json.
    baktfold_run = {
        'mode': 'genome',
        'euk': euk,
        'custom_db': custom_db,
        'fast': fast,
        'has_duplicate_locus': has_duplicate_locus,
        'translation_table': translation_table,
        'prokka': prokka,
        'other_genbank': other_genbank,
        'cds_program': cds_program,
        'trna_program': trna_program,
        'rrna_program': rrna_program,
        'tmrna_program': tmrna_program,
        'ncrna_program': ncrna_program,
    }
    json.write_json(data, features, json_path, bakta_version, baktfold_run)

`write_bakta_proteins_outputs(aas, output, prefix, custom_db, fast, bakta_version)`

Writes the bakta protein outputs to a given path.

Parameters:

Name	Type	Description	Default
`aas`	`Sequence[dict]`	The sequence of dictionaries containing the amino acids.	required
`output`	`Path`	The path to save the bakta protein outputs to.	required
`prefix`	`str`	The prefix to use for the bakta protein outputs.	required
`custom_db`	`bool`	A boolean indicating whether a custom database is used.	required
`fast`	`bool`	If True, skips AFDB step	required
`bakta_version`	`dict`	Original Bakta version	required

Returns:

Type	Description
	None.

Examples:

>>> write_bakta_proteins_outputs(aas, output, prefix, custom_db, fast, bakta_version)

Source code in src/baktfold/io/io.py

def write_bakta_proteins_outputs(aas: Sequence[dict], output: Path, prefix: str, custom_db: bool, fast: bool, bakta_version: dict):
    """
    Writes the bakta protein outputs to a given path.

    Writes ``<prefix>.tsv`` (annotation table), ``<prefix>.json`` (full
    annotations), ``<prefix>.faa`` (annotated sequences) and the summary file.
    The mocked positional fields are removed from every entry of ``aas`` in place.

    Args:
      aas (Sequence[dict]): The sequence of dictionaries containing the amino acids.
      output (Path): The path to save the bakta protein outputs to.
      prefix (str): The prefix to use for the bakta protein outputs.
      custom_db (bool): A boolean indicating whether a custom database is used.
      fast (bool): If True, skips AFDB step
      bakta_version (dict): Original Bakta version

    Returns:
      None.

    Examples:
      >>> write_bakta_proteins_outputs(aas, output, prefix, custom_db, fast, bakta_version)
    """

    # drop the fields that were only mocked to keep baktfold from crashing;
    # they are not part of the bakta protein JSON outputs
    for record in aas:
        for mocked_key in ('sequence', 'start', 'stop', 'strand', 'frame'):
            record.pop(mocked_key, None)

    out_dir = Path(output)
    annotations_path: Path = out_dir / f"{prefix}.tsv"

    # annotation confidence sits right after Product; AFDBClusters is left out
    # in fast mode and Custom_DB is only present with a custom database
    header_columns = ['ID', 'Length', 'Product', 'Annotation_Confidence', 'Swissprot']
    if not fast:
        header_columns.append('AFDBClusters')
    header_columns.extend(['PDB', 'CATH'])
    if custom_db:
        header_columns.append('Custom_DB')

    # structure input carries Foldseek TM-score/LDDT on its hits -> extra columns
    structures = any('tmscore' in record for record in aas)
    if structures:
        header_columns.extend(['TMscore', 'LDDT'])

    logger.info(f'Exporting annotations (TSV) to: {annotations_path}')
    tsv.write_protein_features(aas, header_columns, annotations_path, custom_db, has_duplicate_locus=False, fast=fast, structures=structures)

    full_annotations_path: Path = out_dir / f"{prefix}.json"
    logger.info(f'Full annotations (JSON): {full_annotations_path}')
    # Provenance block for ``baktfold json`` (proteins mode). See io/json.py.
    baktfold_run = dict(mode='proteins', custom_db=custom_db, fast=fast)
    json.write_json({'features': aas}, aas, full_annotations_path, bakta_version, baktfold_run)

    # hypotheticals are deliberately not exported as a separate TSV

    aa_output_path: Path = out_dir / f"{prefix}.faa"
    logger.info(f'Annotated sequences (Fasta): {aa_output_path}')
    fasta.write_faa(aas, aa_output_path)

    write_summary_txt_file(output, prefix, aas)

`write_foldseek_tophit(tophit_df, pdb_tophit_path)`

Writes the foldseek tophits to a given path.

Parameters:

Name	Type	Description	Default
`tophit_df`	`pl.DataFrame`	The dataframe containing the foldseek tophits.	required
`pdb_tophit_path`	`Path`	The path to save the foldseek tophits to.	required

Returns:

Type	Description
	None.

Examples:

>>> write_foldseek_tophit(tophit_df, pdb_tophit_path)

Source code in src/baktfold/io/io.py

def write_foldseek_tophit(tophit_df: pl.DataFrame, pdb_tophit_path: Path):
    """
    Save the foldseek tophits as a tab-separated file.

    Args:
      tophit_df (pl.DataFrame): The dataframe containing the foldseek tophits.
      pdb_tophit_path (Path): The path to save the foldseek tophits to.

    Returns:
      None.

    Examples:
      >>> write_foldseek_tophit(tophit_df, pdb_tophit_path)
    """
    destination = pdb_tophit_path
    message = f"Saving foldseek tophits to {destination}"
    logger.info(message)
    tophit_df.write_csv(destination, separator="\t")

`map_aa_columns(feat, custom_db, has_duplicate_locus, fast, structures=False)`

Maps amino acid columns.

Parameters:

Name	Type	Description	Default
`feat`	`dict`	The dictionary containing the features.	required
`custom_db`	`bool`	A boolean indicating whether a custom database is used.	required
`has_duplicate_locus`	`bool`	A boolean indicating whether there are duplicate loci.	required
`fast`	`bool`	A boolean indicating whether AFDBclusters Foldseek search should be skipped	required

Returns:

Type	Description
`Sequence[str]`	Sequence[str]: A sequence of strings containing the mapped amino acid columns.

Examples:

>>> map_aa_columns({'locus': 'ABC', 'length': 100, 'product': 'protein'}, False, False, False)
['ABC', '100', 'protein', '', '', '', '', '']

Source code in src/baktfold/io/tsv.py

def map_aa_columns(feat: dict, custom_db: bool, has_duplicate_locus: bool, fast: bool, structures: bool = False) -> Sequence[str]:
    """
    Build one row of the protein annotation TSV for a feature.

    Column order: locus, [id], length, product, annotation confidence,
    Swissprot, [AFDBClusters], PDB, CATH, [custom db], [TM-score, LDDT].
    If the feature has no ``length`` it is derived from ``feat['nt']`` and
    stored on the feature.

    Args:
      feat (dict): The dictionary containing the features.
      custom_db (bool): A boolean indicating whether a custom database is used (adds the custom db column).
      has_duplicate_locus (bool): A boolean indicating whether there are duplicate loci (adds the id column).
      fast (bool): A boolean indicating whether AFDBclusters Foldseek search should be skipped (drops that column).
      structures (bool): Whether to append the Foldseek TM-score and LDDT columns.

    Returns:
      Sequence[str]: A sequence of strings containing the mapped amino acid columns.

    Examples:
      >>> map_aa_columns({'locus': 'ABC', 'length': 100, 'product': 'protein'}, False, False, False)
      ['ABC', '100', 'protein', '', '', '', '', '']
    """
    # Derive the protein length from the nucleotide sequence when it is absent
    if 'length' not in feat:
        feat['length'] = int(len(feat['nt']) / 3)

    db_xrefs = feat.get('db_xrefs', [])

    def collect(marker: str, strip: str = None):
        """
        Join (comma separated) every cross-reference containing ``marker``,
        with ``strip`` (defaults to ``marker``) removed from each of them.
        """
        removed = marker if strip is None else strip
        matching = [xref.replace(removed, '') for xref in db_xrefs if marker in xref]
        return ','.join(matching)

    # All database groups are extracted up front, whether or not they are reported
    swissprot_hits = collect('swissprot', 'afdb_v6:')
    afdb_hits = collect('afdbclusters_', 'afdb_v6:')
    pdb_hits = collect('pdb:')
    cath_hits = collect('cath:')
    custom_hits = collect('custom:', 'custom:custom_')

    columns = [feat['locus']]

    # the id disambiguates multiple CDS per locus in a record (euks)
    if has_duplicate_locus:
        columns.append(feat['id'])

    columns += [
        str(feat['length']),
        feat['product'],
        feat.get('annotation_confidence', ''),
        swissprot_hits,
    ]

    # AFDBClusters is only reported outside fast mode
    if not fast:
        columns.append(afdb_hits)

    columns += [pdb_hits, cath_hits]

    if custom_db:
        columns.append(custom_hits)

    # structure input also reports Foldseek TM-score and LDDT
    if structures:
        for score_key in ('tmscore', 'lddt'):
            columns.append('' if feat.get(score_key) is None else str(feat[score_key]))

    return columns

`write_feature_inferences(sequences, features_by_sequence, tsv_path)`

Export feature inference statistics in TSV format.

Source code in src/baktfold/io/tsv.py

def write_feature_inferences(sequences: Sequence[dict], features_by_sequence: Dict[str, dict], tsv_path: Path):
    """
    Export feature inference statistics in TSV format.

    One row is written per CDS/sORF feature and per tRNA/rRNA/ncRNA/ncRNA-region
    feature, in the order of ``sequences``; other feature types are skipped.
    Values that are not available are written as ``-``.

    Args:
      sequences (Sequence[dict]): Sequences; each ``id`` is looked up in ``features_by_sequence``.
      features_by_sequence (Dict[str, dict]): Features grouped by sequence id.
      tsv_path (Path): Destination of the TSV file.
    """
    logger.info('write tsv: path=%s', tsv_path)

    def fmt_evalue(value):
        """Format an e-value; an exact zero is written as '0.0'."""
        return '0.0' if value == 0 else f"{value:1.1e}"

    def fmt_fraction(value):
        """Format a coverage/identity fraction; an exact one is written as '1.0'."""
        return '1.0' if value == 1 else f"{value:0.3f}"

    with tsv_path.open('wt') as fh:
        fh.writelines([
            '# Annotated with Baktfold\n',
            f'# Software: v{cfg.version}\n',
            f"# Database: v{cfg.version}\n",  # fix later
            f'# DOI: {bc.BAKTFOLD_DOI}\n',
            f'# URL: {bc.BAKTFOLD_URL}\n',
            '#Sequence Id\tType\tStart\tStop\tStrand\tLocus Tag\tScore\tEvalue\tQuery Cov\tSubject Cov\tId\tAccession\n',
        ])

        for seq in sequences:
            for feat in features_by_sequence[seq['id']]:
                if feat['type'] in (bc.FEATURE_CDS, bc.FEATURE_SORF):
                    score = evalue = query_cov = subject_cov = identity = None
                    accession = '-'
                    if 'ups' in feat or 'ips' in feat:
                        # exact sequence matches: perfect coverage and identity
                        query_cov = subject_cov = identity = 1
                        evalue = 0
                        if 'ips' in feat:
                            accession = f"{bc.DB_XREF_UNIREF}:{feat['ips'][DB_IPS_COL_UNIREF100]}"
                        else:
                            accession = f"{bc.DB_XREF_UNIPARC}:{feat['ups'][DB_UPS_COL_UNIPARC]}"
                    elif 'psc' in feat or 'pscc' in feat:
                        # 'psc' takes precedence over 'pscc' when both are present
                        has_psc = 'psc' in feat
                        hit = feat['psc'] if has_psc else feat['pscc']
                        query_cov = hit['query_cov']
                        subject_cov = hit.get('subject_cov', -1)
                        identity = hit['identity']
                        score = hit.get('score', -1)
                        evalue = hit.get('evalue', -1)
                        if has_psc:
                            accession = f"{bc.DB_XREF_UNIREF}:{hit[DB_PSC_COL_UNIREF90]}"
                        else:
                            accession = f"{bc.DB_XREF_UNIREF}:{hit[DB_PSCC_COL_UNIREF50]}"
                    row = [
                        feat['sequence'] if 'sequence' in feat else feat['contig'],  # <1.10.0 compatibility
                        feat['type'],
                        str(feat['start']),
                        str(feat['stop']),
                        feat['strand'],
                        feat['locus'],
                        '-' if score is None else f"{score:0.1f}",
                        '-' if evalue is None else fmt_evalue(evalue),
                        '-' if query_cov is None else fmt_fraction(query_cov),
                        '-' if subject_cov is None else fmt_fraction(subject_cov),
                        '-' if identity is None else fmt_fraction(identity),
                        accession,
                    ]
                    fh.write('\t'.join(row) + '\n')
                elif feat['type'] in (bc.FEATURE_T_RNA, bc.FEATURE_R_RNA, bc.FEATURE_NC_RNA, bc.FEATURE_NC_RNA_REGION):
                    if feat['type'] == bc.FEATURE_T_RNA:
                        accession = '-'
                    else:
                        # first cross-reference mentioning Rfam
                        accession = [xref for xref in feat['db_xrefs'] if bc.DB_XREF_RFAM in xref][0]
                    row = [
                        feat['sequence'] if 'sequence' in feat else feat['contig'],  # <1.10.0 compatibility
                        feat['type'],
                        str(feat['start']),
                        str(feat['stop']),
                        feat['strand'],
                        feat['locus'] if 'locus' in feat else '-',
                        f"{feat['score']:0.1f}",
                        fmt_evalue(feat['evalue']) if 'evalue' in feat else '-',
                        fmt_fraction(feat['query_cov']) if 'query_cov' in feat else '-',
                        fmt_fraction(feat['subject_cov']) if 'subject_cov' in feat else '-',
                        fmt_fraction(feat['identity']) if 'identity' in feat else '-',
                        accession,
                    ]
                    fh.write('\t'.join(row) + '\n')
    return

`write_features(sequences, features_by_sequence, tsv_path)`

Export features in TSV format.

Source code in src/baktfold/io/tsv.py

def write_features(sequences: Sequence[dict], features_by_sequence: Dict[str, dict], tsv_path: Path):
    """
    Export features in TSV format.

    Writes a commented header followed by one row per feature, in the order of
    ``sequences``. Gap features are reported with their INSDC type (assembly gap
    for lengths >= 100). Products of pseudogenes and truncated features get a
    descriptive prefix. For CRISPR features that carry both ``repeats`` and
    ``spacers``, the individual repeats and spacers are written as additional
    rows, alternating repeat/spacer and ending with the trailing repeat.

    Args:
      sequences (Sequence[dict]): Sequences; each ``id`` is looked up in ``features_by_sequence``.
      features_by_sequence (Dict[str, dict]): Features grouped by sequence id.
      tsv_path (Path): Destination of the TSV file.
    """
    logger.info(f'write feature tsv: path={tsv_path}')

    def s(x):
        """Stringify a value, mapping None to an empty string."""
        return '' if x is None else str(x)

    def crispr_row(seq_id, sub_type, sub_feat, product):
        """Build the TSV line of a CRISPR repeat or spacer."""
        return '\t'.join([seq_id, sub_type, str(sub_feat['start']), str(sub_feat['stop']), sub_feat['strand'], '', '', product, '']) + '\n'

    with tsv_path.open('wt') as fh:
        fh.write('# Annotated with Baktfold\n')
        fh.write(f'# Software: v{cfg.version}\n')
        fh.write(f"# Database: v{cfg.version}\n") # fix later
        #fh.write(f"# Database: v{cfg.db_info['major']}.{cfg.db_info['minor']}, {cfg.db_info['type']}\n")
        fh.write(f'# DOI: {bc.BAKTFOLD_DOI}\n')
        fh.write(f'# URL: {bc.BAKTFOLD_URL}\n')
        fh.write('#Sequence Id\tType\tStart\tStop\tStrand\tLocus Tag\tGene\tProduct\tDbXrefs\n')

        for seq in sequences:
            for feat in features_by_sequence[seq['id']]:
                seq_id = feat['sequence'] if 'sequence' in feat else feat['contig']  # <1.10.0 compatibility
                feat_type = feat['type']
                if feat_type == bc.FEATURE_GAP:
                    feat_type = bc.INSDC_FEATURE_ASSEMBLY_GAP if feat['length'] >= 100 else bc.INSDC_FEATURE_GAP

                gene = feat.get('gene') or ''
                product = feat.get('product', '')
                truncated = feat.get('truncated', '')
                if bc.PSEUDOGENE in feat:
                    product = f"(pseudo) {product}"
                elif truncated == bc.FEATURE_END_5_PRIME:
                    product = f"(5' truncated) {product}"
                elif truncated == bc.FEATURE_END_3_PRIME:
                    product = f"(3' truncated) {product}"
                elif truncated == bc.FEATURE_END_BOTH:
                    product = f"(partial) {product}"

                fh.write('\t'.join(
                    [
                        seq_id,
                        feat_type,
                        str(feat['start']),
                        str(feat['stop']),
                        str(feat['strand']),
                        s(feat.get('locus')),  # handles None -> ''
                        s(gene),
                        s(product),            # handles None -> ''
                        ', '.join(sorted(feat.get('db_xrefs', [])))
                    ])
                )
                fh.write('\n')

                if feat_type == bc.FEATURE_CRISPR:
                    # spacers and repeats won't exist if Prokka input -> nothing extra to write.
                    # The key is 'repeats' (plural); looking up 'repeat' always came back
                    # empty, so the repeat/spacer rows were never written.
                    spacers = feat.get('spacers', [])
                    repeats = feat.get('repeats', [])

                    if spacers and repeats:
                        for repeat, spacer in zip(repeats, spacers):
                            fh.write(crispr_row(seq_id, bc.FEATURE_CRISPR_REPEAT, repeat, "CRISPR repeat"))
                            fh.write(crispr_row(seq_id, bc.FEATURE_CRISPR_SPACER, spacer, f"CRISPR spacer, sequence {spacer['sequence']}"))
                        # an array normally ends with one more repeat than spacers
                        if len(repeats) == len(spacers) + 1:
                            fh.write(crispr_row(seq_id, bc.FEATURE_CRISPR_REPEAT, repeats[-1], "CRISPR repeat"))
    return

`write_hypotheticals(hypotheticals, tsv_path)`

Export hypothetical information in TSV format.

Source code in src/baktfold/io/tsv.py

def write_hypotheticals(hypotheticals: Sequence[dict], tsv_path: Path):
    """
    Export hypothetical information in TSV format.

    Writes two header lines followed by one row per hypothetical protein:
    position, locus tag, molecular weight in kDa, isoelectric point, Pfam hits
    and database cross-references. Missing (falsy) weight or isoelectric point
    values are written as ``NA``.

    Args:
      hypotheticals (Sequence[dict]): Hypothetical protein features; each must provide ``seq_stats``.
      tsv_path (Path): Destination of the TSV file.
    """
    logger.info('write hypothetical tsv: path=%s', tsv_path)

    with tsv_path.open('wt') as fh:
        fh.write(f'#Annotated with Baktfold v{cfg.version}, https://github.com/oschwengers/bakta\n')
        fh.write('#Sequence Id\tStart\tStop\tStrand\tLocus Tag\tMol Weight [kDa]\tIso El. Point\tPfam hits\tDbxrefs\n')
        for hypo in hypotheticals:
            pfam_labels = [f"{pfam['id']}|{pfam['name']}" for pfam in hypo.get('pfams', [])]
            stats = hypo['seq_stats']

            if stats['molecular_weight']:
                mol_weight = f"{(stats['molecular_weight']/1000):.1f}"
            else:
                mol_weight = 'NA'

            if stats['isoelectric_point']:
                iso_point = f"{stats['isoelectric_point']:.1f}"
            else:
                iso_point = 'NA'

            seq_id = hypo['sequence'] if 'sequence' in hypo else hypo['contig']  # <1.10.0 compatibility

            fields = [
                f"{seq_id}",
                f"{hypo['start']}",
                f"{hypo['stop']}",
                f"{hypo['strand']}",
                f"{hypo.get('locus', '')}",
                mol_weight,
                iso_point,
                ', '.join(sorted(pfam_labels)),
                ', '.join(sorted(hypo.get('db_xrefs', []))),
            ]
            fh.write('\t'.join(fields) + '\n')
    return

`write_protein_features(features, header_columns, tsv_path, custom_db, has_duplicate_locus, fast, structures=False)`

Export protein features in TSV format.

Source code in src/baktfold/io/tsv.py

def write_protein_features(features: Sequence[dict], header_columns: Sequence[str], tsv_path: Path, custom_db: bool, has_duplicate_locus: bool, fast: bool, structures: bool = False):
    """
    Export protein features in TSV format.

    Writes a comment line, the tab-separated header and then one row per
    feature as produced by ``map_aa_columns``.

    Args:
      features (Sequence[dict]): Protein features to export.
      header_columns (Sequence[str]): Column names of the header line.
      tsv_path (Path): Destination of the TSV file.
      custom_db (bool): Forwarded to ``map_aa_columns``.
      has_duplicate_locus (bool): Forwarded to ``map_aa_columns``.
      fast (bool): Forwarded to ``map_aa_columns``.
      structures (bool): Forwarded to ``map_aa_columns``.
    """
    logger.info(f'write protein feature tsv: path={tsv_path}')

    with tsv_path.open('wt') as fh:
        banner = f'#Annotated with Baktfold (v{cfg.version}): https://github.com/gbouras13/baktfold\n'
        fh.write(banner)
        fh.write('\t'.join(header_columns) + '\n')
        for feat in features:
            row = map_aa_columns(feat, custom_db, has_duplicate_locus, fast, structures)
            fh.write('\t'.join(row) + '\n')
    return

`parse_protein_input(input_path, faa_path)`

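Parses protein input given either as FASTA (plain or gzipped) or as Bakta JSON, writes the sequences to be annotated to `faa_path`, and returns the imported proteins together with the Bakta version information.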

Source code in src/baktfold/io/fasta_in.py

def parse_protein_input(input_path, faa_path):
    """
    Parse protein input given as FASTA (regular or gzipped) or as Bakta JSON.

    Every imported protein gets mocked feature attributes (type, locus,
    sequence, start, stop, strand, frame) so existing feature functions can be
    reused. The sequences to annotate are written to ``faa_path``: all proteins
    for FASTA input, only the hypothetical proteins for Bakta JSON input.

    Logs an error and exits with status 1 if the input path is invalid or the
    file cannot be parsed.

    Returns:
      tuple: ``(aas, bakta_version)``; ``bakta_version`` stays an empty dict for FASTA input.
    """

    # detect the input format
    fasta_flag = False
    try:
        if input_path == '':
            raise ValueError('File path argument must be non-empty')
        input_path = Path(input_path).resolve()
        fasta_flag = is_fasta(input_path)
        detected = (
            'FASTA input format detected.'
            if fasta_flag
            else 'Bakta JSON input format detected. Only hypothetical proteins from the Bakta JSON input file will be annotated.'
        )
        logger.info(detected)
    except Exception as e:
        logger.error(f'ERROR: annotation file {input_path} not valid! {e}')
        sys.exit(1)

    # import the sequences
    aas, hypotheticals, bakta_version = [], [], {}
    try:
        if not fasta_flag:
            logger.info('Attempting to parse input protein sequences as Bakta JSON format ...')
            aas, hypotheticals, bakta_version = parse_json_input(input_path, False, False, protein_json_flag=True)
        else:
            logger.info('Attempting to parse input protein sequences as .faa format ...')
            aas = fasta.import_sequences(input_path, False, False)
        logger.info(f'Imported sequences={len(aas)}')
    except Exception as e:
        logger.error(f'ERROR: wrong file format or unallowed characters in amino acid sequences! {e}')
        sys.exit(1)

    # rename and mock feature attributes to reuse existing functions;
    # mocked start positions are spaced 100 apart, beginning at 1
    for position, protein in enumerate(aas):
        mocked_start = 1 + 100 * position
        protein['type'] = bc.FEATURE_CDS
        protein['locus'] = protein['id']
        protein['sequence'] = '-'
        protein['start'] = mocked_start
        protein['stop'] = mocked_start + protein['length'] - 1
        protein['strand'] = bc.STRAND_UNKNOWN
        protein['frame'] = 1

    # FASTA input: write everything; JSON input: write the hypothetical proteins only
    to_write = aas if fasta_flag else hypotheticals
    with faa_path.open('wt') as fh:
        for protein in to_write:
            fh.write(f">{protein['locus']}\n{protein['aa']}\n")

    logger.info('Parsing complete')

    return aas, bakta_version

`encode_annotations(annotations)`

Encodes annotations into a string.

Parameters:

Name	Type	Description	Default
`annotations`	`dict`	A dictionary containing the annotations.	required

Returns:

Name	Type	Description
`str`	`str`	The encoded annotations.

Examples:

>>> encode_annotations({
    'ID': 'EHICP_3230_sigpep',
    'Name': 'signal peptide',
    'product': 'signal peptide',
    'score': 0.5,
    'Parent': 'EHICP_3230'
})
'ID=EHICP_3230_sigpep;Name=signal peptide;product=signal peptide;score=0.5;Parent=EHICP_3230'

Source code in src/baktfold/io/gff.py

def encode_annotations(annotations: Dict[str, Union[str, Sequence[str]]]) -> str:
    """
    Serialize an annotation mapping into a GFF3 column-9 attribute string.

    Each entry is rendered as ``key=value`` and the pairs are joined with
    ``;`` in the mapping's iteration order. Values are passed through
    ``encode_attribute``; keys are written unchanged.

    Args:
      annotations (dict): Attribute names mapped to either a single value or
        a list of values. A list is written as one comma-separated value;
        an empty list drops the key from the output entirely.

    Returns:
      str: The encoded annotations.

    Examples:
      >>> encode_annotations({
          'ID': 'EHICP_3230_sigpep',
          'Name': 'signal peptide',
          'product': 'signal peptide',
          'score': 0.5,
          'Parent': 'EHICP_3230'
      })
      'ID=EHICP_3230_sigpep;Name=signal peptide;product=signal peptide;score=0.5;Parent=EHICP_3230'
    """
    encoded_pairs = []
    for name, value in annotations.items():
        # Only plain lists are treated as multi-valued; anything else
        # (including tuples) is encoded as a single value.
        if type(value) is not list:
            encoded_pairs.append(f'{name}={encode_attribute(value)}')
            continue
        if not value:
            # Empty list: the attribute is omitted.
            continue
        joined = ','.join([encode_attribute(item) for item in value])
        encoded_pairs.append(f"{name}={joined}")
    return ';'.join(encoded_pairs)

`encode_attribute(product)`

Replace special characters forbidden in column 9 of the GFF3 format: https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md

Source code in src/baktfold/io/gff.py

# Escape table for GFF3 column-9 values. Besides the reserved characters
# ';', '=', '&', ',' and the escape character '%' itself, the GFF3
# specification requires tab, newline, carriage return and the remaining
# control characters (0x00-0x1F, 0x7F) to be percent-encoded; a raw tab or
# newline would otherwise break the tab-delimited record.
_GFF3_ATTRIBUTE_ESCAPES = {
    **{code: f'%{code:02X}' for code in (*range(0x20), 0x7F)},
    ord('%'): '%25',
    ord(';'): '%3B',
    ord('='): '%3D',
    ord('&'): '%26',
    ord(','): '%2C',
}


def encode_attribute(product: str) -> str:
    """
    Percent-encode characters forbidden in column 9 of the GFF3 format.

    See https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md

    Args:
      product: The attribute value. Non-string values are converted with
        ``str()`` first.

    Returns:
      str: The value with '%', ';', '=', '&', ',' and control characters
      (including tab, newline and carriage return) replaced by their
      ``%XX`` escapes. All other characters are left unchanged.
    """
    # Single-pass translation: every character is mapped exactly once, so
    # the '%' introduced by an escape is never re-encoded.
    return str(product).translate(_GFF3_ATTRIBUTE_ESCAPES)

`write_euk_cds_feature(fh, seq_id, feat)`

Write a eukaryotic CDS feature to GFF3 with multiple CDS parts.

Parameters

- `fh` : file-handle
- `seq_id` : str
- `feat` : dict-like feature with keys "start", "stop", "strand", "locus", "starts", "stops"

Source code in src/baktfold/io/gff.py

def write_euk_cds_feature(fh, seq_id, feat):
    """
    Write a eukaryotic CDS feature to GFF3 with multiple CDS parts.

    One ``CDS`` line is written per coding segment. Segment ``i`` (1-based,
    in 5'->3' order) gets ``ID=<locus>-T1.cds-<i>`` and
    ``Parent=<locus>-T1``.

    Parameters
    ----------
    fh : file-handle
        Open text handle the GFF3 lines are written to.
    seq_id : str
        Sequence/contig ID written in column 1.
    feat : dict-like feature with keys:
            "start", "stop", "strand", "locus", "starts", "stops"
        "starts"/"stops" are used when both are non-empty lists of equal
        length; otherwise the single ("start", "stop") pair is written.
        "strand" defaults to "+" and "locus" to "unknown" when missing.

    Notes
    -----
    - Column 8 (phase) is the number of bases to skip from the 5' end of
      the segment to reach the first complete codon, i.e.
      ``(3 - preceding_cds_length % 3) % 3``.
    - For "-" strand features the segment list is reversed before writing;
      this yields 5'->3' order assuming "starts"/"stops" are given in
      ascending coordinate order (TODO confirm against the caller).
    """

    strand = feat.get("strand", "+")
    locus = feat.get("locus", "unknown")

    transcript_id = f"{locus}-T1"
    cds_id = f"{transcript_id}.cds"

    starts = feat.get("starts")
    stops = feat.get("stops")

    # -------------------------------
    # 1. Determine CDS sub-coordinates
    # -------------------------------
    if (
        isinstance(starts, list)
        and isinstance(stops, list)
        and len(starts) == len(stops)
        and len(starts) > 0
    ):
        cds_coords = list(zip(starts, stops))
    else:
        # No usable per-segment coordinates: fall back to one segment.
        cds_coords = [(feat["start"], feat["stop"])]

    # -------------------------------
    # 2. Reverse order for negative strand
    # -------------------------------
    if strand == "-":
        cds_coords.reverse()

    # -------------------------------
    # 3. Emit CDS lines with correct phase
    # -------------------------------
    consumed = 0  # coding bases contributed by the segments written so far

    for i, (cds_start, cds_stop) in enumerate(cds_coords, start=1):

        # Bases of this segment that still belong to the codon started in
        # the previous segment (GFF3 phase), not the raw remainder.
        phase = (3 - consumed % 3) % 3
        consumed += cds_stop - cds_start + 1

        attr = f"ID={cds_id}-{i};Parent={transcript_id}"

        fh.write(
            f"{seq_id}\tbaktfold\tCDS\t{cds_start}\t{cds_stop}"
            f"\t.\t{strand}\t{phase}\t{attr}\n"
        )

`write_euk_repeat_region_feature(fh, seq_id, feat)`

Writes a repeat region feature to a file.

Parameters:

Name	Type	Description	Default
`fh`	`file`	The file handle to write to.	required
`seq_id`	`str`	The sequence ID.	required
`feat`	`dict`	A dictionary containing the feature information.	required

Returns:

Type	Description
	None

Examples:

>>> write_euk_repeat_region_feature(fh, 'DS572673.1', {
    "type": "repeat_region",
    "sequence": "DS571531.1",
    "start": 1470,
    "stop": 1716,
    "strand": "?",
    "family": "LINE2",
    "rpt_type": null,
    "repeat_unit": null,
    "product": null,
    "nt": "AATAAAATCATATCAGAAATAAAAAGAATGAAAATAAACAAATTAAAGAAAATAATTATAAAATTAATAAACGATATTTAAATGAAAGAAAATAGAGAATATGTAATAAGTACAAATGGTTCATTCATTAATAAGAAATTAACAATAATAAAATAGAGAATATTGATTATAAAAAGAAATATATTTCTCAAAACAGTAGAGATACAAAAAGAATAGATATGAAATAAATATTAATTCTAAAATACTC",
    "id": "EHICP_3230",
    "db_xrefs": [
        "SO:0000657"
    ]
})

Source code in src/baktfold/io/gff.py

def write_euk_repeat_region_feature(fh, seq_id, feat):
    """
    Write one GFF3 ``repeat_region`` line for a repeat feature.

    Column 9 always carries ``ID=<feat['sequence']>:<start>..<stop>`` and
    ``gbkey=repeat_region``; ``rpt_family`` is appended when the feature
    has a non-None ``family``. Attribute values are written as-is (no
    percent-encoding is applied here).

    Args:
      fh (file): The file handle to write to.
      seq_id (str): The sequence ID written in column 1.
      feat (dict): The feature; reads 'start', 'stop', 'strand',
        'sequence' and the optional 'family'.

    Returns:
      None

    Examples:
      >>> write_euk_repeat_region_feature(fh, 'DS572673.1', {
          "type": "repeat_region",
          "sequence": "DS571531.1",
          "start": 1470,
          "stop": 1716,
          "strand": "?",
          "family": "LINE2",
          "id": "EHICP_3230"
      })
    """
    begin = int(feat['start'])
    end = int(feat['stop'])
    orientation = feat['strand']
    source_seq = feat['sequence']

    fields = [
        f"ID={source_seq}:{begin}..{end}",
        "gbkey=repeat_region",
    ]

    family = feat.get('family')
    if family is not None:
        fields.append(f"rpt_family={family}")

    column9 = ";".join(fields)

    fh.write(
        f"{seq_id}\tbaktfold\trepeat_region\t{begin}\t{end}"
        f"\t.\t{orientation}\t.\t{column9}\n"
    )

`write_euk_trna_feature(fh, seq_id, feat)`

Write a tRNA feature to GFF3 with a top-level line and single exon.

Parameters

fh : file-like

Open file handle to write GFF lines.

seq_id : str

Sequence/contig ID.

feat : dict-like

tRNA feature; the code reads its 'start', 'stop', 'strand' and 'locus' keys and the optional 'product' key.

Notes

Generates one tRNA line and one exon line.
Includes optional 'product' qualifier.

Source code in src/baktfold/io/gff.py

def write_euk_trna_feature(fh, seq_id, feat):
    """
    Write a tRNA feature to GFF3 with a top-level line and single exon.

    Parameters
    ----------
    fh : file-like
        Open file handle to write GFF lines.
    seq_id : str
        Sequence/contig ID.
    feat : dict-like
        tRNA feature. Reads the keys 'start', 'stop', 'strand' and 'locus',
        plus the optional 'product' (a single value or a list of values).

    Notes
    -----
    - Generates one tRNA line and one exon line.
    - The tRNA line carries ``ID=<locus>-T1`` and ``Parent=<locus>``; the
      exon line references that ID as its Parent.
    - Includes optional 'product' qualifier; a list is written
      comma-separated.
    - Attribute values are written as-is (no percent-encoding is applied
      here).
    """
    start = int(feat['start'])
    stop = int(feat['stop'])

    strand = feat['strand']

    locus = feat['locus']

    trna_id = f"{locus}-T1"

    # Top-level tRNA attributes. The ID must be present because the exon
    # line below uses it as its Parent.
    attrs = {
        "ID": trna_id,
        "Parent": locus
    }

    product = feat.get("product", [])

    if product:
        if isinstance(product, list):
            attrs["product"] = ",".join(str(v) for v in product)
        else:
            attrs["product"] = str(product)

    attr_str = ";".join(f"{k}={v}" for k, v in attrs.items())

    # Write top-level tRNA line
    fh.write(f"{seq_id}\tbaktfold\ttRNA\t{start}\t{stop}\t.\t{strand}\t.\t{attr_str}\n")

    # Write exon line (tRNA single-exon)
    exon_id = f"{trna_id}.exon1"
    exon_attrs = f"ID={exon_id};Parent={trna_id}"
    fh.write(f"{seq_id}\tbaktfold\texon\t{start}\t{stop}\t.\t{strand}\t.\t{exon_attrs}\n")

`write_euk_utr_feature(fh, seq_id, feat, locus_counter, three=False)`

Write a 'utr' feature.

Source code in src/baktfold/io/gff.py

def write_euk_utr_feature(fh, seq_id, feat, locus_counter, three=False):
    """
    Write a UTR feature as one GFF3 line.

    Args:
      fh (file): The file handle to write to.
      seq_id (str): The sequence ID written in column 1.
      feat (dict): The feature; reads 'start', 'stop', 'strand', 'locus'
        and an optional note stored under 'note' (or 'Note').
      locus_counter (dict): Mutable mapping of locus -> number of UTR lines
        already written for it. Updated in place; used to keep IDs unique.
      three (bool): Write a ``three_prime_UTR`` (gbkey ``3'UTR``) when
        True, otherwise a ``five_prime_UTR`` (gbkey ``5'UTR``).

    Returns:
      None

    Notes:
      Attribute values are written as-is (no percent-encoding is applied
      here).
    """
    start = int(feat['start'])
    stop = int(feat['stop'])
    strand = feat['strand']

    locus = feat['locus']

    # Count occurrences for this locus
    count = locus_counter.get(locus, 0) + 1
    locus_counter[locus] = count

    # Construct ID with suffix -2, -3, etc.
    # For first entry we keep ID=locus (no -1)
    utr_id = locus if count == 1 else f"{locus}-{count}"

    attrs = {
        "ID": f"{utr_id}",
        "Parent": f"{locus}",
    }

    # Reference output this writer is modelled on:
    # CAMXCT020000566.1	EMBL	three_prime_UTR	84568	84617	.	-	.	ID=id-C1SCF055_LOCUS8420;Parent=gene-C1SCF055_LOCUS8420;Note=ID:SCF055_s1507_g28601.utr3p1%3B~source:feature;gbkey=3'UTR;locus_tag=C1SCF055_LOCUS8420
    # CAMXCT020000566.1	EMBL	five_prime_UTR	136251	136259	.	-	.	ID=id-C1SCF055_LOCUS8420-2;Parent=gene-C1SCF055_LOCUS8420;Note=ID:SCF055_s1507_g28601.utr5p1%3B~source:feature;gbkey=5'UTR;locus_tag=C1SCF055_LOCUS8420

    # Test and read the same key, accepting either spelling, so a note is
    # never dropped nor written as the literal "None".
    note = feat.get('note')
    if note is None:
        note = feat.get('Note')
    if note is not None:
        attrs["Note"] = note

    attrs["gbkey"] = "3'UTR" if three else "5'UTR"

    # The locus is mandatory (read above), so the locus_tag is always
    # written rather than depending on whether a note exists.
    attrs["locus_tag"] = locus

    attr_str = ";".join(f"{k}={v}" for k, v in attrs.items())

    gene_tag = 'three_prime_UTR' if three else 'five_prime_UTR'

    fh.write(f"{seq_id}\tbaktfold\t{gene_tag}\t{start}\t{stop}\t.\t{strand}\t.\t{attr_str}\n")

`write_features(data, features_by_sequence, gff3_path, prokka=False, euk=False, other_genbank=False, cds_tool='Prodigal:2.6', trna_program='tRNAscan-SE:2.0.12', tmrna_program='Aragorn', rrna_program='Infernal', ncrna_program='Infernal')`

Export features in GFF3 format.

Source code in src/baktfold/io/gff.py

def write_features(data: dict, features_by_sequence: Dict[str, dict], gff3_path: Path, prokka: bool = False, euk: bool = False, other_genbank: bool = False, cds_tool: str = "Prodigal:2.6", trna_program: str = "tRNAscan-SE:2.0.12", tmrna_program: str = "Aragorn", rrna_program: str = "Infernal", ncrna_program: str = "Infernal"):
    """Export features in GFF3 format."""
    logger.info(f'write features: path={gff3_path}')

    with gff3_path.open('wt') as fh:
        fh.write('##gff-version 3\n')  # GFF version
        fh.write('##feature-ontology https://github.com/The-Sequence-Ontology/SO-Ontologies/blob/v3.1/so.obo\n')  # SO feature version

        if(data['genome'].get('taxon', None)):  # write organism info
            fh.write(f"# organism {data['genome']['taxon']}\n")

        fh.write('# Annotated with Baktfold\n')
        fh.write(f'# Software: v{cfg.version}\n')
        fh.write(f"# Database: v{cfg.version}\n") # fix later
        #fh.write(f"# Database: v{cfg.db_info['major']}.{cfg.db_info['minor']}, {cfg.db_info['type']}\n")
        fh.write(f'# DOI: {bc.BAKTFOLD_DOI}\n')
        fh.write(f'# URL: {bc.BAKTFOLD_URL}\n')

        for seq in data['sequences']:  # write features
            if euk:
                locus_counter = {} # for UTRs

            fh.write(f"##sequence-region {seq['id']} 1 {seq['length']}\n")  # sequence region

            # write landmark region
            annotations = {
                'ID': seq['id'],
                'Name': seq['id']
            }
            if(seq['topology'] == bc.TOPOLOGY_CIRCULAR):
                annotations['Is_circular'] = 'true'
            annotations = encode_annotations(annotations)
            fh.write(f"{seq['id']}\tBaktfold\tregion\t1\t{str(seq['length'])}\t.\t+\t.\t{annotations}\n")

            for feat in features_by_sequence[seq['id']]:
                seq_id = feat['sequence'] if 'sequence' in feat else feat['contig']  # <1.10.0 compatibility
                start = feat['start']
                stop = feat['stop']
                if('edge' in feat):
                    stop += seq['length']

                # euks
                if euk:
                    if(feat['type'] == bc.FEATURE_REPEAT):
                        write_euk_repeat_region_feature(fh, seq_id, feat)

                    if(feat['type'] == bc.FEATURE_5UTR or feat['type'] == bc.FEATURE_3UTR):
                        if feat['type'] == bc.FEATURE_3UTR:
                            write_euk_utr_feature(fh, seq_id, feat, locus_counter, three=True)
                        elif feat['type'] == bc.FEATURE_5UTR:
                            write_euk_utr_feature(fh, seq_id, feat, locus_counter, three=False)

                if(feat['type'] == bc.FEATURE_T_RNA):

                    if euk:
                        write_euk_trna_feature(fh, seq_id, feat)
                    else:
                        trna_tool = "tRNAscan-SE"
                        if prokka:
                            trna_tool = "Aragorn"
                        if other_genbank:
                            trna_tool = trna_program

                        annotations = {
                            'ID': feat['locus'],
                            'Name': feat['product'],
                            'locus_tag': feat['locus'],
                            'product': feat['product'],
                            'Dbxref': feat.get('db_xrefs', [])
                        }
                        if(feat.get('gene', None)):  # add gene annotation if available
                            annotations['gene'] = feat['gene']
                        if(bc.PSEUDOGENE in feat):
                            annotations[bc.INSDC_FEATURE_PSEUDOGENE] = bc.INSDC_FEATURE_PSEUDOGENE_TYPE_UNKNOWN
                        elif('truncated' in feat):
                            annotations[bc.INSDC_FEATURE_PSEUDO] = True
                        if(feat.get('anti_codon', False)):
                            annotations['anti_codon'] = feat['anti_codon']
                        if(feat.get('amino_acid', False)):
                            annotations['amino_acid'] = feat['amino_acid']
                        if(cfg.compliant):
                            gene_id = f"{feat['locus']}_gene"
                            annotations['Parent'] = gene_id
                            annotations['inference'] = 'profile:tRNAscan:2.0'
                            annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs'])  # remove INSDC invalid DbXrefs
                            gene_annotations = {
                                'ID': gene_id,
                                'locus_tag': feat['locus']
                            }
                            if(feat.get('gene', None)):
                                gene_annotations['gene'] = feat['gene']
                            if(bc.PSEUDOGENE in feat):
                                gene_annotations[bc.INSDC_FEATURE_PSEUDOGENE] = bc.INSDC_FEATURE_PSEUDOGENE_TYPE_UNKNOWN
                            gene_annotations = encode_annotations(gene_annotations)
                            fh.write(f"{seq_id}\t{trna_tool}\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                        annotations = encode_annotations(annotations)
                        fh.write(f"{seq_id}\t{trna_tool}\t{so.SO_TRNA.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                elif(feat['type'] == bc.FEATURE_TM_RNA):
                    # both prokka and bakta use Aragorn

                    tmrna_tool = "Aragorn"
                    if other_genbank:
                        tmrna_tool = tmrna_program

                    annotations = {
                        'ID': feat['locus'],
                        'Name': feat['product'],
                        'locus_tag': feat['locus'],
                        'gene': feat.get('gene', []),
                        'product': feat['product'],
                        'Dbxref': feat.get('db_xrefs', [])
                    }
                    if('tag' in feat):
                        annotations['tag_peptide'] = feat['tag']['aa']
                    if('truncated' in feat):
                        annotations[bc.INSDC_FEATURE_PSEUDO] = True
                    if(cfg.compliant):
                        gene_id = f"{feat['locus']}_gene"
                        annotations['Parent'] = gene_id
                        annotations['inference'] = 'profile:aragorn:1.2'
                        annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs'])  # remove INSDC invalid DbXrefs
                        if('tag' in feat):
                            annotations['tag_peptide'] = f"{feat['tag']['start']}..{feat['tag']['stop']}" if feat['strand'] == bc.STRAND_FORWARD else f"complement({feat['tag']['start']}..{feat['tag']['stop']})"
                        gene_annotations = {
                            'ID': gene_id,
                            'locus_tag': feat['locus'],
                            'gene': feat['gene']
                        }
                        if('truncated' in feat):
                            gene_annotations[bc.INSDC_FEATURE_PSEUDO] = True
                        gene_annotations = encode_annotations(gene_annotations)
                        fh.write(f"{seq_id}\tAragorn\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                    annotations = encode_annotations(annotations)
                    fh.write(f"{seq_id}\tAragorn\t{so.SO_TMRNA.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                elif(feat['type'] == bc.FEATURE_R_RNA):
                    rrna_tool = "Infernal"
                    if prokka:
                        rrna_tool = "barrnap"
                    if other_genbank:
                        rrna_tool = rrna_program

                    annotations = {
                        'ID': feat['locus'],
                        'Name': feat['product'],
                        'locus_tag': feat['locus'],
                        'gene': feat.get('gene', []),
                        'product': feat['product'],
                        'Dbxref': feat.get('db_xrefs', [])
                    }
                    if('truncated' in feat):
                        annotations[bc.INSDC_FEATURE_PSEUDO] = True
                    if(cfg.compliant):
                        gene_id = f"{feat['locus']}_gene"
                        annotations['Parent'] = gene_id
                        annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs'])  # remove INSDC invalid DbXrefs
                        for rfam_id in [dbxref.split(':')[1] for dbxref in feat['db_xrefs'] if dbxref.split(':')[0] == bc.DB_XREF_RFAM]:
                            annotations['inference'] = f'profile:Rfam:{rfam_id}'
                        gene_annotations = {
                            'ID': gene_id,
                            'locus_tag': feat['locus'],
                            'gene': feat['gene']
                        }
                        if('truncated' in feat):
                            gene_annotations[bc.INSDC_FEATURE_PSEUDO] = True
                        gene_annotations = encode_annotations(gene_annotations)
                        fh.write(f"{seq_id}\t{rrna_tool}\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                    annotations = encode_annotations(annotations)
                    if other_genbank or prokka:
                        fh.write(f"{seq_id}\t{rrna_tool}\t{so.SO_RRNA.name}\t{start}\t{stop}\t{feat['strand']}\t.\t{annotations}\n")
                    else:
                        fh.write(f"{seq_id}\t{rrna_tool}\t{so.SO_RRNA.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n")
                elif(feat['type'] == bc.FEATURE_NC_RNA):
                    # both prokka and bakta use infernal for ncrna
                    ncrna_tool = "Infernal"
                    if other_genbank:
                        ncrna_tool = ncrna_program

                    annotations = {
                        'ID': feat['locus'],
                        'Name': feat['product'],
                        'locus_tag': feat['locus'],
                        'gene': feat.get('gene', []),
                        'product': feat['product'],
                        'Dbxref': feat.get('db_xrefs', [])
                    }
                    if('truncated' in feat):
                        annotations[bc.INSDC_FEATURE_PSEUDO] = True
                    if(cfg.compliant):
                        gene_id = f"{feat['locus']}_gene"
                        annotations['Parent'] = gene_id
                        annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs'])  # remove INSDC invalid DbXrefs
                        annotations[bc.INSDC_FEATURE_NC_RNA_CLASS] = insdc.select_ncrna_class(feat)
                        for rfam_id in [dbxref.split(':')[1] for dbxref in feat['db_xrefs'] if dbxref.split(':')[0] == bc.DB_XREF_RFAM]:
                            annotations['inference'] = f'profile:Rfam:{rfam_id}'
                        gene_annotations = {
                            'ID': gene_id,
                            'locus_tag': feat['locus'],
                            'gene': feat['gene']
                        }
                        if(ba.RE_GENE_SYMBOL.fullmatch(feat['gene'])):  # discard non-standard ncRNA gene symbols
                            gene_annotations['gene'] = feat['gene']
                        else:
                            annotations.pop('gene', None)
                        if('truncated' in feat):
                            gene_annotations[bc.INSDC_FEATURE_PSEUDO] = True
                        gene_annotations = encode_annotations(gene_annotations)
                        fh.write(f"{seq_id}\t{ncrna_tool}\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                    annotations = encode_annotations(annotations)
                    if other_genbank or prokka:
                        fh.write(f"{seq_id}\t{ncrna_tool}\t{so.SO_NCRNA_GENE.name}\t{start}\t{stop}\t{feat['strand']}\t.\t{annotations}\n")
                    else:
                        fh.write(f"{seq_id}\t{ncrna_tool}\t{so.SO_NCRNA_GENE.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n")
                elif(feat['type'] == bc.FEATURE_NC_RNA_REGION):
                    annotations = {
                        'ID': feat['id'],
                        'Name': feat['product'],
                        'product': feat['product'],
                        'Dbxref': feat.get('db_xrefs', [])
                    }
                    if('truncated' in feat):
                        annotations[bc.INSDC_FEATURE_PSEUDO] = True
                    if(cfg.compliant):
                        for rfam_id in [dbxref.split(':')[1] for dbxref in feat['db_xrefs'] if dbxref.split(':')[0] == bc.DB_XREF_RFAM]:
                            annotations['inference'] = f'profile:Rfam:{rfam_id}'
                        annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs'])  # remove INSDC invalid DbXrefs
                        annotations[bc.INSDC_FEATURE_REGULATORY_CLASS] = insdc.select_regulatory_class(feat)
                    annotations = encode_annotations(annotations)
                    fh.write(f"{seq_id}\tInfernal\t{so.SO_REGULATORY_REGION.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n")
                elif(feat['type'] == bc.FEATURE_CRISPR):
                    crispr_tool = "PILER-CR"
                    if prokka:
                        crispr_tool = "MinCED"
                    annotations = {
                        'ID': feat['id'],
                        'Name': feat['product'],
                        'product': feat['product']
                    }
                    feat_type = so.SO_CRISPR.name
                    if(cfg.compliant):
                        feat_type = bc.INSDC_FEATURE_REPEAT_REGION
                        annotations['inference'] = 'COORDINATES:alignment:pilercr:1.02'
                        annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs'])  # remove INSDC invalid DbXrefs
                        annotations[bc.INSDC_FEATURE_REPEAT_FAMILY] = 'CRISPR'
                        annotations[bc.INSDC_FEATURE_REPEAT_TYPE] = 'direct'
                        annotations[bc.INSDC_FEATURE_REPEAT_UNIT_SEQ] = feat['repeat_consensus']
                    annotations = encode_annotations(annotations)
                    fh.write(f"{seq_id}\t{crispr_tool}\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                    if(not cfg.compliant):
                        i = 0
                        # spacers and repeats wont exist if Prokka input
                        spacers = feat.get('spacers', [])
                        repeat = feat.get('repeat', [])
                        if len(spacers) > 0 and len(repeat) > 0: 
                            while i < len(feat['spacers']):
                                repeat = feat['repeats'][i]
                                annotations = {
                                    'ID': f"{feat['id']}_repeat_{i+1}",
                                    'Parent': feat['id']
                                }
                                annotations = encode_annotations(annotations)
                                # will always be PILER here as prokka won't have any
                                fh.write(f"{seq_id}\tPILER-CR\t{bc.FEATURE_CRISPR_REPEAT}\t{repeat['start']}\t{repeat['stop']}\t.\t{repeat['strand']}\t.\t{annotations}\n")
                                spacer = feat['spacers'][i]
                                annotations = {
                                    'ID': f"{feat['id']}_spacer_{i+1}",
                                    'Parent': feat['id'],
                                    'sequence': spacer['sequence']
                                }
                                annotations = encode_annotations(annotations)
                                fh.write(f"{seq_id}\tPILER-CR\t{bc.FEATURE_CRISPR_SPACER}\t{spacer['start']}\t{spacer['stop']}\t.\t{spacer['strand']}\t.\t{annotations}\n")
                                i += 1
                            if(len(feat['repeats']) - 1 == i):
                                repeat = feat['repeats'][i]
                                annotations = { 'ID': f"{feat['id']}_repeat_{i+1}" }
                                annotations = encode_annotations(annotations)
                                fh.write(f"{seq_id}\tPILER-CR\t{bc.FEATURE_CRISPR_REPEAT}\t{repeat['start']}\t{repeat['stop']}\t.\t{repeat['strand']}\t.\t{annotations}\n")
                elif feat['type'] == bc.FEATURE_CDS:
                    if euk:
                        write_euk_cds_feature(fh, seq_id, feat)
                    else:
                        annotations = {
                            'ID': feat.get('locus'),
                            'Name': feat.get('product'),
                            'locus_tag': feat.get('locus'),
                            'product': feat.get('product'),
                            'Dbxref': feat.get('db_xrefs', [])  # default to empty list if db_xrefs doesn't exist
                        }
                        if(bc.PSEUDOGENE in feat):
                            annotations[bc.INSDC_FEATURE_PSEUDOGENE] = bc.INSDC_FEATURE_PSEUDOGENE_TYPE_UNPROCESSED if feat[bc.PSEUDOGENE]['paralog'] else bc.INSDC_FEATURE_PSEUDOGENE_TYPE_UNITARY
                        elif('truncated' in feat):
                            annotations[bc.INSDC_FEATURE_PSEUDO] = True
                        if(feat.get('gene', None)):  # add gene annotation if available
                            annotations['gene'] = feat['gene']
                        source = '?' if feat.get('source', None) == bc.CDS_SOURCE_USER else 'Pyrodigal'
                        if prokka: 
                            source = 'Prodigal'

                        if other_genbank:
                            source = cds_tool

                        if(cfg.compliant):
                            gene_id = f"{feat['locus']}_gene"
                            annotations['Parent'] = gene_id
                            annotations['inference'] = 'EXISTENCE:non-experimental evidence, no additional details recorded' if feat.get('source', None) == bc.CDS_SOURCE_USER else 'ab initio prediction:Pyrodigal:3.5'
                            annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs'])  # remove INSDC invalid DbXrefs
                            annotations['Note'], ec_number = insdc.extract_ec_from_notes_insdc(annotations, 'Note')
                            if(ec_number is not None):
                                annotations['ec_number'] = ec_number
                            gene_annotations = {
                                'ID': gene_id,
                                'locus_tag': feat['locus']
                            }
                            if(feat.get('gene', None)):
                                gene_annotations['gene'] = feat['gene']
                            if(bc.PSEUDOGENE in feat):
                                gene_annotations[bc.INSDC_FEATURE_PSEUDOGENE] = bc.INSDC_FEATURE_PSEUDOGENE_TYPE_UNPROCESSED if feat[bc.PSEUDOGENE]['paralog'] else bc.INSDC_FEATURE_PSEUDOGENE_TYPE_UNITARY
                            gene_annotations = encode_annotations(gene_annotations)
                            fh.write(f"{seq_id}\t{source}\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                        if('exception' in feat):
                            ex = feat['exception']
                            pos = f"{ex['start']}..{ex['stop']}"
                            if(feat['strand'] == bc.STRAND_REVERSE):
                                pos = f"complement({pos})"
                            annotations['transl_except']=f"(pos:{pos},aa:{ex['aa']})"
                            notes = annotations.get('Note', [])
                            notes.append(f"codon on position {ex['codon_position']} is a {ex['type']} codon")
                            if('Notes' not in annotations):
                                annotations['Note'] = notes
                        annotations = encode_annotations(annotations)
                        fh.write(f"{seq_id}\t{source}\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n")
                        if(bc.FEATURE_SIGNAL_PEPTIDE in feat):
                            write_signal_peptide(fh, feat)
                elif(feat['type'] == bc.FEATURE_SORF):
                    annotations = {
                        'ID': feat['locus'],
                        'Name': feat['product'],
                        'locus_tag': feat['locus'],
                        'product': feat['product'],
                        'Dbxref': feat.get('db_xrefs', [])
                    }
                    if(feat.get('gene', None)):  # add gene annotation if available
                        annotations['gene'] = feat['gene']
                    if(cfg.compliant):
                        gene_id = f"{feat['locus']}_gene"
                        annotations['Parent'] = gene_id
                        annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs'])  # remove INSDC invalid DbXrefs
                        annotations['Note'], ec_number = insdc.extract_ec_from_notes_insdc(annotations, 'Note')
                        if(ec_number is not None):
                            annotations['ec_number'] = ec_number
                        gene_annotations = {
                            'ID': gene_id,
                            'locus_tag': feat['locus'],
                            'inference': 'ab initio prediction:Bakta'
                        }
                        if(feat.get('gene', None)):
                            gene_annotations['gene'] = feat['gene']
                        gene_annotations = encode_annotations(gene_annotations)
                        fh.write(f"{seq_id}\tBakta\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                    annotations = encode_annotations(annotations)
                    fh.write(f"{seq_id}\tBakta\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n")
                    if(bc.FEATURE_SIGNAL_PEPTIDE in feat):
                        write_signal_peptide(fh, feat)
                elif(feat['type'] == bc.FEATURE_GAP):
                    gap_tool="Bakta"
                    if prokka:
                        gap_tool="Prokka"
                    annotations = {
                        'ID': feat['id'],
                        'Name': f"gap ({feat['length']} bp)",
                        'product': f"gap ({feat['length']} bp)"
                    }
                    annotations = encode_annotations(annotations)
                    fh.write(f"{seq_id}\t{gap_tool}\t{so.SO_GAP.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                elif(feat['type'] == bc.FEATURE_ORIC):
                    annotations = {
                        'ID': feat['id'],
                        'Name': feat['product']
                    }
                    if(cfg.compliant):
                        annotations['Note'] = feat['product']
                    else:
                        annotations['product'] = feat['product']
                        annotations['inference'] = 'similar to DNA sequence'
                    annotations = encode_annotations(annotations)
                    feat_type = bc.INSDC_FEATURE_ORIGIN_REPLICATION if cfg.compliant else so.SO_ORIC.name
                    fh.write(f"{seq_id}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                elif(feat['type'] == bc.FEATURE_ORIV):
                    annotations = {
                        'ID': feat['id'],
                        'Name': feat['product']
                    }
                    if(cfg.compliant):
                        annotations['Note'] = feat['product']
                    else:
                        annotations['product'] = feat['product']
                        annotations['inference'] = 'similar to DNA sequence'
                    annotations = encode_annotations(annotations)
                    feat_type = bc.INSDC_FEATURE_ORIGIN_REPLICATION if cfg.compliant else so.SO_ORIC.name
                    fh.write(f"{seq_id}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                elif(feat['type'] == bc.FEATURE_ORIT):
                    annotations = {
                        'ID': feat['id'],
                        'Name': feat['product']
                    }
                    if(cfg.compliant):
                        annotations['Note'] = feat['product']
                    else:
                        annotations['product'] = feat['product']
                        annotations['inference'] = 'similar to DNA sequence'
                    annotations = encode_annotations(annotations)
                    feat_type = bc.INSDC_FEATURE_ORIGIN_TRANSFER if cfg.compliant else so.SO_ORIT.name
                    fh.write(f"{seq_id}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                elif(feat['type'] == bc.FEATURE_GENE):
                    write_gene_feature(fh, seq_id, feat)
                elif(feat['type'] == bc.FEATURE_MRNA):
                    write_mrna_feature(fh, seq_id, feat)

        if(not cfg.compliant):
            fh.write('##FASTA\n')
            for seq in data['sequences']:  # write sequences
                fh.write(f">{seq['id']}\n")
                seq_nt = seq['nt'] if 'nt' in seq else seq['sequence']  # <1.10.0 compatibility
                fh.write(fasta.wrap_sequence(seq_nt))
    return

`write_gene_feature(fh, seq_id, feat)`

Write a 'gene' feature including fuzzy boundaries.

Source code in src/baktfold/io/gff.py

def write_gene_feature(fh, seq_id, feat):
    """
    Write a single GFF3 'gene' line for a feature.

    Args:
      fh (file): The file handle to write to.
      seq_id (str): Sequence identifier used for column 1 and for the fallback ID.
      feat (dict): Feature with 'start', 'stop' and 'strand'; 'locus' and 'gene' are optional.

    Returns:
      None
    """
    begin, end = int(feat['start']), int(feat['stop'])
    orientation = feat['strand']

    # without a (non-empty) locus tag, derive an ID from the coordinates
    gene_id = feat.get('locus')
    if not gene_id:
        gene_id = f"{seq_id}_{begin}_{end}_{orientation}"

    fields = [f"ID={gene_id}"]
    gene_name = feat.get('gene')
    if gene_name is not None:
        fields.append(f"Name={gene_name}")
    column9 = ";".join(fields)

    fh.write(f"{seq_id}\tbaktfold\tgene\t{begin}\t{end}\t.\t{orientation}\t.\t{column9}\n")

`write_mrna_feature(fh, seq_id, feat)`

Write mRNA + implied exons based on join() structure.

Source code in src/baktfold/io/gff.py

def _join_attr_values(value):
    """Render an attribute value: lists are comma-joined, anything else is stringified."""
    if isinstance(value, list):
        return ",".join(str(v) for v in value)
    return str(value)


def write_mrna_feature(fh, seq_id, feat):
    """
    Write an mRNA line followed by its exon line(s).

    Exons come from the parallel 'starts'/'stops' lists when both are non-empty
    lists of equal length; otherwise a single exon spanning the whole feature is
    written. On the minus strand the exons are emitted (and numbered) in reverse
    list order so that numbering follows the 5'->3' direction.

    Args:
      fh (file): The file handle to write to.
      seq_id (str): Sequence identifier written in column 1 of every line.
      feat (dict): Feature with 'start', 'stop' and 'strand'. Optional keys:
        'locus', 'product', 'db_xrefs', 'note', 'starts', 'stops'.

    Returns:
      None
    """
    start = int(feat['start'])
    stop = int(feat['stop'])
    strand = feat['strand']

    # fall back if there is no locus tag
    locus = feat.get('locus') or f"{seq_id}_{start}_{stop}_{strand}"
    mrna_id = f"{locus}-T1"

    attrs = {
        "ID": mrna_id,
        "Parent": f"{locus}",
    }
    # Each optional attribute is rendered from its OWN value; previously the
    # 'note' attribute was accidentally filled from db_xrefs.
    optional_attrs = (
        ("product", feat.get("product")),
        ("Dbxref", feat.get("db_xrefs")),
        ("note", feat.get("note")),
    )
    for key, value in optional_attrs:
        if value:
            attrs[key] = _join_attr_values(value)

    attr_str = ";".join(f"{k}={v}" for k, v in attrs.items())
    fh.write(f"{seq_id}\tbaktfold\tmRNA\t{start}\t{stop}\t.\t{strand}\t.\t{attr_str}\n")

    starts = feat.get("starts")
    stops = feat.get("stops")
    has_parts = (
        isinstance(starts, list)
        and isinstance(stops, list)
        and len(starts) > 0
        and len(starts) == len(stops)
    )
    if has_parts:
        exon_parts = list(zip(starts, stops))
        if strand == "-":
            exon_parts.reverse()  # number exons in biological (5'->3') order
    else:
        # no usable starts/stops: the mRNA is a single exon
        exon_parts = [(start, stop)]

    # Exons reuse the caller's seq_id so they always sit on the same sequence
    # as their parent mRNA line (feat may not carry a 'sequence' key at all).
    for idx, (ex_start, ex_stop) in enumerate(exon_parts, start=1):
        exon_attrs = f"ID={mrna_id}.exon{idx};Parent={mrna_id}"
        fh.write(
            f"{seq_id}\tbaktfold\texon\t{ex_start}\t{ex_stop}\t.\t{strand}\t.\t{exon_attrs}\n"
        )

`write_signal_peptide(fh, feat)`

Writes a signal peptide feature to a file.

Parameters:

Name	Type	Description	Default
`fh`	`file`	The file handle to write to.	required
`feat`	`dict`	A dictionary containing the feature information.	required

Returns:

Type	Description
	None

Examples:

>>> write_signal_peptide(fh, {
    'locus': 'EHICP_3230',
    'sequence': 'DS571531.1',
    'strand': '+',
    'signal_peptide': {
        'start': 1,
        'stop': 20,
        'score': 0.5
    }
})

Source code in src/baktfold/io/gff.py

def write_signal_peptide(fh, feat: dict):  # <1.10.0 compatibility
    """
    Emit one GFF3 line describing the signal peptide attached to a feature.

    Args:
      fh (file): The file handle to write to.
      feat (dict): The parent feature; must hold 'locus', 'strand', the signal
        peptide entry (with 'start', 'stop', 'score') and either 'sequence'
        or the legacy 'contig' key.

    Returns:
      None

    Examples:
      >>> write_signal_peptide(fh, {
          'locus': 'EHICP_3230',
          'sequence': 'DS571531.1',
          'strand': '+',
          'signal_peptide': {
              'start': 1,
              'stop': 20,
              'score': 0.5
          }
      })
    """
    peptide = feat[bc.FEATURE_SIGNAL_PEPTIDE]
    parent_locus = feat['locus']
    score = peptide['score']
    attributes = encode_annotations({
        'ID': f"{parent_locus}_sigpep",
        'Name': 'signal peptide',
        'product': 'signal peptide',
        'score': score,
        'Parent': parent_locus
    })

    # records written before 1.10.0 store the sequence id under 'contig'
    if 'sequence' in feat:
        seq_id = feat['sequence']
    else:
        seq_id = feat['contig']

    fh.write(f"{seq_id}\tDeepSig\t{so.SO_SIGNAL_PEPTIDE.name}\t{peptide['start']}\t{peptide['stop']}\t{score:.2f}\t{feat['strand']}\t.\t{attributes}\n")

`export_sequences(sequences, fasta_path, description=False, wrap=False)`

Write sequences to Fasta file.

Source code in src/baktfold/io/fasta.py

def export_sequences(sequences: Sequence[dict], fasta_path: Path, description: bool=False, wrap: bool=False):
    """
    Write sequences to Fasta file.

    Args:
      sequences (Sequence[dict]): Records with an 'id' and the nucleotide
        sequence under 'nt' (or the legacy 'sequence' key); 'description' is
        required when `description` is True.
      fasta_path (Path): Output file; opened in text write mode.
      description (bool): Append the record description to the header line.
      wrap (bool): Wrap sequence lines via wrap_sequence() instead of writing
        the sequence on a single line.

    Returns:
      None
    """
    logger.info(f'write genome sequences: path={fasta_path}, description={description}, wrap={wrap}')

    with fasta_path.open('wt') as fh:
        for seq in sequences:
            if(description):
                fh.write(f">{seq['id']} {seq['description']}\n")
            else:
                fh.write(f">{seq['id']}\n")
            # Resolve the sequence once so that BOTH output modes honour the
            # legacy key; previously only the wrapped branch had the fallback.
            seq_nt = seq['nt'] if 'nt' in seq else seq['sequence']  # <1.10.0 compatibility
            if(wrap):
                fh.write(wrap_sequence(seq_nt))
            else:
                fh.write(seq_nt)
                fh.write('\n')

`import_sequences(sequences_path, is_genomic=True, is_dna=True)`

Import raw sequences from Fasta file.

Source code in src/baktfold/io/fasta.py

def import_sequences(sequences_path: Path, is_genomic: bool=True, is_dna: bool=True) -> Sequence[dict]:
    """
    Import raw sequences from Fasta file.

    Record ids have every '|' replaced by '~PIPE~'. Sequences are upper-cased
    and alignment gaps ('-') are discarded before validation.

    Args:
      sequences_path (Path): Fasta file to read (opened through xopen).
      is_genomic (bool): If True, each record is additionally tagged as an
        incomplete, linear contig.
      is_dna (bool): If True, validate against the DNA pattern and store the
        sequence under 'nt'; otherwise strip one trailing '*', validate against
        the amino-acid pattern and store it under 'aa'.

    Returns:
      Sequence[dict]: One dict per record with 'id', 'description', 'length'
        and 'nt' or 'aa' (plus 'complete', 'type', 'topology' when genomic).

    Raises:
      ValueError: If a sequence contains characters rejected by the DNA or
        amino-acid pattern.
    """
    sequences = []
    with xopen(str(sequences_path), threads=0) as fh:
        for record in SeqIO.parse(fh, 'fasta'):

            rid = record.id

            if "~PIPE~" in rid:
                # the two sentences used to be concatenated without a separator
                logger.error(
                    "Your proteins FASTA header has ~PIPE~ in the header. "
                    "Please remove all instances of ~PIPE~ before running Baktfold as this creates downstream issues with Foldseek"
                )
            else:
                rid = rid.replace("|", "~PIPE~")

            sequence = {
                'id': rid,
                'description': record.description.split(' ', maxsplit=1)[1] if ' ' in record.description else ''
            }

            raw_sequence = str(record.seq).upper()
            if('-' in raw_sequence):
                dash_count = raw_sequence.count('-')
                raw_sequence = raw_sequence.replace('-', '')
                logger.info('import: Discarded alignment gaps (dashes): id=%s, occurences=%i', record.id, dash_count)
            if(is_dna):
                if(FASTA_DNA_SEQUENCE_PATTERN.fullmatch(raw_sequence) is None):
                    logger.error('import: Fasta sequence contains invalid DNA characters! id=%s', record.id)
                    raise ValueError(f'Fasta sequence contains invalid DNA characters! id={record.id}')
                sequence['nt'] = raw_sequence
            else:
                # endswith() is safe on an empty record, where indexing [-1]
                # would raise IndexError before validation could report it
                if(raw_sequence.endswith('*')):  # remove trailing stop asterisk
                    raw_sequence = raw_sequence[:-1]
                    logger.warning('import: Removed trailing asterik! id=%s, seq=%s', record.id, raw_sequence)
                if(FASTA_AA_SEQUENCE_PATTERN.fullmatch(raw_sequence) is None):
                    logger.error('import: Fasta sequence contains invalid AA characters! id=%s, seq=%s', record.id, raw_sequence)
                    raise ValueError(f'Fasta sequence contains invalid AA characters! id={record.id}')
                sequence['aa'] = raw_sequence
            sequence['length'] = len(raw_sequence)
            if(is_genomic):
                sequence['complete'] = False
                sequence['type'] = bc.REPLICON_CONTIG
                sequence['topology'] = bc.TOPOLOGY_LINEAR
            logger.info(
                f"imported: id={record.id}, length={sequence['length']}, description={sequence['description']}, genomic={is_genomic}, dna={is_dna}"
            )
            sequences.append(sequence)
    return sequences

`wrap_sequence(sequence)`

Wraps a sequence into lines of 60 characters.

Parameters:

Name	Type	Description	Default
`sequence`	`str`	The sequence to wrap.	required

Returns:

Name	Type	Description
`str`		The wrapped sequence.

Examples:

>>> wrap_sequence('ARNDCQEGHILKMFPOSUTWYVBZXJ')
'ARNDCQEGHILKMFPOSUTWYVBZXJ\n'

Notes

This function is used to format sequences in FASTA files.

Source code in src/baktfold/io/fasta.py

def wrap_sequence(sequence: str):
    """
    Break a sequence into fixed-width lines of FASTA_LINE_WRAPPING characters.

    Args:
      sequence (str): The sequence to wrap.

    Returns:
      str: The wrapped sequence, always terminated by a newline.

    Examples:
      >>> wrap_sequence('ARNDCQEGHILKMFPOSUTWYVBZXJ')
      'ARNDCQEGHILKMFPOSUTWYVBZXJ\\n'

    Notes:
      This function is used to format sequences in FASTA files.
    """
    width = FASTA_LINE_WRAPPING
    chunks = [sequence[offset:offset + width] for offset in range(0, len(sequence), width)]
    return '\n'.join(chunks) + '\n'

`write_faa(features, faa_path)`

Write translated CDS sequences to Fasta file.

Source code in src/baktfold/io/fasta.py

def write_faa(features: Sequence[dict], faa_path: Path):
    """
    Write the amino-acid sequences of CDS and sORF features to a Fasta file.

    Args:
      features (Sequence[dict]): Features; CDS/sORF entries need 'locus',
        'product' and 'aa'. Other feature types are skipped.
      faa_path (Path): Output file; opened in text write mode.

    Returns:
      None
    """
    logger.info(f'write translated CDS/sORF: path={faa_path}')
    with faa_path.open('wt') as fh:
        for feat in features:
            feat_type = feat['type']
            is_protein_coding = feat_type == bc.FEATURE_CDS or feat_type == bc.FEATURE_SORF
            if(not is_protein_coding):
                continue
            fh.write(f">{feat['locus']} {feat['product']}\n{feat['aa']}\n")

`write_ffn(features, ffn_path)`

Write annotated nucleotide sequences to Fasta file.

Source code in src/baktfold/io/fasta.py

def write_ffn(features: Sequence[dict], ffn_path: Path):
    """
    Write the nucleotide sequences of annotated features to a Fasta file.

    Only the feature types listed below are exported. The header uses the
    feature's 'locus' when present, otherwise its 'id', followed by the
    product when one is set.

    Args:
      features (Sequence[dict]): Features carrying 'type' and 'nt'.
      ffn_path (Path): Output file; opened in text write mode.

    Returns:
      None
    """
    logger.info(f'write feature nucleotide sequences: path={ffn_path}')
    with ffn_path.open('wt') as fh:
        for feat in features:
            exported_types = [bc.FEATURE_T_RNA, bc.FEATURE_TM_RNA, bc.FEATURE_R_RNA, bc.FEATURE_NC_RNA, bc.FEATURE_NC_RNA_REGION, bc.FEATURE_CRISPR, bc.FEATURE_CDS, bc.FEATURE_SORF, bc.FEATURE_ORIC, bc.FEATURE_ORIV, bc.FEATURE_ORIT]
            if(feat['type'] not in exported_types):
                continue
            identifier = feat['locus'] if 'locus' in feat else feat['id']
            has_product = feat.get('product', '') != ''
            if(not has_product):
                fh.write(f">{identifier}\n{feat['nt']}\n")
                continue
            fh.write(f">{identifier} {feat['product']}\n{feat['nt']}\n")

Some code adapted from @mheinzinger

https://github.com/mheinzinger/ProstT5/blob/main/scripts/generate_foldseek_db.py

`create_foldseek_prostt5_gpu_db(fasta_aa, foldseek_db_path, db_dir, logdir)`

Create a Foldseek DB with ProstT5 3Di predictions from an amino-acid FASTA file using Foldseek-GPU

Parameters:

Name	Type	Description	Default
`fasta_aa`	`Path`	Path to the amino-acid FASTA file.	required
`foldseek_db_path`	`Path`	Path to the directory where Foldseek database will be stored.	required
`db_dir`	`Path`	Path to the baktfold DB	required
`logdir`	`Path`	Path to the directory where logs will be stored.	required

Returns:

Type	Description
`None`	None

Source code in src/baktfold/features/create_foldseek_db.py

def create_foldseek_prostt5_gpu_db(
    fasta_aa: Path, foldseek_db_path: Path, db_dir: Path, logdir: Path
) -> None:
    """
    Build a Foldseek DB from an amino-acid FASTA via `foldseek createdb --prostt5-model`.

    The ProstT5 model is expected in the "prostt5_weights" sub-directory of `db_dir`.

    Args:
        fasta_aa (Path): Path to the amino-acid FASTA file.
        foldseek_db_path (Path): Output path handed to `foldseek createdb`.
        db_dir (Path): Path to the baktfold DB
        logdir (Path): Path to the directory where logs will be stored.
    Returns:
        None
    """
    weights_dir = Path(db_dir) / "prostt5_weights"
    createdb_params = f"createdb {fasta_aa} {foldseek_db_path}  --prostt5-model {weights_dir}  "

    createdb_tool = ExternalTool(
        tool="foldseek",
        input="",
        output="",
        params=createdb_params,
        logdir=logdir,
    )
    ExternalTool.run_tool(createdb_tool)

`foldseek_tsv2db(in_tsv, out_db_name, db_type, logdir)`

Convert a Foldseek TSV file to a Foldseek database.

Parameters:

Name	Type	Description	Default
`in_tsv`	`Path`	Path to the input TSV file.	required
`out_db_name`	`Path`	Path for the output Foldseek database.	required
`db_type`	`int`	Type of the output database.	required
`logdir`	`Path`	Path to the directory where logs will be stored.	required

Returns:

Type	Description
`None`	None

Source code in src/baktfold/features/create_foldseek_db.py

def foldseek_tsv2db(
    in_tsv: Path, out_db_name: Path, db_type: int, logdir: Path
) -> None:
    """
    Run `foldseek tsv2db` to turn a TSV file into a Foldseek database.

    Args:
        in_tsv (Path): Path to the input TSV file.
        out_db_name (Path): Path for the output Foldseek database.
        db_type (int): Value passed to `--output-dbtype`.
        logdir (Path): Path to the directory where logs will be stored.

    Returns:
        None
    """
    tsv2db_params = f"tsv2db {in_tsv} {out_db_name}  --output-dbtype {str(db_type)} "

    # local is named differently from the function to avoid shadowing it
    tsv2db_tool = ExternalTool(
        tool="foldseek",
        input="",
        output="",
        params=tsv2db_params,
        logdir=logdir,
    )
    ExternalTool.run_tool(tsv2db_tool)

`generate_foldseek_db_from_aa_3di(fasta_aa, fasta_3di, foldseek_db_path, logdir, prefix)`

Generate Foldseek database from amino-acid and 3Di sequences.

Parameters:

Name	Type	Description	Default
`fasta_aa`	`Path`	Path to the amino-acid FASTA file.	required
`fasta_3di`	`Path`	Path to the 3Di FASTA file.	required
`foldseek_db_path`	`Path`	Path to the directory where Foldseek database will be stored.	required
`logdir`	`Path`	Path to the directory where logs will be stored.	required
`prefix`	`str`	Prefix for the Foldseek database.	required

Returns:

Type	Description
`None`	None

Source code in src/baktfold/features/create_foldseek_db.py

def generate_foldseek_db_from_aa_3di(
    fasta_aa: Path, fasta_3di: Path, foldseek_db_path: Path, logdir: Path, prefix: str
) -> None:
    """
    Build a Foldseek database from paired amino-acid and 3Di FASTA files.

    Records are paired by id. 3Di records without an amino-acid counterpart
    are ignored, and amino-acid records without a 3Di string are dropped; both
    cases are logged as warnings. Three temporary TSV files (aa, 3di, header)
    are written into `foldseek_db_path`, converted with `foldseek tsv2db` and
    then removed.

    Args:
        fasta_aa (Path): Path to the amino-acid FASTA file.
        fasta_3di (Path): Path to the 3Di FASTA file.
        foldseek_db_path (Path): Path to the directory where Foldseek database will be stored.
        logdir (Path): Path to the directory where logs will be stored.
        prefix (str): Prefix for the Foldseek database.

    Returns:
        None
    """
    # amino-acid sequences keyed by record id
    aa_by_id = {rec.id: str(rec.seq) for rec in SeqIO.parse(fasta_aa, "fasta")}

    # 3Di strings, restricted to ids that also have an amino-acid sequence
    tdi_by_id = {}
    for rec in SeqIO.parse(fasta_3di, "fasta"):
        if rec.id in aa_by_id:
            tdi_by_id[rec.id] = str(rec.seq)  # no upper if masked
            continue
        logger.warning(
            "Warning: ignoring 3Di entry {}, since it is not in the amino-acid FASTA file".format(
                rec.id
            )
        )

    # drop amino-acid entries that never received a 3Di string; the ids are
    # collected first so the dict is not mutated while being iterated
    orphan_ids = [seq_id for seq_id in aa_by_id if seq_id not in tdi_by_id]
    for orphan_id in orphan_ids:
        logger.warning(
            "Warning: entry {} in amino-acid FASTA file has no corresponding 3Di string".format(
                orphan_id
            )
        )
        logger.warning("Removing: entry {} from the Foldseek database ".format(orphan_id))
        del aa_by_id[orphan_id]

    # stream the three TSV files, sharing a 1-based running index
    db_dir = Path(foldseek_db_path)
    aa_tsv: Path = db_dir / "aa.tsv"
    tdi_tsv: Path = db_dir / "3di.tsv"
    header_tsv: Path = db_dir / "header.tsv"
    with open(aa_tsv, "w") as aa_out, \
         open(tdi_tsv, "w") as tdi_out, \
         open(header_tsv, "w") as header_out:
        for number, (seq_id, aa_seq) in enumerate(aa_by_id.items(), start=1):
            idx = str(number)
            aa_out.write(f"{idx}\t{aa_seq}\n")
            tdi_out.write(f"{idx}\t{tdi_by_id[seq_id]}\n")
            header_out.write(f"{idx}\t{seq_id}\n")

    # database names derived from the prefix
    base_name = f"{prefix}"
    aa_db: Path = db_dir / base_name
    tdi_db: Path = db_dir / f"{base_name}_ss"
    header_db: Path = db_dir / f"{base_name}_h"

    # convert each TSV with foldseek tsv2db
    foldseek_tsv2db(aa_tsv, aa_db, 0, logdir)
    foldseek_tsv2db(tdi_tsv, tdi_db, 0, logdir)
    foldseek_tsv2db(header_tsv, header_db, 12, logdir)

    # clean up the temporary TSV files
    for leftover in (aa_tsv, tdi_tsv, header_tsv):
        remove_file(leftover)

`generate_foldseek_db_from_structures(fasta_aa, foldseek_db_path, structure_dir, logdir, prefix, proteins_flag)`

Generate Foldseek database from structure files (.pdb or .cif).

Parameters:

Name	Type	Description	Default
`fasta_aa`	`Path`	Path to the amino-acid FASTA file.	required
`foldseek_db_path`	`Path`	Path to the directory where Foldseek database will be stored.	required
`structure_dir`	`Path`	Path to the directory containing .pdb or .cif structure files.	required
`logdir`	`Path`	Path to the directory where logs will be stored.	required
`prefix`	`str`	Prefix for the Foldseek database.	required
`proteins_flag`	`bool`	Flag - True if proteins-compare is run	required

Returns:

Type	Description
`None`	None

Source code in src/baktfold/features/create_foldseek_db.py

def generate_foldseek_db_from_structures(
    fasta_aa: Path,
    foldseek_db_path: Path,
    structure_dir: Path,
    logdir: Path,
    prefix: str,
    proteins_flag: bool,
) -> None:
    """
    Generate a Foldseek database from a directory of structure files.

    Every id in the amino-acid FASTA is checked against the .pdb/.cif file
    stems in `structure_dir`; missing or duplicated structures are only
    logged. `foldseek createdb` is then run on the whole directory.

    Args:
        fasta_aa (Path): Path to the amino-acid FASTA file.
        foldseek_db_path (Path): Path to the directory where Foldseek database will be stored.
        structure_dir (Path): Path to the directory containing .pdb or .cif structure files.
        logdir (Path): Path to the directory where logs will be stored.
        prefix (str): Prefix for the Foldseek database.
        proteins_flag (bool): Flag - True if proteins-compare is run (not read by this function).

    Returns:
        None
    """
    # amino-acid sequences keyed by record id
    sequences_aa = {rec.id: str(rec.seq) for rec in SeqIO.parse(fasta_aa, "fasta")}

    # group structure files by stem so each CDS id is a single dict lookup
    files_by_stem: dict = {}
    for fname in os.listdir(structure_dir):
        if fname.endswith((".pdb", ".cif")):
            # both extensions are 4 characters long
            files_by_stem.setdefault(fname[:-4], []).append(fname)

    matched = 0
    unmatched_cds_ids = []
    for cds_id in sequences_aa:
        hit_count = len(files_by_stem.get(cds_id, []))

        if hit_count == 0:
            logger.warning(f"No structure found for {cds_id}")
            logger.warning(f"{cds_id} will be ignored in annotation")
            unmatched_cds_ids.append(cds_id)
            continue

        # should never happen but in case
        if hit_count > 1:
            logger.warning(f"More than 1 structures found for {cds_id}")
            logger.warning("Taking the first one")
        matched += 1

    if matched == 0:
        logger.error(
            f"No structures with matching CDS ids were found at all. Check the {structure_dir} directory"
        )

    # generate the db from the complete structure directory
    db_target: Path = Path(foldseek_db_path) / f"{prefix}"
    createdb_tool = ExternalTool(
        tool="foldseek",
        input="",
        output="",
        params=f"createdb {structure_dir} {db_target} ",
        logdir=logdir,
    )
    ExternalTool.run_tool(createdb_tool)

`create_result_tsv(query_db, target_db, result_db, result_tsv, logdir, foldseek_gpu, structures, threads)`

Create a TSV file containing the results of a Foldseek search.

Parameters:

Name	Type	Description	Default
`query_db`	`Path`	Path to the query database.	required
`target_db`	`Path`	Path to the target database.	required
`result_db`	`Path`	Path to the result database generated by the search.	required
`result_tsv`	`Path`	Path to save the resulting TSV file.	required
`logdir`	`Path`	Path to the directory where logs will be stored.	required
`foldseek_gpu`	`bool`	Run Foldseek-GPU with accelerate ungapped prefilter	required
`structures`	`bool`	Whether structures were input (not ProstT5)	required
`threads`	`int`	Number of threads to use.	required

Returns:

Type	Description
`None`	None

Source code in src/baktfold/features/run_foldseek.py

def create_result_tsv(
    query_db: Path, target_db: Path, result_db: Path, result_tsv: Path, logdir: Path, foldseek_gpu: bool, structures: bool, threads: int
) -> None:
    """
    Export a Foldseek result database to TSV with `foldseek convertalis`.

    Args:
        query_db (Path): Path to the query database.
        target_db (Path): Path to the target database; "_gpu" is appended when `foldseek_gpu` is set.
        result_db (Path): Path to the result database generated by the search.
        result_tsv (Path): Path to save the resulting TSV file.
        logdir (Path): Path to the directory where logs will be stored.
        foldseek_gpu (bool): Run Foldseek-GPU with accelerate ungapped prefilter
        structures (bool): Whether structures were input (not ProstT5); adds the alntmscore and lddt columns.
        threads (int): Number of threads to use.

    Returns:
        None
    """
    format_string = (
        "--format-output query,target,bits,fident,evalue,qstart,qend,qlen,tstart,tend,tlen,alntmscore,lddt"
        if structures
        else "--format-output query,target,bits,fident,evalue,qstart,qend,qlen,tstart,tend,tlen"
    )
    searched_db = f"{target_db}_gpu" if foldseek_gpu else target_db

    cmd = f"convertalis {query_db} {searched_db} {result_db} {result_tsv} {format_string} --threads {threads}"

    convertalis_tool = ExternalTool(
        tool="foldseek",
        input="",
        output="",
        params=f"{cmd}",
        logdir=logdir,
    )
    ExternalTool.run_tool(convertalis_tool)

`run_foldseek_search(query_db, target_db, result_db, temp_db, threads, logdir, evalue, sensitivity, max_seqs, ultra_sensitive, extra_foldseek_params, foldseek_gpu, structures, gpus=None)`

Run a Foldseek search using given parameters.

Parameters:

Name	Type	Description	Default
`query_db`	`Path`	Path to the query database.	required
`target_db`	`Path`	Path to the target database.	required
`result_db`	`Path`	Path to store the result database.	required
`temp_db`	`Path`	Path to store temporary files.	required
`threads`	`int`	Number of threads to use for the search.	required
`logdir`	`Path`	Path to the directory where logs will be stored.	required
`evalue`	`float`	E-value threshold for the search.	required
`sensitivity`	`float`	Sensitivity threshold for the search.	required
`max_seqs`	`int`	Maximum results per query sequence allowed to pass the prefilter for foldseek.	required
`ultra_sensitive`	`bool`	Whether to skip foldseek prefilter for maximum sensitivity	required
`extra_foldseek_params`	`str`	Extra foldseek search params	required
`foldseek_gpu`	`bool`	Run Foldseek-GPU with accelerate ungapped prefilter	required
`structures`	`bool`	Run Foldseek with structures, not ProstT5 3Dis	required
`gpus`	`Optional[str]`	Comma-separated CUDA indices (e.g. "0,2") to restrict foldseek's GPU prefilter to a subset of devices. When `foldseek_gpu` is True and this resolves to ≥1 CUDA device, the foldseek subprocess gets `CUDA_VISIBLE_DEVICES` set accordingly. None = use all visible CUDA GPUs (foldseek default). Ignored when `foldseek_gpu` is False.	`None`

Returns:

Type	Description
`None`	None

Source code in src/baktfold/features/run_foldseek.py

def run_foldseek_search(
    query_db: Path,
    target_db: Path,
    result_db: Path,
    temp_db: Path,
    threads: int,
    logdir: Path,
    evalue: float,
    sensitivity: float,
    max_seqs: int,
    ultra_sensitive: bool,
    extra_foldseek_params: str,
    foldseek_gpu: bool,
    structures: bool,
    gpus: Optional[str] = None,
) -> None:
    """
    Assemble and run a `foldseek search` command.

    GPU mode takes precedence over `ultra_sensitive`: when `foldseek_gpu` is
    set, the "<target_db>_gpu" database is searched with `--gpu 1
    --prefilter-mode 1` and neither `-s` nor `--exhaustive-search` is passed.

    Args:
        query_db (Path): Path to the query database.
        target_db (Path): Path to the target database.
        result_db (Path): Path to store the result database.
        temp_db (Path): Path to store temporary files.
        threads (int): Number of threads to use for the search.
        logdir (Path): Path to the directory where logs will be stored.
        evalue (float): E-value threshold for the search.
        sensitivity (float): Sensitivity threshold for the search.
        max_seqs (int): Maximum results per query sequence allowed to pass the prefilter for foldseek.
        ultra_sensitive (bool): Whether to skip foldseek prefilter for maximum sensitivity
        extra_foldseek_params (str): Extra foldseek search params
        foldseek_gpu (bool): Run Foldseek-GPU with accelerate ungapped prefilter
        structures (bool): Run Foldseek with structures, not ProstT5 3Dis
        gpus (Optional[str]): Comma-separated CUDA indices (e.g. "0,2") to
            restrict foldseek's GPU prefilter to a subset of devices. When
            ``foldseek_gpu`` is True and this resolves to ≥1 CUDA device,
            the foldseek subprocess gets ``CUDA_VISIBLE_DEVICES`` set
            accordingly. None = use all visible CUDA GPUs (foldseek default).
            Ignored when ``foldseek_gpu`` is False.

    Returns:
        None
    """
    # support foldseek gpu only for the regular DB search for now
    if foldseek_gpu:
        cmd = f"search {query_db} {target_db}_gpu {result_db} {temp_db} --threads {str(threads)} -e {evalue}  --gpu 1 --prefilter-mode 1 --max-seqs {max_seqs}"
    elif ultra_sensitive:
        cmd = f"search {query_db} {target_db} {result_db} {temp_db} --threads {str(threads)} -e {evalue} -s {sensitivity} --exhaustive-search"
    else:
        cmd = f"search {query_db} {target_db} {result_db} {temp_db} --threads {str(threads)} -e {evalue} -s {sensitivity} --max-seqs {max_seqs}"

    if extra_foldseek_params:
        cmd = f"{cmd} {extra_foldseek_params}"

    # need -a 1 to compute the alignment so tmscore and lddt can be output (if using --structures)
    if structures:
        cmd = cmd + " -a 1"

    # Optional environment for restricting GPUs; only built in GPU mode and
    # only when the requested devices resolve to a CUDA_VISIBLE_DEVICES value.
    env = None
    if foldseek_gpu and gpus is not None:
        visible = cuda_visible_devices_value(parse_gpus(cpu=False, gpus=gpus))
        if visible is not None:
            env = {"CUDA_VISIBLE_DEVICES": visible}

    search_tool = ExternalTool(
        tool="foldseek",
        input="",
        output="",
        params=f"{cmd}",
        logdir=logdir,
        env=env,
    )
    ExternalTool.run_tool(search_tool)

`summarise_hits(result_db, result_db_greedy_best_hits, logdir, threads)`

Get all non-overlapping tophits covering a query (designed for CATH)

Parameters:

Name	Type	Description	Default
`result_db`	`Path`	Path to the result database generated by the search.	required
`result_db_greedy_best_hits`	`Path`	Path to save the greedy best hits results db.	required
`logdir`	`Path`	Path to the directory where logs will be stored.	required
`threads`	`int`	Number of threads to use.	required

Returns:

Type	Description
`None`	None

Source code in src/baktfold/features/run_foldseek.py

def summarise_hits(result_db: Path, result_db_greedy_best_hits: Path, logdir: Path, threads: int) -> None:
    """
    Run ``foldseek summarizeresult`` on a search result database.

    Collects all non-overlapping top hits covering a query (designed for CATH).

    Args:
        result_db (Path): Path to the result database generated by the search.
        result_db_greedy_best_hits (Path): Path to save the greedy best hits results db.
        logdir (Path): Path to the directory where logs will be stored.
        threads (int): Number of threads to use.

    Returns:
        None
    """

    # The command string (including the double space after the subcommand) is
    # handed to foldseek unchanged through the ExternalTool wrapper.
    summarize_params = f"summarizeresult  {result_db} {result_db_greedy_best_hits} --threads {threads} -a 1"

    ExternalTool.run_tool(
        ExternalTool(
            tool="foldseek",
            input="",
            output="",
            params=summarize_params,
            logdir=logdir,
        )
    )

3Di prediction for baktfold — wraps pholdlib's shared inference engine.

Baktfold-specific: flat cds_dict (no contig nesting), Bakta hypotheticals format with in-place annotation updates, has_duplicate_locus support.

Code adapted from @mheinzinger https://github.com/mheinzinger/ProstT5/blob/main/scripts/predict_3Di_encoderOnly.py

`get_embeddings(hypotheticals, cds_dict, out_path, prefix, model_dir, model_name, checkpoint_path, output_3di, output_h5_per_residue, output_h5_per_protein, half_precision, max_residues=100000, max_seq_len=30000, max_batch=10000, cpu=False, output_probs=True, save_per_residue_embeddings=False, save_per_protein_embeddings=False, threads=1, mask_threshold=0, has_duplicate_locus=False, gpus=None)`

Run ProstT5 + CNN 3Di prediction for all sequences in cds_dict.

Parameters:

Name	Type	Description	Default
`hypotheticals`	`List[Dict]`	List of Bakta feature dicts (mutated in-place with "3di").	required
`cds_dict`	`Dict[str, str]`	Flat `{seq_id: amino_acid_str}` dict.	required
`out_path`	`Path`	Directory for output files.	required
`prefix`	`str`	Filename prefix for CSV / JSONL outputs.	required
`model_dir`	`Path`	Directory where ProstT5 is cached.	required
`model_name`	`str`	HuggingFace model identifier.	required
`checkpoint_path`	`Path`	Path to the CNN `.pt` checkpoint.	required
`output_3di`	`Path`	Output FASTA path for 3Di sequences.	required
`output_h5_per_residue`	`Path`	HDF5 path for per-residue embeddings.	required
`output_h5_per_protein`	`Path`	HDF5 path for per-protein embeddings.	required
`half_precision`	`bool`	If True, cast model + predictor to fp16 after loading.	required
`max_residues`	`int`	Max total residues per inference batch.	`100000`
`max_seq_len`	`int`	Sequences longer than this flush a batch immediately.	`30000`
`max_batch`	`int`	Max sequences per batch.	`10000`
`cpu`	`bool`	Force CPU inference.	`False`
`output_probs`	`bool`	Whether to write per-residue probability JSONL.	`True`
`save_per_residue_embeddings`	`bool`	Save per-residue HDF5.	`False`
`save_per_protein_embeddings`	`bool`	Save per-protein HDF5.	`False`
`threads`	`int`	Number of CPU threads for torch.	`1`
`mask_threshold`	`float`	Residues with max softmax prob < threshold/100 → 'X'.	`0`
`has_duplicate_locus`	`bool`	If True use feat["id"] rather than feat["locus"].	`False`
`gpus`	`Optional[str]`	Comma-separated CUDA indices (e.g. "0,2"). None = auto-detect all visible CUDA GPUs. Overridden by `cpu=True`.	`None`

Returns:

Name	Type	Description
`predictions`	`Dict`	Flat `{seq_id: (pred, mean_prob, all_prob)}` dict, in original cds_dict key order.

Source code in src/baktfold/features/predict_3Di.py

def get_embeddings(
    hypotheticals: List[Dict],
    cds_dict: Dict[str, str],
    out_path: Path,
    prefix: str,
    model_dir: Path,
    model_name: str,
    checkpoint_path: Path,
    output_3di: Path,
    output_h5_per_residue: Path,
    output_h5_per_protein: Path,
    half_precision: bool,
    max_residues: int = 100000,
    max_seq_len: int = 30000,
    max_batch: int = 10000,
    cpu: bool = False,
    output_probs: bool = True,
    save_per_residue_embeddings: bool = False,
    save_per_protein_embeddings: bool = False,
    threads: int = 1,
    mask_threshold: float = 0,
    has_duplicate_locus: bool = False,
    gpus: Optional[str] = None,
) -> Dict:
    """Predict 3Di states with ProstT5 + CNN for every sequence in *cds_dict*.

    The work is split into four steps: resolve the compute devices, collect
    the usable sequences (longest first), delegate inference to
    ``run_prostt5_inference_multi_gpu``, then write all requested outputs.

    Args:
        hypotheticals: List of Bakta feature dicts (mutated in-place with "3di").
        cds_dict: Flat ``{seq_id: amino_acid_str}`` dict.
        out_path: Directory for output files.
        prefix: Filename prefix for CSV / JSONL outputs.
        model_dir: Directory where ProstT5 is cached.
        model_name: HuggingFace model identifier.
        checkpoint_path: Path to the CNN ``.pt`` checkpoint.
        output_3di: Output FASTA path for 3Di sequences.
        output_h5_per_residue: HDF5 path for per-residue embeddings.
        output_h5_per_protein: HDF5 path for per-protein embeddings.
        half_precision: Request fp16 inference; switched off when the only
            resolved device is the CPU.
        max_residues: Max total residues per inference batch.
        max_seq_len: Sequences longer than this flush a batch immediately.
        max_batch: Max sequences per batch.
        cpu: Force CPU inference.
        output_probs: Whether to write per-residue probability JSONL.
        save_per_residue_embeddings: Save per-residue HDF5.
        save_per_protein_embeddings: Save per-protein HDF5.
        threads: Number of CPU threads for torch.
        mask_threshold: Residues with max softmax prob < threshold/100 → 'X'.
        has_duplicate_locus: If True use feat["id"] rather than feat["locus"].
        gpus: Comma-separated CUDA indices (e.g. "0,2"). None = auto-detect
              all visible CUDA GPUs. Overridden by ``cpu=True``.

    Returns:
        predictions: Flat ``{seq_id: (pred, mean_prob, all_prob)}`` dict,
                     in original cds_dict key order.
    """
    # ── step 1: devices and precision ───────────────────────────────────────
    devices = parse_gpus(cpu, gpus)
    logger.info(f"Beginning ProstT5 predictions on device(s): {devices}")
    if half_precision and devices == ["cpu"]:
        logger.info("CPU device — forcing full-precision (half-precision disabled).")
        half_precision = False
    logger.info(
        "Using models in half-precision"
        if half_precision
        else "Using models in full-precision"
    )

    # ── step 2: collect usable sequences ────────────────────────────────────
    # Remember the caller's key order so results can be reported in it later.
    original_keys = list(cds_dict.keys())
    usable: List[Tuple] = []
    fail_ids: List[str] = []

    for k, seq in cds_dict.items():
        if not (isinstance(seq, str) and seq):
            # Anything that is not a non-empty string is recorded as a failure.
            logger.warning(
                f"Protein header {k} is corrupt or empty — will be saved in fails.tsv"
            )
            fail_ids.append(k)
            continue
        usable.append((k, seq, len(seq)))

    # Longest sequences first (minimises padding in each batch).
    seq_dict = sorted(usable, key=lambda item: item[2], reverse=True)

    # ── step 3: shared inference engine (single- or multi-GPU) ──────────────
    predictions, emb_res, emb_prot, inf_fail_ids = run_prostt5_inference_multi_gpu(
        seq_dict,
        devices=devices,
        model_dir=model_dir,
        model_name=model_name,
        checkpoint_path=checkpoint_path,
        half_precision=half_precision,
        threads=threads,
        check_fn=check_prostT5_download,
        zenodo_fn=download_zenodo_prostT5,
        max_residues=max_residues,
        max_seq_len=max_seq_len,
        max_batch=max_batch,
        output_probs=output_probs,
        save_per_residue_embeddings=save_per_residue_embeddings,
        save_per_protein_embeddings=save_per_protein_embeddings,
        desc="Predicting 3Di",
    )
    fail_ids.extend(inf_fail_ids)

    # Put the predictions back into the caller's key order.
    ordered_predictions = {}
    for key in original_keys:
        if key in predictions:
            ordered_predictions[key] = predictions[key]
    predictions = ordered_predictions

    # ── step 4: outputs ─────────────────────────────────────────────────────
    if fail_ids:
        write_fail_ids(fail_ids, Path(out_path) / "fails.tsv")

    write_predictions(
        hypotheticals, predictions, output_3di, mask_threshold, has_duplicate_locus
    )

    if save_per_residue_embeddings:
        write_embeddings(emb_res, output_h5_per_residue)
    if save_per_protein_embeddings:
        write_embeddings(emb_prot, output_h5_per_protein)

    mean_probs_path = Path(out_path) / f"{prefix}_prostT5_3di_mean_probabilities.csv"
    # The per-residue probability file is only requested when output_probs is set.
    if output_probs:
        all_probs_path = Path(out_path) / f"{prefix}_prostT5_3di_all_probabilities.json"
    else:
        all_probs_path = None
    write_probs(predictions, mean_probs_path, all_probs_path, original_keys)

    return predictions

`write_embeddings(embeddings, out_path)`

Write per-residue or per-protein embeddings to HDF5 (flat key structure).

Streams into a sibling temp file and renames it onto out_path on success. A crash mid-write (OOM, Ctrl-C, disk full) leaves the original out_path (if any) untouched rather than a truncated .h5 that the next run would silently accept as valid.

Source code in src/baktfold/features/predict_3Di.py

def write_embeddings(embeddings: Dict[str, Any], out_path: Path) -> None:
    """Store embeddings in an HDF5 file, one dataset per sequence ID.

    The file is opened at the temporary path yielded by
    ``atomic_write_path(out_path)`` rather than at ``out_path`` directly.
    Per the original design notes, that helper streams into a sibling temp
    file and renames it onto ``out_path`` on success, so a crash mid-write
    (OOM, Ctrl-C, disk full) leaves any existing ``out_path`` untouched
    instead of a truncated .h5 that the next run would silently accept.

    Args:
        embeddings: Mapping of sequence ID to the embedding data to store
            (per-residue or per-protein).
        out_path: Final location of the HDF5 file.
    """
    with atomic_write_path(out_path) as tmp, h5py.File(str(tmp), "w") as h5_file:
        # Flat key structure: the sequence ID is used directly as dataset name.
        for dataset_name, values in embeddings.items():
            h5_file.create_dataset(dataset_name, data=values)

`write_predictions(hypotheticals, predictions, out_path, mask_threshold, has_duplicate_locus=False)`

Write 3Di predictions to FASTA and update Bakta hypotheticals in-place.

Parameters:

Name	Type	Description	Default
`hypotheticals`	`List[Dict]`	List of Bakta feature dicts. Each is mutated in-place with a `"3di"` key set to the predicted 3Di string (or None if prediction failed / was skipped).	required
`predictions`	`Dict[str, Tuple]`	Flat `{seq_id: (pred, mean_prob, all_prob)}` dict.	required
`out_path`	`Path`	Output FASTA path.	required
`mask_threshold`	`float`	Residues with max softmax prob (0–100) below this threshold are replaced with 'X'.	required
`has_duplicate_locus`	`bool`	If True, use `feat["id"]` as seq_id (needed for eukaryotic inputs that may have duplicate locus tags). Otherwise use `feat["locus"]`.	`False`

Source code in src/baktfold/features/predict_3Di.py

def write_predictions(
    hypotheticals: List[Dict],
    predictions: Dict[str, Tuple],
    out_path: Path,
    mask_threshold: float,
    has_duplicate_locus: bool = False,
) -> None:
    """Write the 3Di FASTA file and record each 3Di string on its feature.

    Args:
        hypotheticals: List of Bakta feature dicts. Each is mutated in-place
                       with a ``"3di"`` key set to the predicted 3Di string
                       (or None if no prediction is available for it).
        predictions: Flat ``{seq_id: (pred, mean_prob, all_prob)}`` dict.
                     The ``pred`` arrays are modified in place by the
                     confidence masking.
        out_path: Output FASTA path.
        mask_threshold: Residues with max softmax prob (0–100) below this
                        threshold are replaced with 'X'.
        has_duplicate_locus: If True, use ``feat["id"]`` as seq_id (needed for
                             eukaryotic inputs that may have duplicate locus tags).
                             Otherwise use ``feat["locus"]``.
    """
    # The threshold is given on a 0–100 scale; it is compared on a 0–1 scale.
    mask_prop = mask_threshold / 100

    # Pass 1: keep only predictions with at least one state (issue #47).
    usable = {}
    for key, record in predictions.items():
        if len(record[0]) > 0:
            usable[key] = record

    # Pass 2: overwrite low-confidence positions with state 20 ('X'), in place.
    for record in usable.values():
        pred, _mean_prob, all_prob = record
        pred[all_prob[0] < mask_prop] = 20

    id_field = "id" if has_duplicate_locus else "locus"

    with open(out_path, "w") as out_f:
        for feat in hypotheticals:
            seq_id = feat[id_field]
            record = usable.get(seq_id)
            if record is None:
                # no prediction (OOM / corrupt entry)
                feat["3di"] = None
                continue
            threedi_seq = "".join(SS_MAPPING[int(state)] for state in record[0])
            feat["3di"] = threedi_seq  # stored on the Bakta feature dict itself
            out_f.write(f">{seq_id}\n{threedi_seq}\n")

    logger.info(f"Finished writing 3Di FASTA to {out_path}")

`autotune_batching_real_data(model_dir, model_name, cpu, threads, probe_seqs, start_bs=1, max_bs=100, step=5, device=None)`

Autotunes the batch size for a given model and set of sequences.

Parameters:

Name	Type	Description	Default
`model_dir`	`str`	The directory where the model is stored.	required
`model_name`	`str`	The name of the model.	required
`cpu`	`bool`	Whether to use the CPU or not.	required
`threads`	`int`	The number of threads to use.	required
`probe_seqs`	`list`	A list of sequences to use for probing.	required
`start_bs`	`int`	The starting batch size to use.	`1`
`max_bs`	`int`	The maximum batch size to use.	`100`
`step`	`int`	The step size to use when increasing the batch size.	`5`
`device`	`Optional[str]`	Torch device string (e.g. "cuda:1") to pin autotune to a specific GPU. None preserves the original auto-detection behaviour. Used by the multi-GPU caller.	`None`

Returns:

Name	Type	Description
`int`		The optimal batch size.
`int`		The maximum number of residues per batch.

Examples:

>>> autotune_batching_real_data("model_dir", "model_name", True, 4, ["ATCG", "GCTA"], 1, 100, 5)
(10, 100)

Source code in src/baktfold/features/autotune.py

def autotune_batching_real_data(
    model_dir,
    model_name,
    cpu,
    threads,
    probe_seqs,
    start_bs=1,
    max_bs=100,
    step=5, # step size
    device: Optional[str] = None,
):
    """
    Autotunes the batch size for a given model and set of sequences.

    Every candidate batch size from ``start_bs`` to ``max_bs`` (in increments
    of ``step``) is timed over all of ``probe_seqs``; the candidate with the
    lowest time per token wins. The search stops at the first candidate that
    raises a ``RuntimeError`` (which includes CUDA out-of-memory errors).

    Args:
      model_dir (str): The directory where the model is stored.
      model_name (str): The name of the model.
      cpu (bool): Whether to use the CPU or not.
      threads (int): The number of threads to use.
      probe_seqs (list): A list of sequences to use for probing. Must contain
        at least one non-empty sequence.
      start_bs (int): The starting batch size to use.
      max_bs (int): The maximum batch size to use.
      step (int): The step size to use when increasing the batch size.
      device (Optional[str]): Torch device string (e.g. "cuda:1") to pin
        autotune to a specific GPU. None preserves the original
        auto-detection behaviour. Used by the multi-GPU caller.

    Returns:
      int: The optimal batch size.
      int: The mean number of tokens per batch (rounded down) observed at
        that batch size.

    Raises:
      ValueError: If ``probe_seqs`` holds no residues at all.
      RuntimeError: If not even the first candidate batch size completed.
    """

    # Fail fast: with nothing to time, the per-token and per-batch averages
    # below would divide by zero after the model had already been loaded.
    if sum(len(s) for s in probe_seqs) == 0:
        raise ValueError(
            "probe_seqs must contain at least one non-empty sequence to autotune the batch size"
        )

    model, tokenizer, device = get_T5_model(
        model_dir, model_name, cpu, threads, device=device
    )
    model.eval()
    # NOTE(review): the model is always cast to fp16 here, also when cpu=True —
    # confirm this is intended.
    model.half()

    bs = start_bs
    results = []

    while bs <= max_bs:
        try:
            logger.info(f"Running with batch size {bs}")

            total_tokens = 0
            total_time = 0.0
            batches = 0

            # iterate over real sequences in batches
            for i in tqdm(range(0, len(probe_seqs), bs), desc="Processing"):
                batch_seqs = probe_seqs[i : i + bs]

                total_tokens += sum(len(s) for s in batch_seqs)

                inputs = tokenizer(
                    batch_seqs,
                    padding=True,
                    return_tensors="pt",
                )
                inputs.pop("token_type_ids", None)
                inputs = {k: v.to(device) for k, v in inputs.items()}

                # timing — device_synchronize handles CUDA/MPS/XPU/CPU (PR #129)
                device_synchronize(device)
                t0 = time.perf_counter()
                with torch.no_grad():
                    _ = model(**inputs)
                device_synchronize(device)

                total_time += time.perf_counter() - t0

                batches += 1

            time_per_token = total_time / total_tokens
            token_per_batch = math.floor(total_tokens / batches)

            results.append({
                "bs": bs,
                "tokens_per_batch": token_per_batch,
                "time": total_time,
                "time_per_token": time_per_token,
            })

            logger.info(f"Time elapsed {round(total_time,5)}")
            logger.info(f"Tokens per batch {token_per_batch}")

            bs += step

        except (torch.cuda.OutOfMemoryError, RuntimeError) as exc:
            # RuntimeError covers XPU/MPS OOM; torch.cuda.OutOfMemoryError covers CUDA.
            # The catch is broad, so record what actually stopped the search
            # instead of discarding it silently.
            logger.warning(f"Batch size {bs} failed ({exc}); stopping batch size search")
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            break

    if not results:
        raise RuntimeError("No batch size fits on this GPU")

    # Lowest time per token = highest throughput.
    best_entry = min(results, key=lambda x: x["time_per_token"])

    best_bs = best_entry["bs"]
    best_residues = best_entry["tokens_per_batch"]

    logger.info(f"##########################")
    logger.info(f"Best batch size: {best_bs}")

    return best_bs, best_residues

`run_autotune(input_path, model_dir, model_name, cpu, threads, step, min_batch, max_batch, sample_seqs, gpus=None)`

Runs the batch size autotuning process.

Parameters:

Name	Type	Description	Default
`input_path`	`str`	The path to the input file.	required
`model_dir`	`str`	The directory where the model is stored.	required
`model_name`	`str`	The name of the model.	required
`cpu`	`bool`	Whether to use the CPU or not.	required
`threads`	`int`	The number of threads to use.	required
`step`	`int`	The step size to use when increasing the batch size.	required
`min_batch`	`int`	The minimum batch size to use.	required
`max_batch`	`int`	The maximum batch size to use.	required
`sample_seqs`	`int`	The number of sequences to sample for probing.	required
`gpus`	`Optional[str]`	Comma-separated CUDA indices (e.g. "0,2"). When set, autotune runs on the lowest selected index. Default None = existing behaviour (cuda:0 / mps / xpu / cpu auto-detect).	`None`

Returns:

Name	Type	Description
`int`		The optimal batch size.

Examples:

>>> run_autotune("input_path", "model_dir", "model_name", True, 4, 5, 1, 100, 10)
10

Source code in src/baktfold/features/autotune.py

def run_autotune(
    input_path,
    model_dir,
    model_name,
    cpu,
    threads,
    step,
    min_batch,
    max_batch,
    sample_seqs,
    gpus: Optional[str] = None,
):
    """
    Runs the batch size autotuning process.

    Reads the protein FASTA at ``input_path``, samples probe sequences from it
    and hands them to ``autotune_batching_real_data``.

    Args:
      input_path (str): The path to the input file.
      model_dir (str): The directory where the model is stored.
      model_name (str): The name of the model.
      cpu (bool): Whether to use the CPU or not.
      threads (int): The number of threads to use.
      step (int): The step size to use when increasing the batch size.
      min_batch (int): The minimum batch size to use.
      max_batch (int): The maximum batch size to use.
      sample_seqs (int): The number of sequences to sample for probing.
      gpus (Optional[str]): Comma-separated CUDA indices (e.g. "0,2"). When
        the resolved device list is anything other than ``["cpu"]``, autotune
        is pinned to the first device returned by ``parse_gpus``. Default
        None = existing behaviour (cuda:0 / mps / xpu / cpu auto-detect).

    Returns:
      int: The optimal batch size.
    """

    # Resolve devices early so we can pick the autotune GPU (homogeneous-card
    # assumption: same batch size applies to every GPU we'll later use).
    devices = parse_gpus(cpu, gpus)
    autotune_device: Optional[str] = None
    if len(devices) >= 1 and devices != ["cpu"]:
        autotune_device = devices[0]
    if len(devices) > 1:
        logger.info(
            f"Multi-GPU detected ({len(devices)} devices); autotuning on "
            f"{autotune_device} and applying the chosen batch to all devices."
        )

    # Amino acid sequence per record ID. Keying by ID means a repeated ID
    # keeps only its last sequence.
    seqs_by_id = {}

    with open_protein_fasta_file(input_path) as handle:  # handles gzip too
        records = list(SeqIO.parse(handle, "fasta"))
        if not records:
            logger.warning(f"No proteins were found in your input file {input_path}.")
            logger.error(
                f"Your input file {input_path} is likely not a amino acid FASTA file. Please check this."
            )
        for record in records:
            seqs_by_id[record.id] = str(record.seq)

    if not seqs_by_id:
        logger.error(f"Error: no AA protein sequences found in {input_path} file")

    # Records with an empty sequence cannot be used for probing.
    seqs = [seq for seq in seqs_by_id.values() if seq]

    logger.info("Beginning batch size tuning")
    # Report the minimum that is actually passed to the tuner below.
    logger.info(
        f"Using minimum batch size of {min_batch} and maximum batch size of {max_batch}"
    )

    probe_seqs = sample_probe_sequences(seqs, n=sample_seqs)

    batch_size, max_residues = autotune_batching_real_data(
        model_dir,
        model_name,
        cpu,
        threads,
        probe_seqs,
        start_bs=min_batch,
        max_bs=max_batch,
        step=step, # step size
        device=autotune_device,
    )

    logger.info(f"Optimal batch size is {batch_size} (residues per batch {max_residues})")

    return batch_size

`sample_probe_sequences(seqs, n=5000, seed=0)`

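Samples up to `n` sequences without replacement (using a seeded random generator) and returns them sorted by length, longest first.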

Source code in src/baktfold/features/autotune.py

def sample_probe_sequences(seqs, n=5000, seed=0):
    """
    Pick the probe sequences used for batch size tuning.

    Args:
      seqs (list): Candidate sequences.
      n (int): Number of sequences to draw. When ``n`` is at least
        ``len(seqs)``, every sequence is used.
      seed (int): Seed for the random generator, so the draw is reproducible.

    Returns:
      list: A new list with the chosen sequences, longest first. ``seqs``
      itself is left unmodified.
    """

    generator = random.Random(seed)

    take_everything = n >= len(seqs)
    chosen = list(seqs) if take_everything else generator.sample(seqs, n)

    # Longest first; sequences of equal length keep their relative order.
    return sorted(chosen, key=len, reverse=True)

`annotate_aa(aas)`

Combines IPS and PSC annotations and marks hypotheticals.

Parameters:

Name	Type	Description	Default
`aas`	`Sequence[dict]`	A sequence of amino acid dictionaries to annotate.	required

Returns:

Type	Description
	None

Examples:

>>> aas = [{'sequence': 'ATG', 'start': 1, 'stop': 3, 'strand': '+'}]
>>> annotate_aa(aas)
>>> aas
[{'sequence': 'ATG', 'start': 1, 'stop': 3, 'strand': '+'}]

Source code in src/baktfold/bakta/annotation.py

def annotate_aa(aas: Sequence[dict], fast: bool = False, structures: bool = False):
    """
    Combines the annotations of every feature and marks hypotheticals.

    Each feature dict is passed to ``combine_annotation`` and is modified in
    place by it. Progress is reported on stdout.

    Args:
      aas (Sequence[dict]): A sequence of amino acid feature dictionaries to annotate.
      fast (bool): Forwarded to ``combine_annotation`` (if True, skips AFDB).
      structures (bool): Forwarded to ``combine_annotation`` (True for
        structure input, where no ProstT5 confidence is available).

    Returns:
      None
    """

    print('\tcombine annotations and mark hypotheticals...')

    for aa in aas:
        # combine_annotation requires `fast`; calling it with the feature
        # alone raises a TypeError.
        combine_annotation(aa, fast, structures)
    logger.debug('analyze hypotheticals')
    hypotheticals = [aa for aa in aas if 'hypothetical' in aa]
    if hypotheticals:
        print(f'\tanalyze hypothetical proteins: {len(hypotheticals)}')
        print('\tcalculated proteins statistics')

`assign_annotation_confidence(entry, prostt5_confidence, structures)`

Classify a Foldseek hit into a confidence tier: 'high', 'medium' or 'low'.

Mirrors phold's heuristic (phold.subcommands.compare.assign_annotation_confidence). entry is the hierarchy-selected pstc hit carrying query_cov (qCov), subject_cov (tCov), identity (fident) and evalue. prostt5_confidence is the protein's mean ProstT5 confidence (0-100), or None for structure input. With structures (or no ProstT5 confidence available) the ProstT5 criteria are dropped.

Source code in src/baktfold/bakta/annotation.py

def assign_annotation_confidence(entry: dict, prostt5_confidence, structures: bool) -> str:
    """Return the confidence tier ('high', 'medium' or 'low') of a Foldseek hit.

    Mirrors phold's heuristic (phold.subcommands.compare.assign_annotation_confidence).

    ``entry`` is the hierarchy-selected pstc hit; it is read for ``query_cov``
    (qCov), ``subject_cov`` (tCov), ``identity`` (fident) and ``evalue``.
    Missing coverage/identity values count as 0.0; a missing or unparsable
    e-value counts as 1.0. ``prostt5_confidence`` is the protein's mean
    ProstT5 confidence (0-100), or None for structure input.

    Tiers:
      * 'high'   – both coverages above 0.8 plus supporting evidence.
      * 'medium' – at least one coverage above 0.8 plus (weaker) evidence.
      * 'low'    – everything else.

    With ``structures`` (or no ProstT5 confidence available) the ProstT5
    criteria are dropped from the evidence checks.
    """
    query_cov = entry.get('query_cov', 0.0)
    target_cov = entry.get('subject_cov', 0.0)
    identity = entry.get('identity', 0.0)
    try:
        evalue = float(entry.get('evalue', 1.0))
    except (TypeError, ValueError):
        evalue = 1.0

    use_prostt5 = not structures and prostt5_confidence is not None
    query_covered = query_cov > 0.8
    target_covered = target_cov > 0.8

    if query_covered and target_covered:
        if use_prostt5:
            # ProstT5 path: a mean 3Di confidence above 60 also counts as evidence
            strong = identity > 0.3 or prostt5_confidence > 60 or evalue < 1e-10
        else:
            strong = identity > 0.3 or evalue < 1e-10
        if strong:
            return 'high'

    if query_covered or target_covered:
        if use_prostt5:
            # ProstT5 path: the e-value cut-off is mandatory here, not an alternative
            moderate = (identity > 0.3 or 45 <= prostt5_confidence <= 60) and evalue < 1e-5
        else:
            moderate = identity > 0.3 or evalue < 1e-5
        if moderate:
            return 'medium'

    return 'low'

`attach_prostt5_confidence(features, mean_probs_csv, has_duplicate_locus=False)`

Attach each CDS's mean ProstT5 confidence (0-100) from the {prefix}_prostT5_3di_mean_probabilities.csv file (cds_id,mean_prob, no header) as feature['prostt5_confidence']. No-op if the file is absent (e.g. structure input).

Source code in src/baktfold/bakta/annotation.py

def attach_prostt5_confidence(features: Sequence[dict], mean_probs_csv, has_duplicate_locus: bool = False) -> None:
    """Copy per-CDS mean ProstT5 confidence values (0-100) onto the features.

    The values are read from the
    ``{prefix}_prostT5_3di_mean_probabilities.csv`` file (``cds_id,mean_prob``,
    no header). Rows with fewer than two comma-separated fields, or whose
    second field is not a number, are ignored. Every feature whose identifier
    (``feature['id']`` when ``has_duplicate_locus`` is set, otherwise
    ``feature['locus']``) appears in the file receives
    ``feature['prostt5_confidence']``; all other features are left untouched.

    No-op if the file is absent (e.g. structure input).
    """
    scores = {}
    try:
        with open(mean_probs_csv) as handle:
            for row in handle:
                fields = row.rstrip('\n').split(',')
                if len(fields) < 2:
                    continue
                try:
                    scores[fields[0]] = float(fields[1])
                except ValueError:
                    # non-numeric confidence column: skip this row
                    pass
    except FileNotFoundError:
        return

    id_field = 'id' if has_duplicate_locus else 'locus'
    for feature in features:
        seq_id = feature.get(id_field)
        if seq_id in scores:
            feature['prostt5_confidence'] = scores[seq_id]

`calc_annotation_score(orf)`

Calculates the annotation score for a given ORF.

Parameters:

Name	Type	Description	Default
`orf`	`dict`	The ORF to calculate the annotation score for.	required

Returns:

Name	Type	Description
`int`	`int`	The annotation score for the given ORF.

Examples:

>>> calc_annotation_score(orf)

Source code in src/baktfold/bakta/annotation.py

def calc_annotation_score(orf:dict) -> int:
    """
    Scores how much annotation an ORF carries.

    One point is given for a non-empty ``gene`` entry and one for a non-empty
    ``product`` entry, so the score ranges from 0 to 2.

    Args:
      orf (dict): The ORF to calculate the annotation score for.

    Returns:
      int: The annotation score for the given ORF.

    Examples:
      >>> calc_annotation_score({'gene': 'dnaA', 'product': None})
      1
    """
    scored_keys = ('gene', 'product')
    return sum(1 for key in scored_keys if orf.get(key, None))

`combine_annotation(feature, fast, structures=False)`

Combines annotation information from different sources into a single feature.

Parameters:

Name	Type	Description	Default
`feature`	`dict`	The feature to combine annotation for.	required
`fast`	`bool`	If True, skips AFDB	required
`structures`	`bool`	True for structure input (no ProstT5 confidence available)	`False`

Returns:

Type	Description
	None

Examples:

>>> combine_annotation(feature, fast=False)

Source code in src/baktfold/bakta/annotation.py

def combine_annotation(feature: dict, fast: bool, structures: bool = False):
    """
    Combines annotation information from different sources into a single feature.

    The feature's ``pstc`` hits are searched for one hit per source and the
    winner is chosen by the hierarchy custom > SwissProt > AFDB > PDB > CATH.
    The winner's description becomes the product, every hit contributes a
    database cross-reference, and the feature is marked as baktfold-annotated.
    Finally the product is revised and the feature is marked or unmarked as
    hypothetical.

    The feature is modified in place: ``db_xrefs`` is always (re)written as a
    sorted list, and ``product``, ``annotation_confidence``, ``tmscore`` and
    ``lddt`` are set when available. A ``db_xrefs`` list already present on
    the feature is appended to directly.

    Args:
      feature (dict): The feature to combine annotation for.
      fast (bool): If True, skips AFDB
      structures (bool): True for structure input (no ProstT5 confidence available)
    Returns:
      None

    Examples:
      >>> combine_annotation(feature, fast=False)
    """
    pstc = feature.get('pstc', None)
    product = feature.get('product', None)
    db_xrefs = feature.get('db_xrefs', [])

    if pstc:
        # A single hit (dict) or a single cross-reference (str) is wrapped so
        # the code below can always iterate.
        if isinstance(pstc, (dict, str)):
            pstc = [pstc]

        def first_hit(source_name):
            # First dict entry whose 'source' equals source_name, else None.
            for hit in pstc:
                if isinstance(hit, dict) and hit.get('source') == source_name:
                    return hit
            return None

        afdb_entry = None if fast else first_hit('afdb')
        swissprot_entry = first_hit('swissprot')
        pdb_entry = first_hit('pdb')
        cath_entry = first_hit('cath')
        custom_entry = first_hit('custom_db')

        # winning hit by hierarchy: custom > swissprot > afdb > pdb > cath
        selected_entry = custom_entry or swissprot_entry or afdb_entry or pdb_entry or cath_entry
        if selected_entry:
            pstc_product = selected_entry['description']
            if pstc_product:
                product = pstc_product

        # Collect the cross-references of all hits, not just the winner.
        for entry in pstc:
            if isinstance(entry, str):
                # Preserve any existing string cross-references
                db_xrefs.append(entry)
                continue
            if not isinstance(entry, dict):
                continue
            src = entry.get('source', '').lower()
            eid = entry.get('id')
            if not eid:
                continue
            if src == 'afdb':
                if not fast:
                    db_xrefs.append(f"afdb_v6:afdbclusters_{eid}")
            elif src == 'swissprot':
                db_xrefs.append(f"afdb_v6:swissprot_{eid}")
            elif src == 'pdb':
                db_xrefs.append(f"pdb:pdb_{eid}")
            elif src == 'cath':
                db_xrefs.append(f"cath:cath_{eid}")
            elif src == 'custom_db':
                db_xrefs.append(f"custom:custom_{eid}")
            else:
                db_xrefs.append(eid)

        mark_as_baktfold(feature)

        # annotation confidence (high/medium/low) from the hierarchy-selected
        # hit. Surfaced in JSON + TSV only (not gff3/gbff/embl). For structure
        # input also carry through the Foldseek TM-score and LDDT.
        if selected_entry is not None:
            feature['annotation_confidence'] = assign_annotation_confidence(
                selected_entry, feature.get('prostt5_confidence'), structures
            )
            if structures:
                for metric in ('tmscore', 'lddt'):
                    if metric in selected_entry:
                        feature[metric] = selected_entry[metric]

    # NOTE: expert-hit and gene-symbol handling is disabled in this function;
    # only the product is revised.
    revised_product = None
    if product and "hypothetical protein" not in product.lower():
        revised_product = revise_cds_product(product)

    if revised_product:
        if cfg.compliant:
            revised_product = insdc.revise_product_insdc(revised_product)
        feature['product'] = revised_product
        unmark_as_hypothetical(feature)
    else:
        # No product, a hypothetical product, or nothing left after revision.
        mark_as_hypothetical(feature)

    feature['db_xrefs'] = sorted(db_xrefs)

`extract_protein_gene_symbol(product)`

Extracts a valid gene symbol from a protein name.

Parameters:

Name	Type	Description	Default
`product`	`str`	The protein name to extract a gene symbol from.	required

Returns:

Name	Type	Description
`str`	`str`	The extracted gene symbol.

Examples:

>>> extract_protein_gene_symbol(product)

Source code in src/baktfold/bakta/annotation.py

def extract_protein_gene_symbol(product: str) -> str:
    """
    Pick a gene symbol out of the words of a protein name.

    The name is split on single spaces. Every word that fully matches
    RE_GENE_SYMBOL is taken as is; every word that fully matches
    RE_PROTEIN_SYMBOL is taken with its first character lowercased.

    Args:
      product (str): The protein name to scan for gene symbols.

    Returns:
      str: None if no word matched, the only symbol if exactly one word
        matched, otherwise the second symbol found.

    Examples:
      >>> extract_protein_gene_symbol(product)
    """
    candidates = []
    for token in product.split(' '):
        gene_match = RE_GENE_SYMBOL.fullmatch(token)
        if gene_match:
            candidate = gene_match[0]
        else:
            protein_match = RE_PROTEIN_SYMBOL.fullmatch(token)
            if not protein_match:
                continue  # neither a gene nor a protein symbol
            candidate = protein_match[0]
            candidate = candidate[0].lower() + candidate[1:]  # protein symbol -> gene symbol casing
        logger.info('fix gene: extract symbol from protein name. symbol=%s', candidate)
        candidates.append(candidate)

    if not candidates:
        return None
    if len(candidates) == 1:
        return candidates[0]
    # With several hits the first one often names a broader gene family
    # (e.g. "xyz family transcriptional regulator ..."), so prefer the second.
    return candidates[1]

`mark_as_baktfold(feature)`

Adds the baktfold key to the given feature dictionary.

Parameters:

Name	Type	Description	Default
`feature`	`dict`	The feature dictionary to add the baktfold key to.	required

Returns:

Type	Description
	None

Examples:

>>> feature = {'sequence': 'ATG', 'start': 1, 'stop': 3, 'strand': '+'}
>>> mark_as_baktfold(feature)
>>> feature
{'sequence': 'ATG', 'start': 1, 'stop': 3, 'strand': '+', 'baktfold': True}

Source code in src/baktfold/bakta/annotation.py

def mark_as_baktfold(feature: dict):
    """
    Set the 'baktfold' key of a feature dictionary to True.

    Args:
      feature (dict): The feature dictionary; it is modified in place.

    Returns:
      None

    Examples:
      >>> feature = {'sequence': 'ATG', 'start': 1, 'stop': 3, 'strand': '+'}
      >>> mark_as_baktfold(feature)
      >>> feature
      {'sequence': 'ATG', 'start': 1, 'stop': 3, 'strand': '+', 'baktfold': True}
    """
    # No per-feature log message is emitted here.
    feature.update({'baktfold': True})

`mark_as_hypothetical(feature)`

Marks a feature as hypothetical.

Parameters:

Name	Type	Description	Default
`feature`	`dict`	The feature to mark as hypothetical.	required

Returns:

Type	Description
	None

Examples:

>>> mark_as_hypothetical(feature)

Source code in src/baktfold/bakta/annotation.py

def mark_as_hypothetical(feature: dict):
    """
    Reset a feature's annotation to the hypothetical-protein state.

    The feature is modified in place: 'hypothetical' is set to True, 'gene'
    to None, 'genes' to an empty list and 'product' to
    bc.HYPOTHETICAL_PROTEIN.

    Args:
      feature (dict): The feature to mark as hypothetical.

    Returns:
      None

    Examples:
      >>> mark_as_hypothetical(feature)
    """
    # No per-feature log message is emitted here.
    feature.update({
        'hypothetical': True,
        'gene': None,
        'genes': [],
        'product': bc.HYPOTHETICAL_PROTEIN,
    })

`revise_cds_gene_symbols(raw_genes)`

Revises a list of gene symbols to ensure they are valid.

Parameters:

Name	Type	Description	Default
`raw_genes`	`Sequence[str]`	The list of gene symbols to revise.	required

Returns:

Name	Type	Description
`list`		The revised list of gene symbols.

Examples:

>>> revise_cds_gene_symbols(raw_genes)

Source code in src/baktfold/bakta/annotation.py

def revise_cds_gene_symbols(raw_genes: Sequence[str]):
    """
    Revises a list of gene symbols to ensure they are valid.

    Each symbol goes through these steps, in order:
      1. symbols matching RE_GENE_SUSPECT_CHARS are dropped,
      2. the literal 'gene' is removed,
      3. a trailing orphan hyphen is removed,
      4. whitespace is collapsed and stripped,
      5. symbols fully matching RE_GENE_CAPITALIZED get a lowercase first char.
    Symbols shorter than 3 characters are discarded. Symbols longer than 12
    characters are passed to extract_protein_gene_symbol() and only kept if a
    symbol could be extracted.

    Args:
      raw_genes (Sequence[str]): The list of gene symbols to revise.

    Returns:
      list: The revised gene symbols without duplicates. The list is built
        from a set, so its order is not defined.

    Examples:
      >>> revise_cds_gene_symbols(raw_genes)
    """
    revised_genes = set()
    for gene in raw_genes:
        if(RE_GENE_SUSPECT_CHARS.search(gene)):  # check for suspect characters -> remove gene symbol
            logger.info('fix gene: remove gene symbol containing suspect chars. old=%s', gene)
            continue

        old_gene = gene
        gene = gene.replace('gene', '')
        if(gene != old_gene):  # remove gene literal
            logger.info('fix gene: remove gene literal. new=%s, old=%s', gene, old_gene)

        old_gene = gene
        # endswith() instead of gene[-1]: the symbol may be empty at this point
        # (e.g. the raw symbol was just 'gene') and indexing would raise IndexError.
        if(gene.endswith('-')):  # remove orphan hyphen
            gene = gene[:-1]
            logger.info('fix gene: remove orphan hypen. new=%s, old=%s', gene, old_gene)

        old_gene = gene
        gene = RE_MULTIWHITESPACE.sub(' ', gene).strip()  # revise whitespaces
        if(gene != old_gene):
            logger.info('fix gene: revise whitespaces. new=%s, old=%s', gene, old_gene)

        old_gene = gene
        if(RE_GENE_CAPITALIZED.fullmatch(gene)):
            gene = gene[0].lower() + gene[1:]
            logger.info('fix gene: lowercase first char. new=%s, old=%s', gene, old_gene)

        if(len(gene) >= 3):
            if(len(gene) <= 12):
                revised_genes.add(gene)
            else:
                # too long for a symbol: try to extract a symbol from the text
                gene = extract_protein_gene_symbol(gene)
                if(gene):
                    revised_genes.add(gene)
    return list(revised_genes)

`revise_cds_product(product)`

Revise product name for INSDC compliant submissions

Source code in src/baktfold/bakta/annotation.py

def revise_cds_product(product: str):
    """
    Revise product name for INSDC compliant submissions.

    A fixed sequence of clean-up steps is applied to the product text; the
    order matters because later steps operate on the output of earlier ones.
    Every step that changes the product writes an info log message.

    Args:
      product (str): The raw product description.

    Returns:
      str: The revised product description. The result is an empty string if
        the product was a pure tmRNA description, and None if the product
        matched one of the suspect-product patterns checked at the end.
    """

    # from gb
    # grep "Uncharacterized protein" AFDBClusters.tsv | wc -l
    #     805448

    if "Uncharacterized protein" in product:
        old_product = product
        product = "hypothetical protein"
        # NOTE(review): the result here is the literal 'hypothetical protein';
        # callers presumably have to treat it as hypothetical - confirm.
        logger.info('fix product: renamed uncharacterized protein as hypothetical. new=%s, old=%s', product, old_product)

    # from bakta

    old_product = product
    product = RE_PROTEIN_WEIGHT.sub(' ', product)  # remove protein weight in (k)Da
    if(product != old_product):
        logger.info('fix product: remove protein weight in (k)Da. new=%s, old=%s', product, old_product)

    old_product = product
    product = re.sub(RE_PROTEIN_PERIOD_SEPARATOR, r'\1-\2', product)  # replace separator periods
    if(product != old_product):
        logger.info('fix product: replace separator periods. new=%s, old=%s', product, old_product)

    old_product = product
    # guard against an empty product: product[0] would raise IndexError
    if(product and product[0] in RE_PROTEIN_SUSPECT_CHARS_BEGINNING):  # remove suspect first character
        product = product[1:]
        logger.info('fix product: replace invalid first character. new=%s, old=%s', product, old_product)

    old_product = product
    product = RE_PROTEIN_SUSPECT_CHARS_DISCARD.sub('', product)  # remove suspect characters
    if(product != old_product):
        logger.info('fix product: replace invalid characters. new=%s, old=%s', product, old_product)

    old_product = product
    product = RE_PROTEIN_SUSPECT_CHARS_REPLACE.sub(' ', product)  # replace suspect characters by single whitespace
    if(product != old_product):
        logger.info('fix product: replace invalid characters. new=%s, old=%s', product, old_product)

    old_product = product
    product = RE_PROTEIN_WRONG_PRIMES.sub('\u0027', product)  # replace wrong prime characters with single quote (U+0027) (') according to https://www.ncbi.nlm.nih.gov/genome/doc/internatprot_nomenguide/
    if(product != old_product):
        logger.info('fix product: replace wrong prime characters. new=%s, old=%s', product, old_product)

    old_product = product
    product = product.replace('FOG:', '')  # remove FOG ids
    if(product != old_product):
        logger.info('fix product: replace FOG ids. new=%s, old=%s', product, old_product)

    old_product = product
    product = RE_PROTEIN_REMNANT.sub('', product)  # remove 'Remnant of's
    if(product != old_product):
        logger.info('fix product: replace remnant ofs. new=%s, old=%s', product, old_product)

    old_product = product
    # replace DUF-containing products by a description built from the DUF ids only
    dufs = [m.group(1).upper() for m in RE_DOMAIN_OF_UNKNOWN_FUNCTION.finditer(product)]
    if(len(dufs) >= 1):
        product = f"{' '.join(dufs)} domain{'s' if len(dufs) > 1 else ''}-containing protein"
        if(product != old_product):
            logger.info('fix product: revise DUF. new=%s, old=%s', product, old_product)

    old_product = product
    if('conserved' in product.lower()):  # replace conserved UPF proteins
        upfs = [m.group(1).upper() for m in RE_UNCHARACTERIZED_PROTEIN_FAMILY.finditer(product)]
        if(len(upfs) >= 1):
            product = f"{' '.join(upfs)} protein"
            if(product != old_product):
                logger.info('fix product: revise UPF. new=%s, old=%s', product, old_product)

    old_product = product
    product = RE_PROTEIN_HOMOLOG.sub('-like protein', product)  # replace Homologs
    if(product != old_product):
        if(product.count('protein') == 2):
            product = product.replace('protein', '', 1)  # remove former protein term if existing
        logger.info('fix product: replace Homolog. new=%s, old=%s', product, old_product)

    old_product = product
    product = RE_MULTIWHITESPACE.sub(' ', product).strip()  # revise whitespaces
    if(product != old_product):
        logger.info('fix product: revise whitespaces. new=%s, old=%s', product, old_product)

    old_product = product
    product = RE_PROTEIN_PUTATIVE.sub('putative', product)  # replace putative synonyms
    if(product != old_product):
        logger.info('fix product: replace putative synonyms. new=%s, old=%s', product, old_product)

    old_product = product
    if(RE_PROTEIN_DOMAIN_CONTAINING.search(product)):  # replace domain name underscores in domain names
        product = product.replace('_', '-')
        if(product != old_product):
            logger.info('fix product: replace domain name underscores. new=%s, old=%s', product, old_product)

    old_product = product
    if(RE_PROTEIN_TMRNA.fullmatch(product)):
        product = ''
        logger.info('fix product: discard pure tmRNA product descriptions. new=%s, old=%s', product, old_product)

    old_product = product
    if(
        RE_PROTEIN_CONTIG.search(product) or  # protein containing 'sequence'
        RE_PROTEIN_NODE.search(product) or  # potential contig name (SPAdes)
        RE_PROTEIN_POTENTIAL_CONTIG_NAME.search(product) or  # potential contig name (SPAdes)
        RE_PROTEIN_NO_LETTERS.fullmatch(product)  # no letters -> set to Hypothetical
        ):  # remove suspect products and mark as hypothetical
        product = None
        logger.info('remove product: mark proteins with suspect products as hypothetical. old=%s', old_product)

    return product

`unmark_as_hypothetical(feature)`

Removes the hypothetical key from the given feature dictionary.

Parameters:

Name	Type	Description	Default
`feature`	`dict`	The feature dictionary to remove the hypothetical key from.	required

Returns:

Type	Description
	None

Examples:

>>> feature = {'sequence': 'ATG', 'start': 1, 'stop': 3, 'strand': '+'}
>>> unmark_as_hypothetical(feature)
>>> feature
{'sequence': 'ATG', 'start': 1, 'stop': 3, 'strand': '+'}

Source code in src/baktfold/bakta/annotation.py

def unmark_as_hypothetical(feature: dict):
    """
    Drop the 'hypothetical' key from a feature dictionary, if present.

    Args:
      feature (dict): The feature dictionary; it is modified in place.

    Returns:
      None

    Examples:
      >>> feature = {'sequence': 'ATG', 'start': 1, 'stop': 3, 'strand': '+', 'hypothetical': True}
      >>> unmark_as_hypothetical(feature)
      >>> feature
      {'sequence': 'ATG', 'start': 1, 'stop': 3, 'strand': '+'}
    """
    # No per-feature log message is emitted here.
    if 'hypothetical' in feature:
        del feature['hypothetical']  # remove the key completely instead of setting it to False

`check_content_size(file_name, file_path)`

Checks that a file is not empty; logs an error and exits the program if it is.

Parameters:

Name	Type	Description	Default
`file_name`	`str`	The name of the file to check.	required
`file_path`	`Path`	The path to the file to check.	required

Returns:

Type	Description
	None.

Examples:

>>> check_content_size('file', Path('path/to/file'))
None

Source code in src/baktfold/bakta/config.py

def check_content_size(file_name: str, file_path: Path):
    """
    Abort the program if the given file has a size of zero bytes.

    Args:
      file_name (str): Human-readable name of the file, used in messages.
      file_path (Path): The path to the file to check.

    Returns:
      None. For an empty file an error is logged and sys.exit() is called.

    Examples:
      >>> check_content_size('file', Path('path/to/file'))
    """
    size_in_bytes = file_path.stat().st_size
    if size_in_bytes != 0:
        return
    log.error('empty %s file! path=%s', file_name, file_path)
    sys.exit(f'ERROR: {file_name} file ({file_path}) is empty!')

`check_db_path(args)`

Checks the path to the database.

Parameters:

Name	Type	Description	Default
`args`	`Namespace`	The arguments passed to the program.	required

Returns:

Name	Type	Description
`Path`	`Path`	The path to the database.

Examples:

>>> check_db_path(args)
Path('path/to/db')

Source code in src/baktfold/bakta/config.py

def check_db_path(args: Namespace) -> Path:
    """
    Determine the database directory and store it in the global ``db_path``.

    Candidates are tried in this order: the ``args.db`` parameter, the
    ``BAKTA_DB`` environment variable, and a ``db`` directory next to this
    module. Only the first available candidate is tested; if it is not a
    directory the program exits with an error message.

    Args:
      args (Namespace): The arguments passed to the program.

    Returns:
      Path: The path to the database.

    Examples:
      >>> check_db_path(args)
      Path('path/to/db')
    """
    global db_path
    environment = os.environ.copy()

    # 1) explicit command line parameter
    if args.db:
        requested = args.db
        log.debug('test parameter db: db_tmp=%s', requested)
        try:
            candidate = Path(requested).resolve()
            if not candidate.is_dir():
                log.error('unvalid database path: type=parameter, path=%s', candidate)
                raise IOError()
            db_path = candidate
            log.info('database: type=parameter, path=%s', db_path)
        except Exception:
            sys.exit(f'ERROR: wrong database path! --db={requested}')
        return db_path

    # 2) environment variable
    if 'BAKTA_DB' in environment:
        requested = environment['BAKTA_DB']
        log.debug('test env db: db_tmp=%s', requested)
        try:
            candidate = Path(requested).resolve()
            if not candidate.is_dir():
                log.error('unvalid database path: type=environment, path=%s', candidate)
                raise IOError()
            db_path = candidate
            log.info('database: type=environment, path=%s', db_path)
        except Exception:
            sys.exit(f'ERROR: wrong database path! BAKTA_DB={requested}')
        return db_path

    # 3) 'db' directory next to this module
    candidate = Path(__file__).parent.joinpath('db')
    log.debug('test base_dir db: db_tmp=%s', candidate)
    if not candidate.is_dir():
        log.error('unvalid database path: type=base-dir, path=%s', candidate)
        sys.exit('ERROR: database neither provided nor auto-detected!\nPlease, download the mandatory db and provide it via either the --db parameter, a BAKTA_DB environment variable or copy it into the Bakta base directory.\nFor further information please read the readme.md')
    db_path = candidate
    log.info('database: type=base-dir, path=%s', db_path)
    return db_path

`check_output_path(output, force_override)`

Check provided output path

Parameters:

Name	Type	Description	Default
`output`	`str`	The output directory destination path	required
`force_override`	`bool`	Whether to override existing output directories	required

Source code in src/baktfold/bakta/config.py

def check_output_path(output: str, force_override: bool) -> Path:
    """Check provided output path

    A missing directory is created (including parents). An existing path is
    accepted if it is the current working directory, or if overriding is
    forced and the path is accessible and writable. In every other case the
    program exits with an error message. The result is also stored in the
    module-level ``output_path``.

    Args:
        output (str): The output directory destination path
        force_override (bool): Whether to override existing output directories

    Returns:
        Path: The resolved output path.
    """
    global output_path
    output_path = Path(output)
    if(not output_path.exists()):
        try:
            output_path.mkdir(parents=True, exist_ok=True)
        except Exception:
            sys.exit(f'ERROR: could not resolve or create output directory ({output})!')
    else:
        # Compare resolved paths: a relative spelling of the working directory
        # such as '.' must be recognised as the current directory as well.
        if(output_path.resolve() == Path(os.getcwd()).resolve()):
            pass
        elif(force_override is False):
            sys.exit(f'ERROR: output path ({output_path}) already exists! Either provide a non-existent new path or force overwriting it via \'--force\'')
        elif(not os.access(str(output_path), os.X_OK)):
            sys.exit(f'ERROR: output path ({output_path}) not accessible!')
        elif(not os.access(str(output_path), os.W_OK)):
            sys.exit(f'ERROR: output path ({output_path}) not writable!')
    output_path = output_path.resolve()
    return output_path

`check_readability(file_name, file_Path)`

Checks that a file is readable; logs an error and exits the program if it is not.

Parameters:

Name	Type	Description	Default
`file_name`	`str`	The name of the file to check.	required
`file_Path`	`Path`	The path to the file to check.	required

Returns:

Type	Description
	None.

Examples:

>>> check_readability('file', Path('path/to/file'))
None

Source code in src/baktfold/bakta/config.py

def check_readability(file_name: str, file_Path: Path):
    """
    Abort the program if the given file cannot be read.

    Args:
      file_name (str): Human-readable name of the file, used in messages.
      file_Path (Path): The path to the file to check.

    Returns:
      None. For an unreadable file an error is logged and sys.exit() is called.

    Examples:
      >>> check_readability('file', Path('path/to/file'))
    """
    is_readable = os.access(str(file_Path), os.R_OK)
    if is_readable:
        return
    log.error('%s file not readable! path=%s', file_name, file_Path)
    sys.exit(f'ERROR: {file_name} file ({file_Path}) not readable!')

`check_threads(args)`

Checks the number of threads to use.

Parameters:

Name	Type	Description	Default
`args`	`Namespace`	The arguments passed to the program.	required

Returns:

Name	Type	Description
`int`	`int`	The number of threads to use.

Examples:

>>> check_threads(args)
4

Source code in src/baktfold/bakta/config.py

def check_threads(args: Namespace) -> int:
    """
    Validate the requested thread count and store it in the global ``threads``.

    The upper limit is the size of the scheduler affinity set of this process
    or, where that is not available, the CPU count. A request of 0 selects the
    upper limit. Negative requests and requests above the limit terminate the
    program.

    Args:
      args (Namespace): The arguments passed to the program.

    Returns:
      int: The number of threads to use.

    Examples:
      >>> check_threads(args)
      4
    """
    global threads
    threads = args.threads

    try:
        available = len(os.sched_getaffinity(0))
    except AttributeError:  # os.sched_getaffinity does not exist on this platform
        available = mp.cpu_count()
        log.debug(f"scheduler affinity not availabe! max-threads={available}")
    else:
        log.debug(f"max-threads={available}")

    if threads == 0:
        threads = available
        log.debug("request max threads.")
    else:
        if threads < 0:
            log.error("wrong argument for 'threads' parameter! threads=%i", threads)
            sys.exit(f"ERROR: wrong argument ({threads}) for 'threads' parameter! Value must be larger than/equal to 0.")
        if threads > available:
            log.error("wrong argument for 'threads' parameter! More threads requested than available: requested=%i, available=%i", threads, available)
            sys.exit(f"ERROR: wrong argument ({threads}) for 'threads' parameter! More threads requested ({threads}) than available ({available}).")

    log.info('threads=%i', threads)
    return threads

`check_tmp_path(args)`

Checks the path to the temporary directory.

Parameters:

Name	Type	Description	Default
`args`	`Namespace`	The arguments passed to the program.	required

Returns:

Name	Type	Description
`Path`	`Path`	The path to the temporary directory.

Examples:

>>> check_tmp_path(args)
Path('path/to/tmp_dir')

Source code in src/baktfold/bakta/config.py

def check_tmp_path(args: Namespace) -> Path:
    """
    Create a fresh temporary directory and store it in the global ``tmp_path``.

    Without ``args.tmp_dir`` the directory is created in the default location
    chosen by tempfile.mkdtemp(). With ``args.tmp_dir`` it is created inside
    that directory, which has to exist already; otherwise the program exits.

    Args:
      args (Namespace): The arguments passed to the program.

    Returns:
      Path: The resolved path to the new temporary directory.

    Examples:
      >>> check_tmp_path(args)
      Path('path/to/tmp_dir')
    """
    global tmp_path
    requested_dir = args.tmp_dir
    if requested_dir is None:
        tmp_path = Path(tempfile.mkdtemp()).resolve()
    else:
        tmp_path = Path(requested_dir)
        if tmp_path.exists():
            log.info('use dedicated temp dir: path=%s', tmp_path)
            # create a unique working directory inside the dedicated one
            tmp_path = Path(tempfile.mkdtemp(dir=str(tmp_path))).resolve()
        else:
            log.debug('dedicated temp dir does not exist! tmp-dir=%s', tmp_path)
            sys.exit(f'ERROR: dedicated temporary directory ({tmp_path}) does not exist!')
    log.info('tmp-path=%s', tmp_path)
    return tmp_path

`check_user_proteins(args)`

Checks the path to the user proteins file.

Parameters:

Name	Type	Description	Default
`args`	`Namespace`	The arguments passed to the program.	required

Returns:

Name	Type	Description
`Path`		The path to the user proteins file.

Examples:

>>> check_user_proteins(args)
Path('path/to/user_proteins')

Source code in src/baktfold/bakta/config.py

def check_user_proteins(args: Namespace):
    """
    Validate the optional user proteins file and store it in the global
    ``user_proteins``.

    Args:
      args (Namespace): The arguments passed to the program.

    Returns:
      Path: The resolved path to the user proteins file, or None if
        ``args.proteins`` is None. The program exits if the given path is
        empty, not readable or points to an empty file.

    Examples:
      >>> check_user_proteins(args)
      Path('path/to/user_proteins')
    """
    global user_proteins
    user_proteins = args.proteins
    if user_proteins is None:  # nothing provided -> nothing to check
        return None

    try:
        if user_proteins == '':
            raise ValueError('File path argument must be non-empty')
        resolved_path = Path(args.proteins).resolve()
        check_readability('user proteins', resolved_path)
        check_content_size('user proteins', resolved_path)
        user_proteins = resolved_path
        log.info('user-proteins=%s', user_proteins)
        return user_proteins
    except Exception:
        log.error('provided user proteins file not valid! path=%s', user_proteins)
        sys.exit(f'ERROR: user proteins file ({user_proteins}) not valid!')

`setup(args)`

Test environment and build a runtime configuration.

Source code in src/baktfold/bakta/config.py

def setup(args):
    """
    Test environment and build a runtime configuration.

    Reads the parsed command line arguments, validates them and stores the
    results in module-level configuration variables (declared via the
    ``global`` statements below). Invalid values are logged and terminate the
    program through sys.exit().

    Args:
      args: The parsed command line arguments.

    Returns:
      None
    """
    # runtime configurations
    global env, threads, verbose, debug
    env['BLAST_USAGE_REPORT'] = 'false'  # prevent BLAST from contacting NCBI

    threads = check_threads(args)
    verbose = args.verbose
    log.info('verbose=%s', verbose)
    debug = args.debug
    log.info('debug=%s', debug)
    if(debug):
        verbose = True  # debug mode implies verbose output

    # input / output path configurations
    global db_path, db_info, tmp_path, genome_path, min_sequence_length, prefix, output_path, force
    db_path = check_db_path(args)
    tmp_path = check_tmp_path(args)

    try:
        if(args.genome == ''):
            raise ValueError('File path argument must be non-empty')
        genome_path = Path(args.genome).resolve()
        check_readability('genome', genome_path)
        check_content_size('genome', genome_path)
    except Exception:
        log.error('provided genome file not valid! path=%s', args.genome)
        sys.exit(f'ERROR: genome file ({args.genome}) not valid!')
    log.info('genome-path=%s', genome_path)

    # input / output configurations
    min_sequence_length = args.min_contig_length
    if(min_sequence_length <= 0):
        log.error("wrong argument for 'min-contig-length' parameter! min_contig_length=%s", min_sequence_length)
        sys.exit(f"ERROR: wrong argument ({min_sequence_length}) for 'min-contig-length' parameter! Value must be larger than 0")
    log.info('min_contig_length=%s', min_sequence_length)
    log.info('prefix=%s', prefix)  # set in main.py before global logger config
    log.info('output-path=%s', output_path)
    force = args.force
    log.info('force=%s', force)

    # organism configurations
    global genus, species, strain, plasmid, taxon
    genus = args.genus
    if(genus is not None):
        genus = genus.strip()
        if(genus == ''):
            log.error("Empty 'genus' parameter! genus=%s", genus)
            sys.exit("ERROR: empty 'genus' parameter!")
        else:
            genus = genus.capitalize()
    log.info('genus=%s', genus)
    species = args.species
    if(species is not None):
        species = species.strip()
        if(species == ''):
            log.error("Empty 'species' parameter! species=%s", species)
            sys.exit("ERROR: empty 'species' parameter!")
        else:
            species = species.lower()
    log.info('species=%s', species)
    strain = args.strain
    if(strain is not None):
        strain = strain.strip()
        if(strain == ''):
            log.error("Empty 'strain' parameter! strain=%s", strain)
            sys.exit("ERROR: empty 'strain' parameter!")
    log.info('strain=%s', strain)
    plasmid = args.plasmid
    if(plasmid is not None):
        plasmid = plasmid.strip()
        if(plasmid == ''):
            log.error("Empty 'plasmid' parameter! plasmid=%s", plasmid)
            sys.exit("ERROR: empty 'plasmid' parameter!")
        elif('plasmid' in plasmid.lower()):
            log.error("Wrong 'plasmid' parameter! plasmid=%s", plasmid)
            sys.exit("ERROR: wrong 'plasmid' parameter! The plasmid name mustn't contain the word 'plasmid'.")
        elif(PLASMID_NAME_PATTERN.fullmatch(plasmid) is None and PLASMID_UNNAMED_PATTERN.fullmatch(plasmid) is None):
            log.error("Wrong 'plasmid' name! plasmid=%s", plasmid)
            sys.exit("ERROR: wrong 'plasmid' name! Plasmid names must either be named as 'unnamed', 'unnamed1', ... or start with a lower 'p', contain only digits, dots, underscores and letters, and are limited to 20 characters in total.")
    log.info('plasmid=%s', plasmid)
    # taxon is the space-joined combination of the provided genus/species/strain
    taxon = ' '.join([t for t in [genus, species, strain] if t is not None])
    if(taxon == ''):
        taxon = None

    # annotation configurations
    global complete, prodigal_tf, translation_table, keep_sequence_headers, locus, locus_tag, locus_tag_increment, gram, replicons, compliant, user_proteins, user_hmms, meta, regions
    complete = args.complete
    log.info('complete=%s', complete)
    prodigal_tf = args.prodigal_tf
    if(prodigal_tf is not None):
        try:
            if(prodigal_tf == ''):
                raise ValueError('File path argument must be non-empty')
            prodigal_tf_path = Path(args.prodigal_tf).resolve()
            check_readability('prodigal training', prodigal_tf_path)
            check_content_size('prodigal training', prodigal_tf_path)
            prodigal_tf = prodigal_tf_path
        except Exception:
            log.error('provided prodigal training file not valid! path=%s', prodigal_tf)
            sys.exit(f'ERROR: Prodigal training file ({prodigal_tf}) not valid!')
    log.info('prodigal_tf=%s', prodigal_tf)
    translation_table = args.translation_table
    log.info('translation_table=%s', translation_table)
    gram = args.gram
    log.info('gram=%s', gram)
    compliant = args.compliant
    log.info('compliant=%s', compliant)
    if(compliant):
        min_sequence_length = 200  # compliant mode overrides the requested minimum contig length
        log.info('compliant mode! min_contig_length=%s', min_sequence_length)
    meta = args.meta
    log.info('meta=%s', meta)
    locus = args.locus
    if(locus is not None):
        if(locus == ''):
            log.error("Empty 'locus' parameter! locus=%s", locus)
            sys.exit("ERROR: empty 'locus' parameter!")
        if(' ' in locus):
            log.error("Whitespace character in 'locus' parameter! locus=%s", locus)
            sys.exit(f"ERROR: whitespace character ({locus}) in 'locus' parameter!")
        if(bc.RE_INSDC_ID_PREFIX.fullmatch(locus) is None):
            log.error("Invalid 'locus' parameter! locus=%s", locus)
            sys.exit(f"ERROR: invalid 'locus' parameter ({locus})!\nLocus prefixes must contain between 1 and 20 alphanumeric or '-_' characters.")
    log.info('locus=%s', locus)
    locus_tag = args.locus_tag
    if(locus_tag is not None):
        if(locus_tag == ''):
            log.error("Empty 'locus-tag' parameter! locus-tag=%s", locus_tag)
            sys.exit("ERROR: empty 'locus-tag' parameter!")
        if(' ' in locus_tag):
            log.error("Whitespace character in 'locus-tag' parameter! locus-tag=%s", locus_tag)
            sys.exit(f"ERROR: whitespace character ({locus_tag}) in 'locus-tag' parameter!")
        if(compliant):
            if(bc.RE_INSDC_LOCUSTAG_PREFIX.fullmatch(locus_tag) is None):
                log.error("INSDC-incompliant 'locus-tag' parameter! locus-tag=%s", locus_tag)
                sys.exit(f"ERROR: INSDC-incompliant 'locus-tag' parameter ({locus_tag})!\nINSDC Locus tag prefixes must contain between 3 and 12 alphanumeric uppercase characters and start with a letter.")
        else:
            if(bc.RE_LOCUSTAG_PREFIX.fullmatch(locus_tag) is None):
                log.error("Invalid 'locus-tag' parameter! locus-tag=%s", locus_tag)
                sys.exit(f"ERROR: invalid 'locus-tag' parameter ({locus_tag})!\nLocus tag prefixes must contain between 1 and 24 alphanumeric characters or '_.-' signs.")
    log.info('locus-tag=%s', locus_tag)
    locus_tag_increment = args.locus_tag_increment
    log.info('locus-tag-increment=%s', locus_tag_increment)
    keep_sequence_headers = args.keep_contig_headers
    log.info('keep_contig_headers=%s', keep_sequence_headers)
    replicons = args.replicons
    if(replicons is not None):
        try:
            if(replicons == ''):
                raise ValueError('File path argument must be non-empty')
            replicon_table_path = Path(args.replicons).resolve()
            check_readability('replicon table', replicon_table_path)
            check_content_size('replicon table', replicon_table_path)
            replicons = replicon_table_path
        except Exception:
            log.error('provided replicon file not valid! path=%s', replicons)
            sys.exit(f'ERROR: replicon table file ({replicons}) not valid!')
    log.info('replicon-table=%s', replicons)
    user_proteins = check_user_proteins(args)
    user_hmms = args.hmms
    if(user_hmms is not None):
        try:
            if(user_hmms == ''):
                raise ValueError('File path argument must be non-empty')
            user_hmms_path = Path(user_hmms).resolve()
            check_readability('HMM', user_hmms_path)
            check_content_size('HMM', user_hmms_path)
            user_hmms = user_hmms_path
        except Exception:
            log.error('provided HMM file not valid! path=%s', user_hmms)
            sys.exit(f'ERROR: HMM file ({user_hmms}) not valid!')

    regions = args.regions
    if(regions is not None):
        try:
            if(regions == ''):
                raise ValueError('File path argument must be non-empty')
            regions_path = Path(args.regions).resolve()
            check_readability('regions', regions_path)
            check_content_size('regions', regions_path)
            regions = regions_path
        except Exception:
            log.error('provided regions file not valid! path=%s', regions)
            sys.exit(f'ERROR: regions file ({regions}) not valid!')
    log.info('regions=%s', regions)


    # workflow configurations
    global skip_trna, skip_tmrna, skip_rrna, skip_ncrna, skip_ncrna_region, skip_crispr, skip_cds, skip_pseudo, skip_sorf, skip_gap, skip_ori, skip_filter, skip_plot
    skip_trna = args.skip_trna
    log.info('skip-tRNA=%s', skip_trna)
    skip_tmrna = args.skip_tmrna
    log.info('skip-tmRNA=%s', skip_tmrna)
    skip_rrna = args.skip_rrna
    log.info('skip-rRNA=%s', skip_rrna)
    skip_ncrna = args.skip_ncrna
    log.info('skip-ncRNA=%s', skip_ncrna)
    skip_ncrna_region = args.skip_ncrna_region
    log.info('skip-ncRNA-region=%s', skip_ncrna_region)
    skip_crispr = args.skip_crispr
    log.info('skip-CRISPR=%s', skip_crispr)
    skip_cds = args.skip_cds
    log.info('skip-CDS=%s', skip_cds)
    skip_pseudo = args.skip_pseudo
    log.info('skip-pseudo=%s', skip_pseudo)
    skip_sorf = args.skip_sorf
    log.info('skip-sORF=%s', skip_sorf)
    skip_gap = args.skip_gap
    log.info('skip-gap=%s', skip_gap)
    skip_ori = args.skip_ori
    log.info('skip-ori=%s', skip_ori)
    skip_filter = args.skip_filter
    log.info('skip-filter=%s', skip_filter)
    skip_plot = args.skip_plot
    log.info('skip-plot=%s', skip_plot)

`fetch_db_pscc_result(conn, uniref50_id)`

Fetches the PSCC result for a given uniref50_id from a sqlite3 database.

Parameters:

Name	Type	Description	Default
`conn`	`sqlite3.Connection`	The connection to the sqlite3 database.	required
`uniref50_id`	`str`	The uniref50_id to fetch the PSCC result for.	required

Returns:

Name	Type	Description
`tuple`		The PSCC result for the given uniref50_id.

Source code in src/baktfold/bakta/pstc.py

def fetch_db_pscc_result(conn: sqlite3.Connection, uniref50_id: str):
    """
    Fetches the PSCC result for a given uniref50_id from a sqlite3 database.

    Args:
      conn (sqlite3.Connection): The connection to the sqlite3 database.
      uniref50_id (str): The uniref50_id to fetch the PSCC result for.

    Returns:
      tuple: The first row of the ``pscc`` table matching the given
        uniref50_id, or None if no row matches.

    Raises:
      sqlite3.Error: Propagated unchanged if the query fails.
    """
    cursor = conn.cursor()
    try:
        # The id is bound as a parameter, never interpolated into the SQL text.
        cursor.execute('select * from pscc where uniref50_id=?', (uniref50_id,))
        return cursor.fetchone()
    finally:
        # Release the cursor even when execute()/fetchone() raises.
        cursor.close()

`fetch_sql_description(conn, source, accession)`

Fetches the product description for a given source and accession from a sqlite3 database.

Parameters:

Name	Type	Description	Default
`conn`	`sqlite3.Connection`	The connection to the sqlite3 database.	required
`source`	`str`	The source of the accession.	required
`accession`	`str`	The accession to fetch the description for.	required

Returns:

Name	Type	Description
`str`		The product description for the given source and accession.

Source code in src/baktfold/bakta/pstc.py

def fetch_sql_description(conn: sqlite3.Connection, source: str, accession: str):
    """
    Fetches the product description for a given source and accession from a sqlite3 database.

    Args:
      conn (sqlite3.Connection): The connection to the sqlite3 database.
      source (str): The source of the accession. Recognised values are
        'swissprot', 'afdb', 'pdb' and 'cath'.
      accession (str): The accession to fetch the description for.

    Returns:
      str: The ``product`` value of the first row whose ``id`` equals the
        accession, or None if the source is not recognised or no row matches.
    """
    # Maps the PSTC source name to the table holding its product descriptions.
    table_map = {
        'swissprot': 'swissprot',
        'afdb': 'afdbclusters',
        'pdb': 'pdb',
        'cath': 'cath',
    }

    table = table_map.get(source)
    if table is None:
        return None

    # The table name can only be one of the literals in table_map, so
    # interpolating it is safe; the accession itself is always bound as a
    # parameter. CATH (several greedy top hits for multidomain proteins) needs
    # no separate query: every accession is resolved by the same point lookup.
    row = conn.execute(
        f"SELECT product FROM {table} WHERE id = ?", (accession,)
    ).fetchone()
    return row[0] if row else None

`lookup_custom(features, baktfold_db, custom_annotations)`

Lookup PSTC information from custom db

Source code in src/baktfold/bakta/pstc.py

def lookup_custom(features: Sequence[dict], baktfold_db: Path, custom_annotations: Path):
    """Lookup PSTC information from custom db.

    Every PSTC entry whose ``source`` is ``'custom_db'`` gets a
    ``description``: the annotation found for its ``id`` in the
    ``custom_annotations`` file, or the ``id`` itself when no annotation is
    available. Entries from other sources are left untouched.

    Args:
      features (Sequence[dict]): Features whose optional ``'pstc'`` value is
        either a single entry dict or a list of entry dicts.
      baktfold_db (Path): Not used by this function.
      custom_annotations (Path): Tab-separated file with the accession in the
        first column and its description in the second. May be None/empty, in
        which case every custom hit is described by its accession.

    Returns:
      Sequence[dict]: The same ``features`` object, updated in place.
    """
    # Always defined, so custom_db hits still resolve (to their accession)
    # when no annotation file was supplied.
    custom_dict = {}
    if custom_annotations:
        with open(f"{custom_annotations}", "r") as f:
            for row in csv.reader(f, delimiter="\t"):
                # Rows with fewer than two columns carry no description.
                if len(row) >= 2:
                    custom_dict[row[0]] = row[1]

    for feat in features:
        pstc = feat.get('pstc')
        if not pstc:
            continue

        # Normalize to list for consistent handling; the entries are mutated
        # in place, so nothing has to be written back to the feature.
        pstc_entries = pstc if isinstance(pstc, list) else [pstc]

        for entry in pstc_entries:
            if entry.get('source') != 'custom_db':
                continue
            accession = entry.get('id')
            # Fall back to the accession if no annotation is given for it.
            entry['description'] = custom_dict.get(accession, accession)

    return features

`lookup_sql(features, baktfold_db, threads)`

Resolve PSTC accessions to product descriptions from the SQLite DB.

One read-only connection is opened for the whole feature set and reused for every accession. SQLite point lookups on the indexed id column are microsecond-scale, so a single serial pass is dramatically faster than the previous design, which opened (and tore down) a brand-new connection per accession inside a ThreadPoolExecutor — thousands of connection opens for a bacterial genome, with no real parallelism since each feature's futures were collected before the next feature was submitted and most features carry a single PSTC entry. Benchmarked at ~16x faster for a 5k-CDS genome (810 ms -> 51 ms).

Each conn.execute returns its own short-lived cursor consumed immediately, so sequential CATH multi-domain lookups can't collide.

threads is accepted for signature compatibility but unused: the bottleneck was connection setup, not query execution.

Source code in src/baktfold/bakta/pstc.py

def lookup_sql(features: Sequence[dict], baktfold_db: Path, threads: int):
    """Fill in product descriptions for PSTC hits using the SQLite database.

    A single read-only connection to ``baktfold.db`` is opened, shared by all
    lookups and closed at the end. The earlier implementation created a new
    connection for every accession inside a ThreadPoolExecutor, which cost
    thousands of connection opens per bacterial genome without delivering
    real parallelism; the serial pass was benchmarked at ~16x faster for a
    5k-CDS genome (810 ms -> 51 ms).

    Every lookup consumes its own short-lived cursor immediately, so the
    consecutive lookups of a CATH multi-domain protein cannot interfere.

    Args:
      features (Sequence[dict]): Features whose optional ``"pstc"`` value is
        one entry dict or a list of entry dicts.
      baktfold_db (Path): Directory containing ``baktfold.db``.
      threads (int): Unused; kept so existing callers keep working.

    Returns:
      Sequence[dict]: The same ``features`` object, updated in place.
    """
    logger.info("Looking up PSTC descriptions")

    database_file = baktfold_db.joinpath("baktfold.db")
    connection = sqlite3.connect(f"file:{database_file}?mode=ro", uri=True)
    try:
        for feature in features:
            hits = feature.get("pstc")
            if not hits:
                continue

            # Treat a lone entry like a one-element list while processing.
            was_list = isinstance(hits, list)
            hit_list = hits if was_list else [hits]

            for hit in hit_list:
                hit_id = hit.get("id")
                hit_source = hit.get("source")
                description = fetch_sql_description(connection, hit_source, hit_id)
                if not description:
                    # custom_db hits keep their accession; everything else
                    # without a description is a hypothetical protein.
                    description = hit_id if hit_source == "custom_db" else "hypothetical protein"
                hit["description"] = description

            # Store the value back in its original shape (list or single entry).
            feature["pstc"] = hit_list if was_list else hit_list[0]
    finally:
        connection.close()

    return features

`parse(features, foldseek_df, db_name='swissprot', has_duplicate_locus=False)`

Update CDS in place with PSTC hits from foldseek_df if they pass filters.

has_duplicate_locus - set this when several CDS can share one locus tag (as in some eukaryotes), so that hits are matched by CDS id instead of locus tag

Source code in src/baktfold/bakta/pstc.py

def parse(features: Sequence[dict], foldseek_df: pl.DataFrame, db_name: str = 'swissprot', has_duplicate_locus: bool = False) -> Sequence[dict]:
    """Update CDS in place with PSTC hits from foldseek_df if they pass filters.

    Args:
      features (Sequence[dict]): CDS feature dicts. Each is matched to its
        hits through its 'locus' value, or its 'id' value when
        ``has_duplicate_locus`` is set.
      foldseek_df (pl.DataFrame): Foldseek hits. The columns 'query',
        'target', 'qCov', 'tCov', 'fident', 'evalue' and 'bitscore' are read;
        'alntmscore' and 'lddt' are used when present and not null.
      db_name (str): Name of the searched database. Stored as the 'source' of
        every hit and used to decide how the accession is extracted from the
        target id.
      has_duplicate_locus (bool): Some eukaryotes have multiple CDS per locus
        tag; when True the CDS 'id' is used as the lookup key instead.

    Returns:
      Sequence[dict]: The same ``features`` object that was passed in. Every
        CDS with at least one passing hit has those hits appended to its
        'pstc' list.
    """

    if foldseek_df.is_empty():
        return features

    # each query maps to a list of hit rows (to handle multiple CATH greedy
    # tophits for multidomain proteins). Single pass over the rows as dicts.
    foldseek_hits = defaultdict(list)
    for row in foldseek_df.iter_rows(named=True):
        foldseek_hits[row['query']].append(row)

    # The lookup key is the same for every CDS, so decide it once.
    identifier_key = 'id' if has_duplicate_locus else 'locus'
    updated_count = 0

    for cds in features:
        aa_identifier = cds.get(identifier_key)
        if aa_identifier not in foldseek_hits:
            continue  # no hits, skip

        cds_updated = False

        # Iterate over *all* hits for this query
        for row in foldseek_hits[aa_identifier]:
            query_cov = float(row['qCov'])
            subject_cov = float(row['tCov'])
            identity = float(row['fident'])
            evalue = float(row['evalue'])
            bitscore = float(row['bitscore'])
            target_id = row['target']

            # Extract accession depending on database
            if db_name in {"swissprot", "afdb"}:
                accession = target_id.split('-')[1]
            elif db_name == "pdb":
                accession = target_id.split('-')[0]
            else:  # cath and custom
                accession = target_id

            # Only hits reaching all three minimum thresholds are recorded.
            if (
                query_cov < bc.MIN_PSTC_QCOVERAGE
                or subject_cov < bc.MIN_PSTC_TCOVERAGE
                or identity < bc.MIN_PSTC_IDENTITY
            ):
                continue

            new_pstc = {
                'source': db_name,
                'id': accession,
                'query_cov': query_cov,
                'subject_cov': subject_cov,
                'identity': identity,
                'score': bitscore,
                'evalue': evalue,
            }
            # structure-based Foldseek also reports TM-score and LDDT
            # (get_tophit adds these columns only when structures=True)
            if 'alntmscore' in row and row['alntmscore'] is not None:
                new_pstc['tmscore'] = float(row['alntmscore'])
            if 'lddt' in row and row['lddt'] is not None:
                new_pstc['lddt'] = float(row['lddt'])

            # Append or initialize 'pstc'; the result is always a list,
            # since we may have many hits.
            existing = cds.get('pstc')
            if isinstance(existing, list):
                existing.append(new_pstc)
            elif isinstance(existing, dict):
                cds['pstc'] = [existing, new_pstc]
            else:
                cds['pstc'] = [new_pstc]

            cds_updated = True

        # Increment only once per CDS that had at least one valid hit (CATH might have multiple)
        if cds_updated:
            updated_count += 1

    logger.info(f"PSTC for {db_name} updated in place for {updated_count} CDSs")
    return features

Originally taken from Michael Hall's tbpore https://github.com/mbhall88/tbpore/blob/main/tbpore/external_tools.py

Also used by a variety of other tools (Dnaapler, Plassembler, Pharokka)

`ExternalTool`

Class for running external tools.

Parameters:

Name	Type	Description	Default
`tool`	`str`	The path to the tool to run.	required
`input`	`str`	The input file.	required
`output`	`str`	The output file.	required
`params`	`str`	The parameters to pass to the tool.	required
`logdir`	`Path`	The directory to store log files.	required

Attributes:

Name	Type	Description
`command`	`List[str]`	The command to run.
`out_log`	`str`	The path to the stdout log file.
`err_log`	`str`	The path to the stderr log file.

Examples:

>>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
>>> tool.command
["tool", "params", "output", "input"]
>>> tool.out_log
"logdir/tool_1234567890abcdef1234567890abcdef.out"
>>> tool.err_log
"logdir/tool_1234567890abcdef1234567890abcdef.err"

Source code in src/baktfold/utils/external_tools.py

class ExternalTool:
    """
    Class for running external tools.

    Args:
      tool (str): The path to the tool to run.
      input (str): The input file.
      output (str): The output file.
      params (str): The parameters to pass to the tool.
      logdir (Path): The directory to store log files.
      env (Optional[Dict[str, str]]): Extra env vars merged with
        os.environ for the subprocess. None == inherit unchanged.

    Attributes:
      command (List[str]): The command to run.
      out_log (str): The path to the stdout log file.
      err_log (str): The path to the stderr log file.
      env (Optional[Dict[str, str]]): Extra subprocess env vars.

    Examples:
      >>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
      >>> tool.command
      ['tool', 'params', 'output', 'input']
      >>> tool.out_log  # the suffix is the SHA-256 hex digest of the command line
      'logdir/tool_<sha256 hex digest>.out'
      >>> tool.err_log
      'logdir/tool_<sha256 hex digest>.err'
    """
    def __init__(
        self,
        tool: str,
        input: str,
        output: str,
        params: str,
        logdir: Path,
        env: Optional[Dict[str, str]] = None,
    ):
        """
        Initializes an ExternalTool object.

        Builds the command, creates ``logdir`` if needed and derives the log
        file names from the tool name and the SHA-256 digest of the command
        line.

        Args:
          tool (str): The path to the tool to run.
          input (str): The input file.
          output (str): The output file.
          params (str): The parameters to pass to the tool.
          logdir (Path): The directory to store log files.
          env (Optional[Dict[str, str]]): Extra env vars merged with
            os.environ for the subprocess (e.g. CUDA_VISIBLE_DEVICES
            for multi-GPU foldseek). None == inherit unchanged.

        Attributes:
          command (List[str]): The command to run.
          out_log (str): The path to the stdout log file.
          err_log (str): The path to the stderr log file.
          env (Optional[Dict[str, str]]): Extra subprocess env vars.

        Examples:
          >>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
          >>> tool.command
          ['tool', 'params', 'output', 'input']
        """
        logdir = Path(logdir)
        self.command: List[str] = self._build_command(tool, input, output, params)
        logdir.mkdir(parents=True, exist_ok=True)
        command_hash = hashlib.sha256(self.command_as_str.encode("utf-8")).hexdigest()
        tool_name = Path(tool).name
        logfile_prefix: Path = logdir / f"{tool_name}_{command_hash}"
        self.out_log = f"{logfile_prefix}.out"
        self.err_log = f"{logfile_prefix}.err"
        self.env = env

    @property
    def command_as_str(self) -> str:
        """
        Returns the command as a string.

        Returns:
          str: The command as a single shell-quoted string.

        Examples:
          >>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
          >>> tool.command_as_str
          'tool params output input'
        """
        return shlex.join(self.command)

    @staticmethod
    def _build_command(tool: str, input: str, output: str, params: str) -> List[str]:
        """
        Builds the command to run.

        Args:
          tool (str): The path to the tool to run.
          input (str): The input file.
          output (str): The output file.
          params (str): The parameters to pass to the tool.

        Returns:
          List[str]: The command to run, in the order tool, params, output,
            input, split into individual arguments.

        Examples:
          >>> ExternalTool._build_command("tool", "input", "output", "params")
          ['tool', 'params', 'output', 'input']
        """
        # note: shlex.join does not allow us to shlex.split() later
        # this is explicitly a " ".join()
        command = " ".join([tool, params, output, input])
        escaped_command = shlex.split(command)
        return escaped_command

    def run(self) -> None:
        """
        Runs the tool, writing its stdout and stderr to the log files.

        Raises:
          subprocess.CalledProcessError: If the tool exits with a non-zero
            return code.

        Examples:
          >>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
          >>> tool.run()
        """
        with open(self.out_log, "w") as stdout_fh, open(self.err_log, "w") as stderr_fh:
            print(f"Command line: {self.command_as_str}", file=stderr_fh)
            if self.env:
                print(f"Extra env: {self.env}", file=stderr_fh)
            logger.info(f"Started running {self.command_as_str} ...")
            self._run_core(
                self.command,
                stdout_fh=stdout_fh,
                stderr_fh=stderr_fh,
                env=self.env,
            )
            logger.info(f"Done running {self.command_as_str}")

    def run_stream(self) -> None:
        """
        Runs the tool and streams the output to the terminal.

        Used for downloads (aria2c) so the user knows how long it is taking.
        The child's stdout and stderr are merged, echoed line by line and
        also written to the stdout log; the stderr log only receives the
        command line and the extra env vars.

        Raises:
          subprocess.CalledProcessError: If the tool exits with a non-zero
            return code.

        Examples:
          >>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
          >>> tool.run_stream()
        """
        # Same merge rule as _run_core, so ``env`` is honoured by both run()
        # and run_stream().
        merged_env = {**os.environ, **self.env} if self.env else None

        with open(self.out_log, "w") as stdout_fh, open(self.err_log, "w") as stderr_fh:
            print(f"Command line: {self.command_as_str}", file=stderr_fh)
            if self.env:
                print(f"Extra env: {self.env}", file=stderr_fh)
            logger.info(f"Started running {self.command_as_str} ...")

            # ``with subprocess.Popen(...)`` guarantees stdout is closed and
            # ``wait()`` is called on every exit path including exceptions.
            # The inner ``try/except BaseException`` kills the child before
            # ``Popen.__exit__`` calls ``wait()`` — without it an unkilled
            # child can hang the wait and leave a zombie. BaseException (not
            # Exception) is intentional so Ctrl-C / SystemExit also trigger.
            with subprocess.Popen(
                self.command,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                bufsize=1,
                universal_newlines=True,
                env=merged_env,
            ) as process:
                try:
                    for line in process.stdout:
                        print(line, end="")        # Live output to terminal
                        stdout_fh.write(line)      # Also write to stdout log
                except BaseException:
                    process.kill()
                    raise

            # Popen.__exit__ has called wait(); returncode is now set.
            return_code = process.returncode
            logger.info(f"Done running {self.command_as_str}")

            if return_code != 0:
                raise subprocess.CalledProcessError(return_code, self.command)

    @staticmethod
    def _run_core(
        command: List[str],
        stdout_fh,
        stderr_fh,
        env: Optional[Dict[str, str]] = None,
    ) -> None:
        """
        Runs the command and waits for it to finish.

        Args:
          command (List[str]): The command to run.
          stdout_fh: The file handle to write stdout to.
          stderr_fh: The file handle to write stderr to.
          env (Optional[Dict[str, str]]): Extra env vars merged with
            os.environ for the subprocess. None == inherit.

        Raises:
          subprocess.CalledProcessError: If the command exits with a non-zero
            return code.

        Examples:
          >>> ExternalTool._run_core(["tool", "params", "output", "input"], stdout_fh, stderr_fh)
        """
        merged_env = {**os.environ, **env} if env else None
        subprocess.check_call(
            command, stdout=stdout_fh, stderr=stderr_fh, env=merged_env
        )

    @staticmethod
    def _exit_on_failure(
        tool: "ExternalTool",
        error: subprocess.CalledProcessError,
        ctx: Optional[click.Context] = None,
    ) -> None:
        """
        Logs a failed tool call and terminates the program with exit code 1.

        Args:
          tool (ExternalTool): The tool whose call failed.
          error (subprocess.CalledProcessError): The error raised by the call.
          ctx (Optional[click.Context]): If given, the exit goes through
            ``ctx.exit``; otherwise through ``sys.exit``.
        """
        logger.error(
            f"Error calling {tool.command_as_str} (return code {error.returncode})"
        )
        logger.error(f"Please check stdout log file: {tool.out_log}")
        logger.error(f"Please check stderr log file: {tool.err_log}")
        logger.error("Temporary files are preserved for debugging")
        logger.error("Exiting...")

        if ctx:
            ctx.exit(1)
        else:
            sys.exit(1)

    @staticmethod
    def run_tools(
        tools_to_run: Tuple["ExternalTool", ...], ctx: Optional[click.Context] = None
    ) -> None:
        """
        Runs the given tools one after another, stopping at the first failure.

        A failing tool is logged and the program exits with code 1 (through
        ``ctx.exit`` when a click context is given, ``sys.exit`` otherwise).

        Args:
          tools_to_run (Tuple[ExternalTool, ...]): The tools to run, in order.
          ctx (Optional[click.Context]): The click context.

        Examples:
          >>> tool1 = ExternalTool("tool1", "input1", "output1", "params1", "logdir")
          >>> tool2 = ExternalTool("tool2", "input2", "output2", "params2", "logdir")
          >>> ExternalTool.run_tools((tool1, tool2))
          >>> ExternalTool.run_tools((tool1, tool2), ctx)
        """
        for tool in tools_to_run:
            try:
                tool.run()
            except subprocess.CalledProcessError as error:
                ExternalTool._exit_on_failure(tool, error, ctx)

    # Only one tool

    @staticmethod
    def run_tool(tool: "ExternalTool", ctx: Optional[click.Context] = None) -> None:
        """
        Runs the given external tool.

        A failing tool is logged and the program exits with code 1 (through
        ``ctx.exit`` when a click context is given, ``sys.exit`` otherwise).

        Args:
          tool (ExternalTool): The external tool to run.
          ctx (Optional[click.Context]): The click context to use. Defaults to None.

        Returns:
          None.

        Examples:
          >>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
          >>> ExternalTool.run_tool(tool)
        """
        try:
            tool.run()
        except subprocess.CalledProcessError as error:
            ExternalTool._exit_on_failure(tool, error, ctx)

    # Only download - so can print the aria2c output to screen

    @staticmethod
    def run_download(tool: "ExternalTool", ctx: Optional[click.Context] = None) -> None:
        """
        Runs the given external tool and prints the aria2c output to the screen.

        A failing tool is logged and the program exits with code 1 (through
        ``ctx.exit`` when a click context is given, ``sys.exit`` otherwise).

        Args:
          tool (ExternalTool): The external tool to run.
          ctx (Optional[click.Context]): The click context to use. Defaults to None.

        Returns:
          None.

        Examples:
          >>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
          >>> ExternalTool.run_download(tool)
        """
        try:
            tool.run_stream()
        except subprocess.CalledProcessError as error:
            ExternalTool._exit_on_failure(tool, error, ctx)

`command_as_str: str` `property`

Returns the command as a string.

Returns:

Name	Type	Description
`str`	`str`	The command as a string.

Examples:

>>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
>>> tool.command_as_str
"tool params output input"

`__init__(tool, input, output, params, logdir, env=None)`

Initializes an ExternalTool object.

Parameters:

Name	Type	Description	Default
`tool`	`str`	The path to the tool to run.	required
`input`	`str`	The input file.	required
`output`	`str`	The output file.	required
`params`	`str`	The parameters to pass to the tool.	required
`logdir`	`Path`	The directory to store log files.	required
`env`	`Optional[Dict[str, str]]`	Extra env vars merged with os.environ for the subprocess (e.g. CUDA_VISIBLE_DEVICES for multi-GPU foldseek). None == inherit unchanged.	`None`

Attributes:

Name	Type	Description
`command`	`List[str]`	The command to run.
`out_log`	`str`	The path to the stdout log file.
`err_log`	`str`	The path to the stderr log file.
`env`	`Optional[Dict[str, str]]`	Extra subprocess env vars.

Examples:

>>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
>>> tool.command
["tool", "params", "output", "input"]
>>> tool.out_log
"logdir/tool_1234567890abcdef1234567890abcdef.out"
>>> tool.err_log
"logdir/tool_1234567890abcdef1234567890abcdef.err"

Source code in src/baktfold/utils/external_tools.py

def __init__(
    self,
    tool: str,
    input: str,
    output: str,
    params: str,
    logdir: Path,
    env: Optional[Dict[str, str]] = None,
):
    """
    Set up the command line and the log file locations for one tool call.

    Args:
      tool (str): The path to the tool to run.
      input (str): The input file.
      output (str): The output file.
      params (str): The parameters to pass to the tool.
      logdir (Path): The directory to store log files; created (including
        parents) if it does not exist yet.
      env (Optional[Dict[str, str]]): Extra env vars merged with
        os.environ for the subprocess (e.g. CUDA_VISIBLE_DEVICES
        for multi-GPU foldseek). None == inherit unchanged.

    Attributes:
      command (List[str]): The command to run.
      out_log (str): The path to the stdout log file, named
        ``<tool name>_<SHA-256 of the command line>.out`` inside ``logdir``.
      err_log (str): The matching ``.err`` path for the stderr log.
      env (Optional[Dict[str, str]]): Extra subprocess env vars.

    Examples:
      >>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
      >>> tool.command
      ["tool", "params", "output", "input"]
    """
    # The command is built first: the log file names depend on it.
    self.command: List[str] = self._build_command(tool, input, output, params)

    log_directory = Path(logdir)
    log_directory.mkdir(parents=True, exist_ok=True)

    # Hashing the command line gives each distinct invocation its own logs.
    digest = hashlib.sha256(self.command_as_str.encode("utf-8")).hexdigest()
    log_stem = str(log_directory / f"{Path(tool).name}_{digest}")
    self.out_log = log_stem + ".out"
    self.err_log = log_stem + ".err"
    self.env = env

`run()`

Runs the tool.

Examples:

>>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
>>> tool.run()

Source code in src/baktfold/utils/external_tools.py

def run(self) -> None:
    """
    Execute the command, sending its stdout and stderr to the log files.

    The command line, and the extra environment variables if any were
    given, are written to the stderr log before the tool is started
    through ``_run_core``.

    Examples:
      >>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
      >>> tool.run()
    """
    with open(self.out_log, "w") as out_handle, open(self.err_log, "w") as err_handle:
        # Header lines that make the stderr log self-describing.
        header = [f"Command line: {self.command_as_str}"]
        if self.env:
            header.append(f"Extra env: {self.env}")
        for entry in header:
            print(entry, file=err_handle)

        logger.info(f"Started running {self.command_as_str} ...")
        self._run_core(self.command, stdout_fh=out_handle, stderr_fh=err_handle, env=self.env)
        logger.info(f"Done running {self.command_as_str}")

`run_download(tool, ctx=None)` `staticmethod`

Runs the given external tool and prints the aria2c output to the screen.

Parameters:

Name	Type	Description	Default
`tool`	`ExternalTool`	The external tool to run.	required
`ctx`	`Optional[click.Context]`	The click context to use. Defaults to None.	`None`

Returns:

Type	Description
`None`	None.

Raises:

Type	Description
`subprocess.CalledProcessError`	If there is an error calling the external tool.

Examples:

>>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
>>> ExternalTool.run_download(tool)
None

Source code in src/baktfold/utils/external_tools.py

@staticmethod
def run_download(tool: "ExternalTool", ctx: Optional[click.Context] = None) -> None:
    """
    Run a download tool while echoing its (aria2c) output to the screen.

    The tool is executed through ``run_stream``. If it exits with a
    non-zero return code the failure is logged and the program terminates
    with exit code 1, through ``ctx.exit`` when a click context is given
    and through ``sys.exit`` otherwise.

    Args:
      tool (ExternalTool): The external tool to run.
      ctx (Optional[click.Context]): The click context to use. Defaults to None.

    Returns:
      None.

    Examples:
      >>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
      >>> ExternalTool.run_download(tool)
    """
    try:
        tool.run_stream()
    except subprocess.CalledProcessError as error:
        report = logger.error
        report(
            f"Error calling {tool.command_as_str} (return code {error.returncode})"
        )
        report(f"Please check stdout log file: {tool.out_log}")
        report(f"Please check stderr log file: {tool.err_log}")
        report("Temporary files are preserved for debugging")
        report("Exiting...")

        # Both exits raise, so the second one only runs when ctx is set.
        if not ctx:
            sys.exit(1)
        ctx.exit(1)

`run_stream()`

Runs the tool and streams the output to the terminal.

Examples:

>>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
>>> tool.run_stream()

Source code in src/baktfold/utils/external_tools.py

def run_stream(self) -> None:
    """
    Runs the tool and streams the output to the terminal.

    The child's stdout and stderr are merged, echoed line by line and also
    written to the stdout log; the stderr log only receives the command line
    and the extra env vars. ``self.env`` is merged with ``os.environ`` for
    the subprocess, exactly as ``run()`` does.

    Raises:
      subprocess.CalledProcessError: If the tool exits with a non-zero
        return code.

    Examples:
      >>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
      >>> tool.run_stream()
    """
    # Same merge rule as _run_core; None lets the child inherit the
    # environment unchanged.
    merged_env = {**os.environ, **self.env} if self.env else None

    with open(self.out_log, "w") as stdout_fh, open(self.err_log, "w") as stderr_fh:
        print(f"Command line: {self.command_as_str}", file=stderr_fh)
        if self.env:
            print(f"Extra env: {self.env}", file=stderr_fh)
        logger.info(f"Started running {self.command_as_str} ...")

        # ``with subprocess.Popen(...)`` guarantees stdout is closed and
        # ``wait()`` is called on every exit path including exceptions.
        # The inner ``try/except BaseException`` kills the child before
        # ``Popen.__exit__`` calls ``wait()`` — without it an unkilled
        # child can hang the wait and leave a zombie. BaseException (not
        # Exception) is intentional so Ctrl-C / SystemExit also trigger.
        with subprocess.Popen(
            self.command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            bufsize=1,
            universal_newlines=True,
            env=merged_env,
        ) as process:
            try:
                for line in process.stdout:
                    print(line, end="")        # Live output to terminal
                    stdout_fh.write(line)      # Also write to stdout log
            except BaseException:
                process.kill()
                raise

        # Popen.__exit__ has called wait(); returncode is now set.
        return_code = process.returncode
        logger.info(f"Done running {self.command_as_str}")

        if return_code != 0:
            raise subprocess.CalledProcessError(return_code, self.command)

`run_tool(tool, ctx=None)` `staticmethod`

Runs the given external tool.

Parameters:

Name	Type	Description	Default
`tool`	`ExternalTool`	The external tool to run.	required
`ctx`	`Optional[click.Context]`	The click context to use. Defaults to None.	`None`

Returns:

Type	Description
`None`	None.

Raises:

Type	Description
`subprocess.CalledProcessError`	If there is an error calling the external tool.

Examples:

>>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
>>> ExternalTool.run_tool(tool)
None

Source code in src/baktfold/utils/external_tools.py

@staticmethod
def run_tool(tool: "ExternalTool", ctx: Optional[click.Context] = None) -> None:
    """
    Run a single external tool and abort the program if it fails.

    The tool is executed through ``run``. If it exits with a non-zero
    return code the failure is logged and the program terminates with exit
    code 1, through ``ctx.exit`` when a click context is given and through
    ``sys.exit`` otherwise.

    Args:
      tool (ExternalTool): The external tool to run.
      ctx (Optional[click.Context]): The click context to use. Defaults to None.

    Returns:
      None.

    Examples:
      >>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
      >>> ExternalTool.run_tool(tool)
    """
    try:
        tool.run()
    except subprocess.CalledProcessError as error:
        report = logger.error
        report(
            f"Error calling {tool.command_as_str} (return code {error.returncode})"
        )
        report(f"Please check stdout log file: {tool.out_log}")
        report(f"Please check stderr log file: {tool.err_log}")
        report("Temporary files are preserved for debugging")
        report("Exiting...")

        # Both exits raise, so the second one only runs when ctx is set.
        if not ctx:
            sys.exit(1)
        ctx.exit(1)

`run_tools(tools_to_run, ctx=None)` `staticmethod`

Runs a list of tools.

Parameters:

Name	Type	Description	Default
`tools_to_run`	`Tuple[ExternalTool]`	The list of tools to run.	required
`ctx`	`Optional[click.Context]`	The click context.	`None`

Examples:

>>> tool1 = ExternalTool("tool1", "input1", "output1", "params1", "logdir")
>>> tool2 = ExternalTool("tool2", "input2", "output2", "params2", "logdir")
>>> ExternalTool.run_tools((tool1, tool2))
>>> ExternalTool.run_tools((tool1, tool2), ctx)

Source code in src/baktfold/utils/external_tools.py

@staticmethod
def run_tools(
    tools_to_run: Tuple["ExternalTool", ...], ctx: Optional[click.Context] = None
) -> None:
    """
    Execute several external tools one after another.

    Each tool's ``run()`` is called in the order given. If one raises
    ``subprocess.CalledProcessError``, the failure is logged and the process
    is asked to exit with status 1.

    Args:
      tools_to_run (Tuple[ExternalTool]): The tools to run, in order.
      ctx (Optional[click.Context]): Click context used to exit on failure.
        When it is omitted (or falsy), ``sys.exit`` is used instead.

    Returns:
      None.
    """
    for current_tool in tools_to_run:
        try:
            current_tool.run()
        except subprocess.CalledProcessError as exc:
            logger.error(
                f"Error calling {current_tool.command_as_str} (return code {exc.returncode})"
            )
            logger.error(f"Please check stdout log file: {current_tool.out_log}")
            logger.error(f"Please check stderr log file: {current_tool.err_log}")
            logger.error("Temporary files are preserved for debugging")
            logger.error("Exiting...")

            # Prefer the click context's exit when one was supplied.
            terminate = ctx.exit if ctx else sys.exit
            terminate(1)

`OrderedCommands`

Bases: click.Group

This class will preserve the order of subcommands, which is useful when printing --help

Source code in src/baktfold/utils/util.py

class OrderedCommands(click.Group):
    """Click group that lists its subcommands in stored order instead of sorting them (handy for --help)."""

    def list_commands(self, ctx: click.Context):
        """
        List the registered subcommand names without re-sorting them.

        Args:
          ctx (click.Context): The click context (not used here).

        Returns:
          list: The keys of ``self.commands`` in their iteration order.
        """
        return [*self.commands]

`list_commands(ctx)`

Returns a list of subcommands in the order they were added.

Parameters:

Name	Type	Description	Default
`ctx`	`click.Context`	The click context.	required

Returns:

Name	Type	Description
`list`		A list of subcommands in the order they were added.

Source code in src/baktfold/utils/util.py

def list_commands(self, ctx: click.Context):
    """
    List the registered subcommand names without re-sorting them.

    Args:
      ctx (click.Context): The click context (not used here).

    Returns:
      list: The keys of ``self.commands`` in their iteration order.
    """
    return [*self.commands]

`atomic_write_path(target)`

Yield a sibling temp path that is renamed over target on success.

On any exception (including KeyboardInterrupt), the temp is removed and target is left exactly as it was before the with-block.

Source code in src/baktfold/utils/util.py

@contextmanager
def atomic_write_path(target: Union[str, Path]) -> Iterator[Path]:
    """Yield a sibling temp path that is renamed over ``target`` on success.

    The temp file is created (empty) in ``target``'s directory, which is
    created first if it does not exist.  When the with-block finishes
    normally the temp file is moved onto ``target`` with ``os.replace``.

    On any exception (including KeyboardInterrupt) -- whether raised inside
    the with-block or by the final rename itself -- the temp file is removed
    and the exception is re-raised, so no stray temp file is left behind.

    Args:
        target (Union[str, Path]): Final destination path.

    Yields:
        Path: Path of the temporary file the caller should write to.
    """
    target = Path(target)
    target.parent.mkdir(parents=True, exist_ok=True)
    fd, tmp_name = tempfile.mkstemp(
        prefix=f".{target.name}.",
        suffix=".tmp",
        dir=str(target.parent),
    )
    # Only the reserved name is needed; callers open the path themselves.
    os.close(fd)
    tmp_path = Path(tmp_name)
    try:
        yield tmp_path
        # Kept inside the try so a failed rename also triggers the cleanup.
        os.replace(tmp_path, target)
    except BaseException:
        try:
            tmp_path.unlink()
        except FileNotFoundError:
            # The caller (or a successful rename) already removed the temp file.
            pass
        raise

`baktfold_base(rel_path)`

Returns the absolute path to the given relative path.

Parameters:

Name	Type	Description	Default
`rel_path`	`str`	The relative path to the file.	required

Returns:

Name	Type	Description
`str`		The absolute path to the file.

Source code in src/baktfold/utils/util.py

def baktfold_base(rel_path):
    """
    Resolve a path relative to the directory that contains this module.

    Args:
      rel_path (str): Path relative to this module's directory.

    Returns:
      str: ``rel_path`` joined onto the real (symlink-resolved) directory of
      this module file.
    """
    module_dir = os.path.dirname(os.path.realpath(__file__))
    return os.path.join(module_dir, rel_path)

`begin_baktfold(params, subcommand, no_log=False)`

Begin baktfold process.

Parameters:

Name	Type	Description	Default
`params`	`Dict[str, Any]`	A dictionary of parameters for baktfold.	required
`subcommand`	`str`	Subcommand indicating the baktfold operation.	required
`no_log`	`bool`	No log file	`False`

Returns:

Name	Type	Description
`float`	`float`	Start time of the baktfold process (value of `time.time()`).

Source code in src/baktfold/utils/util.py

def begin_baktfold(params: Dict[str, Any], subcommand: str, no_log: bool = False) -> float:
    """
    Begin baktfold process.

    Removes logger sinks left over from a previous call, records the start
    time, installs fresh sinks, prints the splash screen and logs the version
    and every parameter.

    Parameters:
        params (Dict[str, Any]): A dictionary of parameters for baktfold.
            ``params["--output"]`` is read as the log directory unless
            ``no_log`` is set.
        subcommand (str): Subcommand indicating the baktfold operation.
        no_log (bool): No log file

    Returns:
        float: Start time of the baktfold process, as returned by ``time.time()``.
    """
    # Tear down any sinks from a prior call before installing fresh ones.
    _remove_baktfold_sinks()

    # get start time
    start_time = time.time()

    cfg.run_start = datetime.now()

    # initial logging stuff — track ids so they can be removed in end_baktfold.
    if not no_log:
        log_file = os.path.join(params["--output"], f"baktfold_{subcommand}_{start_time}.log")
        _BAKTFOLD_SINK_IDS.append(logger.add(log_file))
    # Any message logged at ERROR level terminates the process with status 1.
    _BAKTFOLD_SINK_IDS.append(logger.add(lambda _: sys.exit(1), level="ERROR"))

    print_splash()
    logger.info("baktfold: rapid & standardized annotation of bacterial genomes, MAGs & plasmids using protein structural information")

    logger.info(f"You are using baktfold version {get_version()}")
    logger.info("Repository homepage is https://github.com/gbouras13/baktfold")
    logger.info(f"You are running baktfold {subcommand}")
    logger.info("Listing parameters")
    for key, value in params.items():
        logger.info(f"Parameter: {key} {value}")

    return start_time

`clean_up_temporary_files(output, prefix)`

Clean up temporary files generated during the baktfold process.

Parameters:

Name	Type	Description	Default
`output`	`Path`	Path to the output directory.	required
`prefix`	`str`	prefix str	required

Returns:

Type	Description
`None`	None

Source code in src/baktfold/utils/util.py

def clean_up_temporary_files(output: Path, prefix: str) -> None:
    """
    Delete the intermediate files and directories baktfold leaves in ``output``.

    Parameters:
        output (Path): Path to the output directory.
        prefix (str): Prefix of the ``<prefix>_aa.fasta`` file to delete.

    Returns:
        None
    """
    out_dir = Path(output)

    # Work out every target first, then delete directories before files.
    stale_dirs = tuple(
        out_dir / dir_name for dir_name in ("result_db", "temp_db", "foldseek_db")
    )
    stale_files = tuple(
        out_dir / file_name
        for file_name in (
            f"{prefix}_aa.fasta",
            "foldseek_results_swissprot.tsv",
            "foldseek_results_afdb_clusters.tsv",
            "foldseek_results_pdb.tsv",
            "foldseek_results_custom.tsv",
            "foldseek_results_cath.tsv",
        )
    )

    for stale_dir in stale_dirs:
        remove_directory(stale_dir)

    for stale_file in stale_files:
        remove_file(stale_file)

`echo_click(msg, log=None)`

Prints a message to stderr (without adding a trailing newline) and optionally appends it to a log file.

Parameters:

Name	Type	Description	Default
`msg`	`str`	The message to print.	required
`log`	`str`	The path to the log file.	`None`

Returns:

Type	Description
	None

Source code in src/baktfold/utils/util.py

def echo_click(msg, log=None):
    """
    Echo a message to stderr and optionally append it to a log file.

    The message is emitted as-is: no trailing newline is added.

    Args:
      msg (str): The message to print.
      log (str): Path of a log file to append the message to; skipped when falsy.

    Returns:
      None
    """
    click.echo(msg, nl=False, err=True)
    if not log:
        return
    with open(log, "a") as log_handle:
        log_handle.write(msg)

`end_baktfold(start_time, subcommand)`

Finish baktfold process and log elapsed time.

Parameters:

Name	Type	Description	Default
`start_time`	`float`	Start time of the process.	required
`subcommand`	`str`	Subcommand name indicating the baktfold operation.	required

Returns:

Type	Description
`None`	None

Source code in src/baktfold/utils/util.py

def end_baktfold(start_time: float, subcommand: str) -> None:
    """
    Log the closing messages of a baktfold run and remove its logger sinks.

    Parameters:
        start_time (float): Start time of the process (a ``time.time()`` value).
        subcommand (str): Subcommand name indicating the baktfold operation.

    Returns:
        None
    """
    # Wall-clock seconds since ``start_time``, rounded to 2 decimals.
    elapsed_time = round(time.time() - start_time, 2)

    # The mm:ss summary is derived from the datetimes stored on ``cfg``.
    cfg.run_end = datetime.now()
    run_duration = (cfg.run_end - cfg.run_start).total_seconds()
    minutes = int(run_duration / 60)
    seconds = int(run_duration % 60)

    logger.info('If you use these results please cite Baktfold: https://github.com/gbouras13/baktfold')
    logger.info(f'baktfold {subcommand} successfully finished in {minutes:02}:{seconds:02} [mm:ss].')

    logger.info(f"baktfold {subcommand} has finished")
    logger.info(f"Elapsed time: {elapsed_time} seconds")

    # Leave the logger clean for a subsequent call (or test).
    _remove_baktfold_sinks()

`get_type_rank(f)`

Ranks eukaryotic features in the order gene -> mRNA -> CDS and gene -> tRNA; the ranks of 5'UTR and 3'UTR features are adjusted dynamically according to strand.

Source code in src/baktfold/utils/util.py

def get_type_rank(f):
    """
    Return the sort rank of a eukaryotic feature based on its type.

    Fixed ranks: gene (0), mRNA (1), cds (3), tRNA (6). UTR features are
    placed on either side of the cds according to strand: on '+' the 5'UTR
    ranks 2 and the 3'UTR ranks 4, otherwise the two are swapped. Any other
    type ranks 99. A missing 'strand' key is treated as '+'.
    """
    feature_type = f['type']
    on_plus_strand = f.get('strand', '+') == '+'

    # UTR ranks depend on strand, so they are resolved before the fixed table.
    if feature_type == bc.FEATURE_5UTR:
        return 2 if on_plus_strand else 4
    if feature_type == bc.FEATURE_3UTR:
        return 4 if on_plus_strand else 2

    fixed_ranks = {'gene': 0, 'mRNA': 1, 'cds': 3, 'tRNA': 6}
    return fixed_ranks.get(feature_type, 99)

`get_version()`

Returns the version number from the VERSION file.

Returns:

Name	Type	Description
`str`		The version number.

Source code in src/baktfold/utils/util.py

def get_version():
    """
    Read the version string from the VERSION file next to this module.

    Returns:
      str: The first line of the VERSION file, exactly as read (a trailing
      newline, if present in the file, is kept).
    """
    with open(baktfold_base("VERSION"), "r") as version_file:
        return version_file.readline()

`print_citation()`

Prints the contents of the CITATION file to stdout.

Returns:

Type	Description
	None

Source code in src/baktfold/utils/util.py

def print_citation():
    """
    Emit the contents of the CITATION file, line by line, through ``echo_click``.

    Returns:
      None
    """
    citation_path = baktfold_base("CITATION")
    with open(citation_path, "r") as citation_file:
        for citation_line in citation_file:
            echo_click(citation_line)

`print_splash()`

Prints the splash screen to stdout.

Returns:

Type	Description
	None

Source code in src/baktfold/utils/util.py

def print_splash():
    """
    Prints the splash screen to stdout.

    Returns:
      None
    """
    # Every backslash of the ASCII art is written doubled so that no invalid
    # escape sequence is used and none is collapsed: the last row needs two
    # consecutive backslashes to stay 40 columns wide like the other rows.
    click.echo(
        """\b

  _           _    _    __      _     _ 
 | |         | |  | |  / _|    | |   | |
 | |__   __ _| | _| |_| |_ ___ | | __| |
 | '_ \\ / _` | |/ / __|  _/ _ \\| |/ _` |
 | |_) | (_| |   <| |_| || (_) | | (_| |
 |_.__/ \\__,_|_|\\_\\\\__|_| \\___/|_|\\__,_|


"""
    )

`remove_directory(dir_path)`

Remove a directory and all its contents if it exists.

Parameters:

Name	Type	Description	Default
`dir_path`	`Path`	Path to the directory to remove.	required

Returns:

Type	Description
`None`	None

Source code in src/baktfold/utils/util.py

def remove_directory(dir_path: Path) -> None:
    """
    Delete a directory tree if the path exists; otherwise do nothing.

    Errors raised while deleting are ignored (``ignore_errors=True``).

    Parameters:
        dir_path (Path): Path to the directory to remove.

    Returns:
        None
    """
    if not dir_path.exists():
        return
    shutil.rmtree(dir_path, ignore_errors=True)

`remove_file(file_path)`

Remove a file if it exists.

Parameters:

Name	Type	Description	Default
`file_path`	`Path`	Path to the file to remove.	required

Returns:

Type	Description
`None`	None

Source code in src/baktfold/utils/util.py

def remove_file(file_path: Path) -> None:
    """
    Remove a file if it exists.

    The file is unlinked directly and a missing path is tolerated, instead of
    checking for existence first. This avoids a race between the check and
    the removal, and also removes dangling symlinks (for which ``exists()``
    reports False).

    Parameters:
        file_path (Path): Path to the file to remove.

    Returns:
        None
    """
    try:
        file_path.unlink()
    except FileNotFoundError:
        # Nothing to remove: absence is the desired end state.
        pass

`replace_pipe_in_fasta(input_path)`

Replace '~PIPE~' with '|' in FASTA headers, writing atomically.

Streams line-by-line to a sibling temp file and renames it onto input_path on success. A kill mid-write leaves the original intact.

Source code in src/baktfold/utils/util.py

def replace_pipe_in_fasta(input_path):
    """Rewrite a FASTA file so '~PIPE~' in header lines becomes '|'.

    The file is copied line by line to the temp path provided by
    ``atomic_write_path``; only lines starting with '>' are altered.  The
    rewritten copy takes the place of ``input_path`` when the with-block
    completes.
    """
    with atomic_write_path(input_path) as staged_path:
        with open(input_path, "r") as source, open(staged_path, "w") as staged:
            for record_line in source:
                is_header = record_line.startswith(">")
                staged.write(
                    record_line.replace("~PIPE~", "|") if is_header else record_line
                )

`sort_euk_feature_key(f)`

Sorts a feature dictionary by start, locus, type rank, and stop.

Parameters:

Name	Type	Description	Default
`f`	`dict`	The feature dictionary.	required

Returns:

Name	Type	Description
`tuple`		A tuple of the sorted values.

Source code in src/baktfold/utils/util.py

def sort_euk_feature_key(f):
    """
    Build the sort key tuple for a eukaryotic feature dictionary.

    Args:
      f (dict): The feature dictionary. Missing 'start'/'stop' values are
        treated as infinity.

    Returns:
      tuple: ``(start, 0, locus, type_rank, stop)`` for a feature that has a
      locus and a ranked type, otherwise ``(start, 1, '', 99, stop)``.
    """
    unbounded = float('inf')
    start = f.get('start', unbounded)
    stop = f.get('stop', unbounded)
    locus = f.get('locus')
    rank = get_type_rank(f)

    if not (locus and rank != 99):
        # No locus, or an unranked type: ordered by start (then stop) only.
        return (start, 1, '', 99, stop)

    # Same locus: order by type rank, with stop as the final tie-breaker.
    return (start, 0, locus, rank, stop)

`touch_file(path)`

Update the access and modification times of a file to the current time, creating the file if it does not exist.

Parameters:

Name	Type	Description	Default
`path`	`Path`	Path to the file.	required

Returns:

Type	Description
`None`	None

Source code in src/baktfold/utils/util.py

def touch_file(path: Path) -> None:
    """
    Create the file if needed and set its access/modification times to now.

    Existing content is left untouched because the file is opened in append mode.

    Parameters:
        path (Path): Path to the file.

    Returns:
        None
    """
    with open(path, mode="a"):
        # Omitting ``times`` stamps the file with the current time.
        os.utime(path)

`check_dependencies()`

Checks the dependencies and versions of non Python programs (i.e. Foldseek)

Returns:

Type	Description
`None`	None

Source code in src/baktfold/utils/validation.py

def check_dependencies() -> None:
    """
    Check that Foldseek can be launched and report the version it prints.

    Runs ``foldseek version``. If the process cannot be started, an error is
    logged and the program exits with status 1. If the reported version does
    not contain the recommended build id, warnings are logged and execution
    continues.

    Returns:
        None
    """
    recommended_major = 10
    recommended_commit = "941cd33"

    # FileNotFoundError and PermissionError are both OSError subclasses, so a
    # single OSError clause covers every launch failure handled here.
    try:
        process = sp.Popen(["foldseek", "version"], stdout=sp.PIPE, stderr=sp.STDOUT)
    except OSError as e:
        logger.error(
            f"Foldseek not found on PATH ({type(e).__name__}: {e}). "
            "Install foldseek and ensure it is on your PATH, then re-run baktfold."
        )
        sys.exit(1)

    raw_output, _ = process.communicate()
    foldseek_version = raw_output.decode().strip()

    if recommended_commit not in foldseek_version:
        logger.warning(f"Foldseek version found is v{foldseek_version}")
        logger.warning("baktfold is recommended to be run with Foldseek v10.941cd33")
        logger.warning("Using a different Foldseek version is likely to work without issue, but this cannot be guaranteed.")
        return

    logger.info(
        f"Foldseek version found is v{recommended_major}.{recommended_commit}"
    )
    logger.info("Foldseek version is ok")

`check_genbank_and_prokka(filepath, euk)`

Validate that an input file is a readable GenBank file and check whether it was annotated using Prokka. The function transparently supports compressed files (e.g., .gz, .bz2, .xz, .zst) via xopen.

Validation steps

• Attempts to parse the file as GenBank using Biopython.
• Logs an error and returns None if no GenBank records are found.
• Checks the COMMENT field of each record for a Prokka signature ("Annotated using prokka", case-insensitive).
• If no Prokka annotation is detected, a warning is logged but parsing continues, as the file is still valid GenBank.

Parameters

`filepath` : str

Path to the GenBank or compressed GenBank file.

`euk` : flag

Whether or not the input is eukaryotic (skips the Prokka check).

Returns

list[SeqRecord] or None A list of Biopython SeqRecord objects if parsing succeeds. Returns None if the file is not valid GenBank or cannot be parsed.

Source code in src/baktfold/utils/validation.py

def check_genbank_and_prokka(filepath, euk):
    """
    Parse an input file as GenBank and warn when it does not look like Prokka output.

    The file is opened through `xopen` in binary mode and decoded as UTF-8
    (undecodable bytes are replaced) before being handed to Biopython.

    Behaviour:
      • If parsing yields no records, an error is logged and None is returned.
      • Unless `euk` is set, each record's "comment" annotation is searched
        (case-insensitively) for "annotated using prokka"; when no record
        matches, warnings are logged and the records are still returned.
      • Any exception raised along the way is logged as a parsing error and
        None is returned.

    Parameters
    ----------
    filepath : str
        Path to the GenBank or compressed GenBank file.
    euk: flag
        whether or not the input is eukaryotic (skips the Prokka check)

    Returns
    -------
    list[SeqRecord] or None
        The parsed Biopython SeqRecord objects, or None when the file yields
        no records or cannot be parsed.
    """
    import io

    prokka_signature = "annotated using prokka"

    try:
        with xopen(filepath, "rb") as handle:
            # SeqIO.parse wants text, so decode the byte stream lazily rather
            # than reading the whole file into memory first.
            text_handle = io.TextIOWrapper(handle, encoding="utf-8", errors="replace")
            records = list(SeqIO.parse(text_handle, "genbank"))

        if not records:
            logger.error(f"Input file {filepath} is not GenBank format. Please check your input")
            return None

        if not euk:
            # Stops at the first record carrying the Prokka signature.
            from_prokka = any(
                prokka_signature in (rec.annotations.get("comment", "") or "").lower()
                for rec in records
            )
            if not from_prokka:
                logger.warning(f"Input file {filepath} does not appear to come from Prokka.")
                logger.warning("Conversion will proceed but no guarantee of success.")

    except Exception:
        logger.error(f"There was an error parsing {filepath}. Please check your input")
        return None

    return records

`instantiate_dirs(output_dir, force)`

Checks and instantiates the output directory.

Parameters:

Name	Type	Description	Default
`output_dir`	`Union[str, Path]`	Path to the output directory.	required
`force`	`bool`	Force flag indicating whether to overwrite existing directory.	required

Returns:

Name	Type	Description
`Path`	`Path`	Final output directory path.

Source code in src/baktfold/utils/validation.py

def instantiate_dirs(output_dir: Union[str, Path], force: bool) -> Path:
    """
    Checks and instantiates the output directory.

    With ``force`` set, an existing output directory is deleted first.
    Without it, an existing directory is reported via ``logger.error``.
    The directory is then created (including parents) if it does not exist.

    Parameters:
        output_dir (Union[str, Path]): Path to the output directory.
        force (bool): Force flag indicating whether to overwrite existing directory.

    Returns:
        Path: Final output directory path.
    """
    output_path = Path(output_dir)

    # Checks the output directory
    # remove outdir on force
    logger.info(f"Checking the output directory {output_dir}")
    if force is True:
        if output_path.exists():
            logger.info(f"Removing {output_dir} because --force was specified")
            shutil.rmtree(output_dir)
        else:
            logger.info(
                "--force was specified even though the output directory does not already exist. Continuing"
            )
    else:
        if output_path.exists():
            logger.error(
                "Output directory already exists and force was not specified. Please specify -f or --force to overwrite the output directory"
            )

    # instantiate outdir
    if not output_path.exists():
        output_path.mkdir(parents=True, exist_ok=True)

    # Return the path as promised by the signature (previously fell through to None).
    return output_path

`validate_outfile(outfile, force)`

Checks and instantiates the output file for baktfold convert-prokka

Parameters:

Name	Type	Description	Default
`outfile`	`Union[str, Path]`	Path to the output file.	required
`force`	`bool`	Force flag indicating whether to overwrite existing outfile.	required

Returns:

Name	Type	Description
`Path`	`Path`	Final output file path.

Source code in src/baktfold/utils/validation.py

def validate_outfile(outfile: Union[str, Path], force: bool) -> Path:
    """
    Checks the output file for baktfold convert-prokka

    With ``force`` set, an existing output file is deleted. Without it, an
    existing file is reported via ``logger.error``. The file itself is not
    created here.

    Parameters:
        outfile (Union[str, Path]): Path to the output file.
        force (bool): Force flag indicating whether to overwrite existing outfile.

    Returns:
        Path: Final output file path.
    """
    outfile_path = Path(outfile)

    # Check the output file; remove it when --force is given.
    logger.info(f"Checking the output file {outfile}")
    if force is True:
        if outfile_path.exists():
            logger.info(f"Removing {outfile} because --force was specified")
            outfile_path.unlink()
        else:
            logger.info(
                f"--force was specified even though the output file {outfile} does not already exist. Continuing"
            )
    else:
        if outfile_path.exists():
            logger.error(
                f"Output file {outfile} already exists and force was not specified. Please specify -f or --force to overwrite the output file"
            )

    # Return the path as promised by the signature (previously fell through to None).
    return outfile_path

`get_tophit(result_tsv, structures, cath=False)`

Process Foldseek output to extract the top hit per query.

Parameters:

Name	Type	Description	Default
`result_tsv`	`Path`	Path to the Foldseek result TSV file.	required
`structures`	`bool`	Flag indicating whether structures have been added.	required
`cath`	`bool`	Flag indicating whether this is for CATH database (all greedy besthits kept not just top)	`False`

Returns:

Type	Description
`pl.DataFrame`	pl.DataFrame: DataFrame containing the top hit(s) extracted from the Foldseek output.

Source code in src/baktfold/results/tophit.py

def get_tophit(
    result_tsv: Path,
    structures: bool,
    cath: bool = False
) -> pl.DataFrame:
    """
    Load a Foldseek result TSV and reduce it to the top hit per query.

    Args:
        result_tsv (Path): Path to the Foldseek result TSV file (no header row).
        structures (bool): When set, the TSV is read with two extra trailing
            columns, ``alntmscore`` and ``lddt``.
        cath (bool): When set, every row is kept instead of only the first
            row per query (used for the CATH database).

    Returns:
        pl.DataFrame: The processed hits with added ``qCov`` and ``tCov``
        columns. If the input contains no rows, the empty frame is returned
        without those extra columns.
    """

    logger.info("Processing Foldseek output")

    alignment_columns = [
        "query",
        "target",
        "bitscore",
        "fident",
        "evalue",
        "qStart",
        "qEnd",
        "qLen",
        "tStart",
        "tEnd",
        "tLen",
    ]
    structure_columns = ["alntmscore", "lddt"] if structures else []
    col_list = alignment_columns + structure_columns

    # infer_schema_length=None makes polars scan the whole file when
    # inferring dtypes (kept from the original to match the earlier
    # pandas-based output).
    try:
        foldseek_df = pl.read_csv(
            result_tsv,
            separator="\t",
            has_header=False,
            new_columns=col_list,
            infer_schema_length=None,
        )
    except pl.exceptions.NoDataError:
        # A 0-byte result file: fall back to an empty, all-string frame.
        foldseek_df = pl.DataFrame(schema={name: pl.Utf8 for name in col_list})

    # Restore the '|' characters that were escaped as ~PIPE~ in query names.
    restored_query = pl.col("query").str.replace_all("~PIPE~", "|", literal=True)
    foldseek_df = foldseek_df.with_columns(restored_query)

    # Nothing to post-process when Foldseek produced no hits.
    if foldseek_df.is_empty():
        logger.warning(
            "Foldseek found no hits whatsoever - please check your input if you expect hits"
        )
        return foldseek_df

    # Coverage = aligned span / sequence length, rounded to 2 decimals.
    query_coverage = (
        ((pl.col("qEnd") - pl.col("qStart")) / pl.col("qLen")).round(2).alias("qCov")
    )
    target_coverage = (
        ((pl.col("tEnd") - pl.col("tStart")) / pl.col("tLen")).round(2).alias("tCov")
    )
    # evalue is turned into text with Python's float repr so the written TSV
    # uses Python-style scientific notation (e.g. '1.5e-08').
    evalue_text = pl.col("evalue").map_elements(
        lambda v: repr(float(v)), return_dtype=pl.Utf8
    )
    foldseek_df = foldseek_df.with_columns(query_coverage, target_coverage, evalue_text)

    # Column order: everything up to qLen, qCov, the target block ending in
    # tCov, then any structure columns that follow tLen.
    q_block_end = col_list.index("qLen") + 1
    t_block_end = col_list.index("tLen") + 1
    new_column_order = (
        col_list[:q_block_end]
        + ["qCov", "tStart", "tEnd", "tLen", "tCov"]
        + col_list[t_block_end:]
    )
    foldseek_df = foldseek_df.select(new_column_order)

    if cath:
        # CATH keeps every hit, not just the first per query.
        return foldseek_df

    # Keep the first row per query; maintain_order=True preserves the file's
    # row order so "first" is the first hit listed for that query.
    return foldseek_df.unique(subset="query", keep="first", maintain_order=True)