Skip to content

Utils

Originally taken from Michael Hall's tbpore https://github.com/mbhall88/tbpore/blob/main/tbpore/external_tools.py

Also used by a variety of other tools (Dnaapler, Plassembler, Pharokka)

ExternalTool

Class for running external tools.

Parameters:

Name Type Description Default
tool str

The path to the tool to run.

required
input str

The input file.

required
output str

The output file.

required
params str

The parameters to pass to the tool.

required
logdir Path

The directory to store log files.

required

Attributes:

Name Type Description
command List[str]

The command to run.

out_log str

The path to the stdout log file.

err_log str

The path to the stderr log file.

Examples:

>>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
>>> tool.command
['tool', 'params', 'output', 'input']
>>> tool.out_log
'logdir/tool_<sha256 hex digest of the command>.out'
>>> tool.err_log
'logdir/tool_<sha256 hex digest of the command>.err'
Source code in src/baktfold/utils/external_tools.py
class ExternalTool:
    """
    Class for running external tools.

    The command is assembled once, at construction time, as
    ``tool params output input``. Each run writes its stdout/stderr to log
    files in ``logdir`` named ``<tool name>_<sha256 of the command>``.

    Args:
      tool (str): The path to the tool to run.
      input (str): The input file.
      output (str): The output file.
      params (str): The parameters to pass to the tool.
      logdir (Path): The directory to store log files.
      env (Optional[Dict[str, str]]): Extra env vars merged with
        os.environ for the subprocess. None == inherit unchanged.

    Attributes:
      command (List[str]): The command to run.
      out_log (str): The path to the stdout log file.
      err_log (str): The path to the stderr log file.
      env (Optional[Dict[str, str]]): Extra subprocess env vars.

    Examples:
      >>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
      >>> tool.command
      ['tool', 'params', 'output', 'input']
      >>> tool.out_log
      'logdir/tool_<sha256 hex digest of the command>.out'
      >>> tool.err_log
      'logdir/tool_<sha256 hex digest of the command>.err'
    """

    def __init__(
        self,
        tool: str,
        input: str,
        output: str,
        params: str,
        logdir: Path,
        env: Optional[Dict[str, str]] = None,
    ):
        """
        Initializes an ExternalTool object.

        Creates ``logdir`` (including parents) if it does not exist yet.

        Args:
          tool (str): The path to the tool to run.
          input (str): The input file.
          output (str): The output file.
          params (str): The parameters to pass to the tool.
          logdir (Path): The directory to store log files.
          env (Optional[Dict[str, str]]): Extra env vars merged with
            os.environ for the subprocess (e.g. CUDA_VISIBLE_DEVICES
            for multi-GPU foldseek). None == inherit unchanged.

        Attributes:
          command (List[str]): The command to run.
          out_log (str): The path to the stdout log file.
          err_log (str): The path to the stderr log file.
          env (Optional[Dict[str, str]]): Extra subprocess env vars.
        """
        logdir = Path(logdir)
        self.command: List[str] = self._build_command(tool, input, output, params)
        logdir.mkdir(parents=True, exist_ok=True)
        # The hash of the full command keeps the logs of different invocations
        # of the same tool apart.
        command_hash = hashlib.sha256(self.command_as_str.encode("utf-8")).hexdigest()
        tool_name = Path(tool).name
        logfile_prefix: Path = logdir / f"{tool_name}_{command_hash}"
        self.out_log = f"{logfile_prefix}.out"
        self.err_log = f"{logfile_prefix}.err"
        self.env = env

    @property
    def command_as_str(self) -> str:
        """
        Returns the command as a single shell-quoted string.

        Returns:
          str: The command as a string.

        Examples:
          >>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
          >>> tool.command_as_str
          'tool params output input'
        """
        return shlex.join(self.command)

    @staticmethod
    def _build_command(tool: str, input: str, output: str, params: str) -> List[str]:
        """
        Builds the command to run.

        The four strings are joined with single spaces and then tokenized with
        ``shlex.split``, so each of them may contain several shell-style
        tokens, and an empty string contributes no token at all.

        Args:
          tool (str): The path to the tool to run.
          input (str): The input file.
          output (str): The output file.
          params (str): The parameters to pass to the tool.

        Returns:
          List[str]: The command to run.

        Examples:
          >>> ExternalTool._build_command("tool", "input", "output", "params")
          ['tool', 'params', 'output', 'input']
        """
        # note: shlex.join does not allow us to shlex.split() later
        # this is explicitly a " ".join()
        command = " ".join([tool, params, output, input])
        escaped_command = shlex.split(command)
        return escaped_command

    @staticmethod
    def _merged_env(env: Optional[Dict[str, str]]) -> Optional[Dict[str, str]]:
        """
        Builds the environment for the subprocess.

        Args:
          env (Optional[Dict[str, str]]): Extra env vars, or None.

        Returns:
          Optional[Dict[str, str]]: os.environ overlaid with ``env``, or None
            (inherit the parent environment) when ``env`` is None or empty.
        """
        return {**os.environ, **env} if env else None

    def run(self) -> None:
        """
        Runs the tool, redirecting stdout and stderr to the log files.

        Raises:
          subprocess.CalledProcessError: If the tool exits with a non-zero code.

        Examples:
          >>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
          >>> tool.run()
        """
        with open(self.out_log, "w") as stdout_fh, open(self.err_log, "w") as stderr_fh:
            print(f"Command line: {self.command_as_str}", file=stderr_fh)
            if self.env:
                print(f"Extra env: {self.env}", file=stderr_fh)
            logger.info(f"Started running {self.command_as_str} ...")
            self._run_core(
                self.command,
                stdout_fh=stdout_fh,
                stderr_fh=stderr_fh,
                env=self.env,
            )
            logger.info(f"Done running {self.command_as_str}")

    # stream to terminal (aria2c) so the user knows how long it is taking
    def run_stream(self) -> None:
        """
        Runs the tool and streams the output to the terminal.

        stderr of the tool is merged into stdout; every line is echoed to the
        terminal and also written to the stdout log. The stderr log only
        receives the command line (and the extra env vars, if any).

        Raises:
          subprocess.CalledProcessError: If the tool exits with a non-zero code.

        Examples:
          >>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
          >>> tool.run_stream()
        """
        with open(self.out_log, "w") as stdout_fh, open(self.err_log, "w") as stderr_fh:
            print(f"Command line: {self.command_as_str}", file=stderr_fh)
            if self.env:
                print(f"Extra env: {self.env}", file=stderr_fh)
            logger.info(f"Started running {self.command_as_str} ...")

            # The context manager closes the pipe and waits for the child even
            # if the loop below is interrupted.
            with subprocess.Popen(
                self.command,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                bufsize=1,
                universal_newlines=True,
                env=self._merged_env(self.env),  # honour self.env, like run()
            ) as process:
                for line in process.stdout:
                    print(line, end="")         # Live output to terminal
                    stdout_fh.write(line)       # Also write to stdout log

            return_code = process.returncode

            logger.info(f"Done running {self.command_as_str}")

            if return_code != 0:
                raise subprocess.CalledProcessError(return_code, self.command)

    @staticmethod
    def _run_core(
        command: List[str],
        stdout_fh,
        stderr_fh,
        env: Optional[Dict[str, str]] = None,
    ) -> None:
        """
        Runs the command with stdout/stderr redirected to the given handles.

        Args:
          command (List[str]): The command to run.
          stdout_fh: The file handle to write stdout to.
          stderr_fh: The file handle to write stderr to.
          env (Optional[Dict[str, str]]): Extra env vars merged with
            os.environ for the subprocess. None == inherit.

        Raises:
          subprocess.CalledProcessError: If the command exits with a non-zero code.
        """
        subprocess.check_call(
            command,
            stdout=stdout_fh,
            stderr=stderr_fh,
            env=ExternalTool._merged_env(env),
        )

    @staticmethod
    def _exit_on_failure(
        tool: "ExternalTool",
        error: subprocess.CalledProcessError,
        ctx: Optional[click.Context] = None,
    ) -> None:
        """
        Logs a failed tool invocation and terminates with exit status 1.

        Args:
          tool (ExternalTool): The tool whose run failed.
          error (subprocess.CalledProcessError): The error raised by the run.
          ctx (Optional[click.Context]): If given, exit through ``ctx.exit``;
            otherwise through ``sys.exit``.
        """
        logger.error(
            f"Error calling {tool.command_as_str} (return code {error.returncode})"
        )
        logger.error(f"Please check stdout log file: {tool.out_log}")
        logger.error(f"Please check stderr log file: {tool.err_log}")
        logger.error("Temporary files are preserved for debugging")
        logger.error("Exiting...")

        if ctx is not None:
            ctx.exit(1)
        else:
            sys.exit(1)

    @staticmethod
    def run_tools(
        tools_to_run: Tuple["ExternalTool", ...], ctx: Optional[click.Context] = None
    ) -> None:
        """
        Runs the given tools one after another, exiting on the first failure.

        Args:
          tools_to_run (Tuple[ExternalTool]): The tools to run, in order.
          ctx (Optional[click.Context]): The click context.

        Examples:
          >>> tool1 = ExternalTool("tool1", "input1", "output1", "params1", "logdir")
          >>> tool2 = ExternalTool("tool2", "input2", "output2", "params2", "logdir")
          >>> ExternalTool.run_tools((tool1, tool2))
        """
        for tool in tools_to_run:
            try:
                tool.run()
            except subprocess.CalledProcessError as error:
                ExternalTool._exit_on_failure(tool, error, ctx)

    # Only one tool
    @staticmethod
    def run_tool(tool: "ExternalTool", ctx: Optional[click.Context] = None) -> None:
        """
        Runs the given external tool.

        A failing tool does not raise to the caller: the error is logged and
        the process exits with status 1 (via ``ctx.exit`` when a context is
        given, ``sys.exit`` otherwise).

        Args:
          tool (ExternalTool): The external tool to run.
          ctx (Optional[click.Context]): The click context to use. Defaults to None.

        Returns:
          None.

        Examples:
          >>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
          >>> ExternalTool.run_tool(tool)
        """
        try:
            tool.run()
        except subprocess.CalledProcessError as error:
            ExternalTool._exit_on_failure(tool, error, ctx)

    # Only download - so can print the aria2c output to screen
    @staticmethod
    def run_download(tool: "ExternalTool", ctx: Optional[click.Context] = None) -> None:
        """
        Runs the given external tool and prints its output (e.g. aria2c) to the screen.

        A failing tool does not raise to the caller: the error is logged and
        the process exits with status 1 (via ``ctx.exit`` when a context is
        given, ``sys.exit`` otherwise).

        Args:
          tool (ExternalTool): The external tool to run.
          ctx (Optional[click.Context]): The click context to use. Defaults to None.

        Returns:
          None.

        Examples:
          >>> tool = ExternalTool("aria2c", "", "", "params", "logdir")
          >>> ExternalTool.run_download(tool)
        """
        try:
            tool.run_stream()
        except subprocess.CalledProcessError as error:
            ExternalTool._exit_on_failure(tool, error, ctx)

command_as_str: str property

Returns the command as a string.

Returns:

Name Type Description
str str

The command as a string.

Examples:

>>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
>>> tool.command_as_str
"tool params output input"

__init__(tool, input, output, params, logdir, env=None)

Initializes an ExternalTool object.

Parameters:

Name Type Description Default
tool str

The path to the tool to run.

required
input str

The input file.

required
output str

The output file.

required
params str

The parameters to pass to the tool.

required
logdir Path

The directory to store log files.

required
env Optional[Dict[str, str]]

Extra env vars merged with os.environ for the subprocess (e.g. CUDA_VISIBLE_DEVICES for multi-GPU foldseek). None == inherit unchanged.

None

Attributes:

Name Type Description
command List[str]

The command to run.

out_log str

The path to the stdout log file.

err_log str

The path to the stderr log file.

env Optional[Dict[str, str]]

Extra subprocess env vars.

Examples:

>>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
>>> tool.command
['tool', 'params', 'output', 'input']
>>> tool.out_log
'logdir/tool_<sha256 hex digest of the command>.out'
>>> tool.err_log
'logdir/tool_<sha256 hex digest of the command>.err'
Source code in src/baktfold/utils/external_tools.py
def __init__(
    self,
    tool: str,
    input: str,
    output: str,
    params: str,
    logdir: Path,
    env: Optional[Dict[str, str]] = None,
):
    """
    Set up the command line and the log-file locations for one tool run.

    The log directory is created (with parents) when it is missing. Both log
    files share the stem ``<tool file name>_<sha256 hex digest of the command>``.

    Args:
      tool (str): The path to the tool to run.
      input (str): The input file.
      output (str): The output file.
      params (str): The parameters to pass to the tool.
      logdir (Path): The directory to store log files.
      env (Optional[Dict[str, str]]): Extra env vars merged with
        os.environ for the subprocess (e.g. CUDA_VISIBLE_DEVICES
        for multi-GPU foldseek). None == inherit unchanged.

    Attributes:
      command (List[str]): The command to run.
      out_log (str): The path to the stdout log file.
      err_log (str): The path to the stderr log file.
      env (Optional[Dict[str, str]]): Extra subprocess env vars.

    Examples:
      >>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
      >>> tool.command
      ['tool', 'params', 'output', 'input']
    """
    log_root = Path(logdir)
    self.command: List[str] = self._build_command(tool, input, output, params)
    log_root.mkdir(parents=True, exist_ok=True)

    # Digest of the rendered command distinguishes logs of different invocations.
    digest = hashlib.sha256(self.command_as_str.encode("utf-8")).hexdigest()
    stem: Path = log_root / f"{Path(tool).name}_{digest}"

    self.out_log = f"{stem}.out"
    self.err_log = f"{stem}.err"
    self.env = env

run()

Runs the tool.

Examples:

>>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
>>> tool.run()
Source code in src/baktfold/utils/external_tools.py
def run(self) -> None:
    """
    Execute the command, sending its stdout and stderr to the log files.

    The stderr log starts with the command line and, when extra env vars
    are configured, a line listing them.

    Examples:
      >>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
      >>> tool.run()
    """
    with open(self.out_log, "w") as out_handle, open(self.err_log, "w") as err_handle:
        # Header lines so the log is self-describing.
        print(f"Command line: {self.command_as_str}", file=err_handle)
        if self.env:
            print(f"Extra env: {self.env}", file=err_handle)

        logger.info(f"Started running {self.command_as_str} ...")
        self._run_core(
            self.command, stdout_fh=out_handle, stderr_fh=err_handle, env=self.env
        )
        logger.info(f"Done running {self.command_as_str}")

run_download(tool, ctx=None) staticmethod

Runs the given external tool and prints the aria2c output to the screen.

Parameters:

Name Type Description Default
tool ExternalTool

The external tool to run.

required
ctx Optional[click.Context]

The click context to use. Defaults to None.

None

Returns:

Type Description
None

None.

Raises:

Type Description
subprocess.CalledProcessError

Not propagated to the caller: a failure of the external tool is caught, logged, and the process exits with status 1.

Examples:

>>> tool = ExternalTool("aria2c", "", "", "params", "logdir")
>>> ExternalTool.run_download(tool)
Source code in src/baktfold/utils/external_tools.py
@staticmethod
def run_download(tool: "ExternalTool", ctx: Optional[click.Context] = None) -> None:
    """
    Run a download tool via ``run_stream`` so its output (e.g. aria2c) is shown on screen.

    When the tool fails, the failure is logged and the process exits with
    status 1 — through ``ctx.exit`` if a context was supplied, otherwise
    through ``sys.exit``.

    Args:
      tool (ExternalTool): The external tool to run.
      ctx (Optional[click.Context]): The click context to use. Defaults to None.

    Returns:
      None.

    Examples:
      >>> tool = ExternalTool("aria2c", "", "", "params", "logdir")
      >>> ExternalTool.run_download(tool)
    """
    try:
        tool.run_stream()
    except subprocess.CalledProcessError as error:
        failure_report = (
            f"Error calling {tool.command_as_str} (return code {error.returncode})",
            f"Please check stdout log file: {tool.out_log}",
            f"Please check stderr log file: {tool.err_log}",
            "Temporary files are preserved for debugging",
            "Exiting...",
        )
        for message in failure_report:
            logger.error(message)

        terminate = ctx.exit if ctx else sys.exit
        terminate(1)

run_stream()

Runs the tool and streams the output to the terminal.

Examples:

>>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
>>> tool.run_stream()
Source code in src/baktfold/utils/external_tools.py
def run_stream(self) -> None:
    """
    Runs the tool and streams the output to the terminal.

    stderr of the tool is merged into stdout; every line is echoed to the
    terminal and also written to the stdout log. The stderr log only
    receives the command line (and the extra env vars, if any).

    Raises:
      subprocess.CalledProcessError: If the tool exits with a non-zero code.

    Examples:
      >>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
      >>> tool.run_stream()
    """
    # Same rule as run(): overlay the extra vars on the parent environment,
    # or pass None so the child simply inherits it.
    child_env = {**os.environ, **self.env} if self.env else None

    with open(self.out_log, "w") as stdout_fh, open(self.err_log, "w") as stderr_fh:
        print(f"Command line: {self.command_as_str}", file=stderr_fh)
        if self.env:
            print(f"Extra env: {self.env}", file=stderr_fh)
        logger.info(f"Started running {self.command_as_str} ...")

        # The context manager closes the pipe and waits for the child even
        # if the loop below is interrupted.
        with subprocess.Popen(
            self.command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            bufsize=1,
            universal_newlines=True,
            env=child_env,
        ) as process:
            for line in process.stdout:
                print(line, end="")         # Live output to terminal
                stdout_fh.write(line)       # Also write to stdout log

        return_code = process.returncode

        logger.info(f"Done running {self.command_as_str}")

        if return_code != 0:
            raise subprocess.CalledProcessError(return_code, self.command)

run_tool(tool, ctx=None) staticmethod

Runs the given external tool.

Parameters:

Name Type Description Default
tool ExternalTool

The external tool to run.

required
ctx Optional[click.Context]

The click context to use. Defaults to None.

None

Returns:

Type Description
None

None.

Raises:

Type Description
subprocess.CalledProcessError

Not propagated to the caller: a failure of the external tool is caught, logged, and the process exits with status 1.

Examples:

>>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
>>> ExternalTool.run_tool(tool)
Source code in src/baktfold/utils/external_tools.py
@staticmethod
def run_tool(tool: "ExternalTool", ctx: Optional[click.Context] = None) -> None:
    """
    Run a single external tool via ``run``.

    When the tool fails, the failure is logged and the process exits with
    status 1 — through ``ctx.exit`` if a context was supplied, otherwise
    through ``sys.exit``.

    Args:
      tool (ExternalTool): The external tool to run.
      ctx (Optional[click.Context]): The click context to use. Defaults to None.

    Returns:
      None.

    Examples:
      >>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
      >>> ExternalTool.run_tool(tool)
    """
    try:
        tool.run()
    except subprocess.CalledProcessError as error:
        failure_report = (
            f"Error calling {tool.command_as_str} (return code {error.returncode})",
            f"Please check stdout log file: {tool.out_log}",
            f"Please check stderr log file: {tool.err_log}",
            "Temporary files are preserved for debugging",
            "Exiting...",
        )
        for message in failure_report:
            logger.error(message)

        terminate = ctx.exit if ctx else sys.exit
        terminate(1)

run_tools(tools_to_run, ctx=None) staticmethod

Runs a list of tools.

Parameters:

Name Type Description Default
tools_to_run Tuple[ExternalTool]

The list of tools to run.

required
ctx Optional[click.Context]

The click context.

None

Examples:

>>> tool1 = ExternalTool("tool1", "input1", "output1", "params1", "logdir")
>>> tool2 = ExternalTool("tool2", "input2", "output2", "params2", "logdir")
>>> ExternalTool.run_tools((tool1, tool2))
>>> ExternalTool.run_tools((tool1, tool2), ctx)
Source code in src/baktfold/utils/external_tools.py
@staticmethod
def run_tools(
    tools_to_run: Tuple["ExternalTool", ...], ctx: Optional[click.Context] = None
) -> None:
    """
    Run several tools one after another.

    On the first tool that fails, the failure is logged and the process
    exits with status 1 — through ``ctx.exit`` if a context was supplied,
    otherwise through ``sys.exit``.

    Args:
      tools_to_run (Tuple[ExternalTool]): The tools to run, in order.
      ctx (Optional[click.Context]): The click context.

    Examples:
      >>> tool1 = ExternalTool("tool1", "input1", "output1", "params1", "logdir")
      >>> tool2 = ExternalTool("tool2", "input2", "output2", "params2", "logdir")
      >>> ExternalTool.run_tools((tool1, tool2))
    """
    terminate = ctx.exit if ctx else sys.exit

    for tool in tools_to_run:
        try:
            tool.run()
        except subprocess.CalledProcessError as error:
            failure_report = (
                f"Error calling {tool.command_as_str} (return code {error.returncode})",
                f"Please check stdout log file: {tool.out_log}",
                f"Please check stderr log file: {tool.err_log}",
                "Temporary files are preserved for debugging",
                "Exiting...",
            )
            for message in failure_report:
                logger.error(message)

            terminate(1)

log_fmt = '[<green>{time:YYYY-MM-DD HH:mm:ss}</green>] <level>{level: <8}</level> | <level>{message}</level>' module-attribute

begin and end functions

OrderedCommands

Bases: click.Group

This class will preserve the order of subcommands, which is useful when printing --help

Source code in src/baktfold/utils/util.py
class OrderedCommands(click.Group):
    """Click group that lists its subcommands unsorted, which is useful when printing --help"""

    def list_commands(self, ctx: click.Context):
        """
        Return the subcommand names exactly as ``self.commands`` iterates them.

        Args:
          ctx (click.Context): The click context (not used here).

        Returns:
          list: The subcommand names in the iteration order of
            ``self.commands`` (presumably registration order — assumes
            ``self.commands`` is an insertion-ordered mapping).
        """
        return [*self.commands]

list_commands(ctx)

Returns a list of subcommands in the order they were added.

Parameters:

Name Type Description Default
ctx click.Context

The click context.

required

Returns:

Name Type Description
list

A list of subcommands in the order they were added.

Source code in src/baktfold/utils/util.py
def list_commands(self, ctx: click.Context):
    """
    Return the subcommand names exactly as ``self.commands`` iterates them.

    Args:
      ctx (click.Context): The click context (not used here).

    Returns:
      list: The subcommand names in the iteration order of
        ``self.commands`` (presumably registration order — assumes
        ``self.commands`` is an insertion-ordered mapping).
    """
    return [*self.commands]

baktfold_base(rel_path)

Returns the absolute path to the given relative path.

Parameters:

Name Type Description Default
rel_path str

The relative path to the file.

required

Returns:

Name Type Description
str

The absolute path to the file.

Source code in src/baktfold/utils/util.py
def baktfold_base(rel_path):
    """
    Resolve a path relative to the directory that contains this module.

    Args:
      rel_path (str): The path relative to this module's directory.

    Returns:
      str: ``rel_path`` joined onto the (symlink-resolved) directory of this file.
    """
    module_dir = os.path.dirname(os.path.realpath(__file__))
    return os.path.join(module_dir, rel_path)

begin_baktfold(params, subcommand, no_log=False)

Begin baktfold process.

Parameters:

Name Type Description Default
params Dict[str, Any]

A dictionary of parameters for baktfold.

required
subcommand str

Subcommand indicating the baktfold operation.

required
no_log bool

If True, do not write a log file to the output directory.

False

Returns:

Name Type Description
float float

Start time of the baktfold process.

Source code in src/baktfold/utils/util.py
def begin_baktfold(params: Dict[str, Any], subcommand: str, no_log: bool = False) -> float:
    """
    Begin baktfold process.

    Records the start time, configures logging, prints the splash screen and
    logs the version, the subcommand and every parameter.

    Parameters:
        params (Dict[str, Any]): A dictionary of parameters for baktfold.
            Must contain "--output" unless ``no_log`` is True.
        subcommand (str): Subcommand indicating the baktfold operation.
        no_log (bool): If True, do not write a log file to the output directory.

    Returns:
        float: Start time of the baktfold process (``time.time()`` seconds).
    """
    # get start time
    start_time = time.time()

    # wall-clock start, read back by end_baktfold
    cfg.run_start = datetime.now()

    # initial logging stuff
    if not no_log:
        log_file = os.path.join(params["--output"], f"baktfold_{subcommand}_{start_time}.log")
        # adds log file
        logger.add(log_file)
    # Sink that calls sys.exit(1) for every message at ERROR level or above.
    # NOTE(review): this looks intended to abort the run on any logger.error —
    # confirm how the logging library treats SystemExit raised inside a sink.
    logger.add(lambda _: sys.exit(1), level="ERROR")

    print_splash()
    logger.info("baktfold: rapid & standardized annotation of bacterial genomes, MAGs & plasmids using protein structural information")

    logger.info(f"You are using baktfold version {get_version()}")
    logger.info("Repository homepage is https://github.com/gbouras13/baktfold")
    logger.info(f"You are running baktfold {subcommand}")
    logger.info("Listing parameters")
    for key, value in params.items():
        logger.info(f"Parameter: {key} {value}")

    return start_time

clean_up_temporary_files(output, prefix)

Clean up temporary files generated during the baktfold process.

Parameters:

Name Type Description Default
output Path

Path to the output directory.

required
prefix str

Prefix of the `<prefix>_aa.fasta` file to remove from the output directory.

required

Returns:

Type Description
None

None

Source code in src/baktfold/utils/util.py
def clean_up_temporary_files(output: Path, prefix: str) -> None:
    """
    Delete the intermediate directories and files of a baktfold run.

    Parameters:
        output (Path): Path to the output directory.
        prefix (str): Prefix of the ``<prefix>_aa.fasta`` file to delete.

    Returns:
        None
    """
    out_dir = Path(output)

    # Directories first, then the individual files.
    for dir_name in ("result_db", "temp_db", "foldseek_db"):
        remove_directory(out_dir / dir_name)

    temporary_files = (
        f"{prefix}_aa.fasta",
        "foldseek_results_swissprot.tsv",
        "foldseek_results_afdb_clusters.tsv",
        "foldseek_results_pdb.tsv",
        "foldseek_results_custom.tsv",
        "foldseek_results_cath.tsv",
    )
    for file_name in temporary_files:
        remove_file(out_dir / file_name)

echo_click(msg, log=None)

Prints a message to stderr (without appending a newline) and optionally appends it to a log file.

Parameters:

Name Type Description Default
msg str

The message to print.

required
log str

The path to the log file.

None

Returns:

Type Description

None

Source code in src/baktfold/utils/util.py
def echo_click(msg, log=None):
    """
    Echo a message through click and, if requested, append it to a log file.

    The message is emitted with ``err=True`` (i.e. on stderr) and without a
    trailing newline being added.

    Args:
      msg (str): The message to print.
      log (str): The path to the log file; nothing is written when falsy.

    Returns:
      None
    """
    click.echo(msg, nl=False, err=True)
    if not log:
        return
    with open(log, "a") as log_handle:
        log_handle.write(msg)

end_baktfold(start_time, subcommand)

Finish baktfold process and log elapsed time.

Parameters:

Name Type Description Default
start_time float

Start time of the process.

required
subcommand str

Subcommand name indicating the baktfold operation.

required

Returns:

Type Description
None

None

Source code in src/baktfold/utils/util.py
def end_baktfold(start_time: float, subcommand: str) -> None:
    """
    Log the closing messages of a baktfold run, including how long it took.

    Parameters:
        start_time (float): Start time of the process (``time.time()`` seconds).
        subcommand (str): Subcommand name indicating the baktfold operation.

    Returns:
        None
    """
    # Seconds since start_time, rounded to two decimals
    elapsed_time = round(time.time() - start_time, 2)

    # Wall-clock duration between cfg.run_start and now, shown as mm:ss
    cfg.run_end = datetime.now()
    run_duration = (cfg.run_end - cfg.run_start).total_seconds()
    minutes = int(run_duration / 60)
    seconds = int(run_duration % 60)

    logger.info('If you use these results please cite Baktfold: https://github.com/gbouras13/baktfold')
    logger.info(f'baktfold {subcommand} successfully finished in {minutes:02}:{seconds:02} [mm:ss].')

    logger.info(f"baktfold {subcommand} has finished")
    logger.info(f"Elapsed time: {elapsed_time} seconds")

get_type_rank(f)

Ranks eukaryotic features in the order gene -> mRNA -> CDS (and gene -> tRNA); the rank of 5'UTR and 3'UTR features depends on the strand.

Source code in src/baktfold/utils/util.py
def get_type_rank(f):
    """
    Rank a eukaryotic feature for sorting: gene -> mRNA -> CDS and gene -> tRNA.

    UTRs are placed around the CDS depending on the strand: on '+' the 5'UTR
    ranks before the CDS and the 3'UTR after it; on any other strand value
    the two are swapped. Feature types without a rank get 99.
    """
    feature_type = f['type']
    strand = f.get('strand', '+')  # a missing strand counts as '+'

    # strand-dependent UTR ranks
    if feature_type == bc.FEATURE_5UTR:
        if strand == '+':
            return 2
        return 4
    if feature_type == bc.FEATURE_3UTR:
        if strand == '+':
            return 4
        return 2

    # strand-independent ranks; anything else becomes 99
    fixed_ranks = {'gene': 0, 'mRNA': 1, 'cds': 3, 'tRNA': 6}
    return fixed_ranks.get(feature_type, 99)

get_version()

Returns the version number from the VERSION file.

Returns:

Name Type Description
str

The version number.

Source code in src/baktfold/utils/util.py
def get_version():
    """
    Returns the version number from the VERSION file.

    Only the first line of the file is read; surrounding whitespace
    (including the line terminator) is stripped so the value can be embedded
    in log messages without introducing a line break.

    Returns:
      str: The version number.
    """
    with open(baktfold_base("VERSION"), "r") as f:
        version = f.readline().strip()
    return version

print_citation()

Prints the contents of the CITATION file to stdout.

Returns:

Type Description

None

Source code in src/baktfold/utils/util.py
def print_citation():
    """
    Echo the CITATION file line by line through ``echo_click``.

    Returns:
      None
    """
    citation_path = baktfold_base("CITATION")
    with open(citation_path, "r") as handle:
        for citation_line in handle:
            echo_click(citation_line)

print_splash()

Prints the splash screen to stdout.

Returns:

Type Description

None

Source code in src/baktfold/utils/util.py
def print_splash():
    """
    Prints the splash screen (ASCII-art banner) via click.echo.

    Returns:
      None
    """
    # Every literal backslash of the banner is written as "\\": sequences
    # such as "\ ", "\|" and "\_" are invalid escapes (SyntaxWarning on
    # recent Python versions). The emitted text is unchanged.
    click.echo(
        """\b

  _           _    _    __      _     _ 
 | |         | |  | |  / _|    | |   | |
 | |__   __ _| | _| |_| |_ ___ | | __| |
 | '_ \\ / _` | |/ / __|  _/ _ \\| |/ _` |
 | |_) | (_| |   <| |_| || (_) | | (_| |
 |_.__/ \\__,_|_|\\_\\__|_| \\___/|_|\\__,_|


"""
    )

remove_directory(dir_path)

Remove a directory and all its contents if it exists.

Parameters:

Name Type Description Default
dir_path Path

Path to the directory to remove.

required

Returns:

Type Description
None

None

Source code in src/baktfold/utils/util.py
def remove_directory(dir_path: Path) -> None:
    """
    Remove a directory and all its contents if it exists.

    Parameters:
        dir_path (Path): Path to the directory to remove. A plain ``str``
            path is accepted as well.

    Returns:
        None
    """
    # Coerce so callers passing a str do not fail on the missing .exists()
    target = Path(dir_path)
    if target.exists():
        shutil.rmtree(target)

remove_file(file_path)

Remove a file if it exists.

Parameters:

Name Type Description Default
file_path Path

Path to the file to remove.

required

Returns:

Type Description
None

None

Source code in src/baktfold/utils/util.py
def remove_file(file_path: Path) -> None:
    """
    Remove a file if it exists.

    Parameters:
        file_path (Path): Path to the file to remove. A plain ``str`` path
            is accepted as well.

    Returns:
        None
    """
    # Try the removal directly instead of checking first: this avoids a race
    # between the existence check and the unlink.
    try:
        Path(file_path).unlink()
    except FileNotFoundError:
        pass  # nothing to remove

replace_pipe_in_fasta(input_path)

Reads a FASTA with Biopython, replace '~PIPE~' with '|' in headers, and write the result.

Source code in src/baktfold/utils/util.py
def replace_pipe_in_fasta(input_path):
    """
    Rewrite a FASTA file in place, turning the '~PIPE~' placeholder in record
    ids and descriptions back into '|'.

    Args:
      input_path: Path of the FASTA file; it is parsed and then overwritten.
    """
    placeholder, pipe = "~PIPE~", "|"

    def _restore(rec):
        rec.id = rec.id.replace(placeholder, pipe)
        rec.description = rec.description.replace(placeholder, pipe)
        return rec

    # Build the full list before writing: the output path is the same file
    # that is being parsed, so every record must be read first.
    restored = [_restore(rec) for rec in SeqIO.parse(input_path, "fasta")]

    SeqIO.write(restored, input_path, "fasta")

sort_euk_feature_key(f)

Builds the sort key for a feature dictionary so that features are ordered by start, locus, type rank, and stop.

Parameters:

Name Type Description Default
f dict

The feature dictionary.

required

Returns:

Name Type Description
tuple

A tuple of the sorted values.

Source code in src/baktfold/utils/util.py
def sort_euk_feature_key(f):
    """
    Build the sort key for a feature dictionary.

    Features that carry a locus and a type rank other than 99 are keyed as
    ``(start, 0, locus, type_rank, stop)``; everything else is keyed as
    ``(start, 1, '', 99, stop)``. Missing 'start' / 'stop' default to infinity.

    Args:
      f (dict): The feature dictionary.

    Returns:
      tuple: The five-element sort key.
    """
    infinity = float('inf')
    begin = f.get('start', infinity)
    end = f.get('stop', infinity)
    locus_tag = f.get('locus')
    rank = get_type_rank(f)

    # No locus, or a rank of 99: only start (then stop) distinguishes features.
    if not locus_tag or rank == 99:
        return (begin, 1, '', 99, end)

    # Same locus: order by type rank, then by stop (e.g. several CDS).
    return (begin, 0, locus_tag, rank, end)

touch_file(path)

Update the access and modification times of a file to the current time, creating the file if it does not exist.

Parameters:

Name Type Description Default
path Path

Path to the file.

required

Returns:

Type Description
None

None

Source code in src/baktfold/utils/util.py
def touch_file(path: Path) -> None:
    """
    Create the file if it is missing and set its access and modification
    times to the current time.

    Parameters:
        path (Path): File to touch.

    Returns:
        None
    """
    # Append mode creates a missing file without truncating an existing one.
    with open(path, mode="a"):
        # times=None asks os.utime to use the current time.
        os.utime(path, times=None)

check_dependencies()

Checks the dependencies and versions of non Python programs (i.e. Foldseek)

Returns:

Type Description
None

None

Source code in src/baktfold/utils/validation.py
def check_dependencies() -> None:
    """
    Checks the dependencies and versions of non Python programs (i.e. Foldseek)

    Runs ``foldseek version`` and compares the reported string against the
    recommended build (v10.941cd33). A different version only produces
    warnings; a missing executable stops the program.

    Parameters:
        None

    Returns:
        None

    Raises:
        SystemExit: If the ``foldseek`` executable cannot be launched.
    """

    #############
    # foldseek
    #############
    try:
        completed = sp.run(
            ["foldseek", "version"], stdout=sp.PIPE, stderr=sp.STDOUT
        )
    except OSError:
        # OSError covers FileNotFoundError / PermissionError raised when the
        # executable cannot be started.
        logger.error("Foldseek not found. Please reinstall baktfold.")
        # Exit explicitly instead of falling through to code that needs the
        # process output; this does not rely on any exit-on-error log sink
        # being registered.
        raise SystemExit(1)

    # stderr is merged into stdout above, so this is everything foldseek printed
    foldseek_version = completed.stdout.decode().strip()

    if "941cd33" in foldseek_version:
        foldseek_major_version = 10
        foldseek_minor_version = "941cd33"
        logger.info(
            f"Foldseek version found is v{foldseek_major_version}.{foldseek_minor_version}"
        )
    else:
        logger.warning(f"Foldseek version found is v{foldseek_version}")
        logger.warning("baktfold is recommended to be run with Foldseek v10.941cd33")
        logger.warning("Using a different Foldseek version is likely to work without issue, but this cannot be guaranteed.")

    logger.info("Foldseek version is ok")

check_genbank_and_prokka(filepath, euk)

Validate that an input file is a readable GenBank file and check whether it was annotated using Prokka. The function transparently supports compressed files (e.g., .gz, .bz2, .xz, .zst) via xopen.

Validation steps

• Attempts to parse the file as GenBank using Biopython.
• Logs an error and returns None if no GenBank records are found.
• Checks the COMMENT field of each record for a Prokka signature ("Annotated using prokka", case-insensitive).
• If no Prokka annotation is detected, a warning is logged but parsing continues, as the file is still valid GenBank.

Parameters

filepath : str

Path to the GenBank or compressed GenBank file.

euk : flag

whether or not the input is eukaryotic (skips prokka)

Returns

list[SeqRecord] or None A list of Biopython SeqRecord objects if parsing succeeds. Returns None if the file is not valid GenBank or cannot be parsed.

Source code in src/baktfold/utils/validation.py
def check_genbank_and_prokka(filepath, euk):
    """
    Parse an input file as GenBank and, for non-eukaryotic input, check whether
    it carries a Prokka signature. The file is opened through `xopen` in binary
    mode and decoded as UTF-8 (undecodable bytes are replaced).

    Validation steps:
      • All records are parsed with Biopython's GenBank parser.
      • An error is logged and None is returned when no records are found.
      • Unless `euk` is truthy, the "comment" annotation of each record is
        searched for "annotated using prokka" (case-insensitive).
      • When no record carries the signature, two warnings are logged and the
        parsed records are still returned.

    Note that this function registers a log sink at level ERROR that calls
    ``sys.exit(1)``.

    Parameters
    ----------
    filepath : str
        Path to the GenBank or compressed GenBank file.
    euk: flag
        whether or not the input is eukaryotic (skips the Prokka check)

    Returns
    -------
    list[SeqRecord] or None
        The parsed Biopython SeqRecord objects.
        None if no records were found or an exception was raised while parsing.
    """
    import io

    logger.add(lambda _: sys.exit(1), level="ERROR")

    try:
        with xopen(filepath, "rb") as raw_handle:
            # SeqIO.parse needs text, so wrap the binary stream instead of
            # reading the whole file into memory first.
            decoded = io.TextIOWrapper(raw_handle, encoding="utf-8", errors="replace")
            records = list(SeqIO.parse(decoded, "genbank"))

        if not records:
            logger.error(f"Input file {filepath} is not GenBank format. Please check your input")
            return None

        if not euk:
            # any() stops at the first record that carries the signature
            has_prokka_comment = any(
                "annotated using prokka"
                in (rec.annotations.get("comment", "") or "").lower()
                for rec in records
            )
            if not has_prokka_comment:
                logger.warning(f"Input file {filepath} does not appear to come from Prokka.")
                logger.warning(f"Conversion will proceed but no guarantee of success.")

    except Exception:
        logger.error(f"There was an error parsing {filepath}. Please check your input")
        return None

    return records

instantiate_dirs(output_dir, force)

Checks and instantiates the output directory.

Parameters:

Name Type Description Default
output_dir Union[str, Path]

Path to the output directory.

required
force bool

Force flag indicating whether to overwrite existing directory.

required

Returns:

Name Type Description
Path Path

Final output directory path.

Source code in src/baktfold/utils/validation.py
def instantiate_dirs(output_dir: Union[str, Path], force: bool) -> Path:
    """
    Checks and instantiates the output directory.

    With ``force`` set, an existing output directory is deleted first. Without
    it, an existing directory is reported through ``logger.error``. The
    directory is then created if it does not exist.

    Note that this function registers a log sink at level ERROR that calls
    ``sys.exit(1)``.

    Parameters:
        output_dir (Union[str, Path]): Path to the output directory.
        force (bool): Force flag indicating whether to overwrite existing directory.

    Returns:
        Path: Final output directory path.
    """

    logger.add(lambda _: sys.exit(1), level="ERROR")
    logger.info(f"Checking the output directory {output_dir}")

    out_path = Path(output_dir)

    if force is True:
        # remove outdir on force
        if out_path.exists():
            logger.info(f"Removing {output_dir} because --force was specified")
            shutil.rmtree(out_path)
        else:
            logger.info(
                "--force was specified even though the output directory does not already exist. Continuing"
            )
    else:
        if out_path.exists():
            logger.error(
                "Output directory already exists and force was not specified. Please specify -f or --force to overwrite the output directory"
            )

    # instantiate outdir
    if not out_path.exists():
        out_path.mkdir(parents=True, exist_ok=True)

    # The signature and docstring promise a Path; previously the function fell
    # off the end and returned None.
    return out_path

validate_outfile(outfile, force)

Checks and instantiates the output file for baktfold convert-prokka

Parameters:

Name Type Description Default
outfile Union[str, Path]

Path to the output file.

required
force bool

Force flag indicating whether to overwrite existing outfile.

required

Returns:

Name Type Description
Path Path

Final output file path.

Source code in src/baktfold/utils/validation.py
def validate_outfile(outfile: Union[str, Path], force: bool) -> Path:
    """
    Checks the output file for baktfold convert-prokka

    With ``force`` set, an existing output file is deleted. Without it, an
    existing file is reported through ``logger.error``.

    Note that this function registers a log sink at level ERROR that calls
    ``sys.exit(1)``.

    Parameters:
        outfile (Union[str, Path]): Path to the output file.
        force (bool): Force flag indicating whether to overwrite existing outfile.

    Returns:
        Path: Final output file path.
        NOTE(review): no return statement is visible in this excerpt - confirm
        against the full function.
    """

    # Checks the output file
    # remove the existing outfile on force
    logger.add(lambda _: sys.exit(1), level="ERROR")
    logger.info(f"Checking the output file {outfile}")
    if force is True:
        if Path(outfile).exists():
            logger.info(f"Removing {outfile} because --force was specified")
            Path(outfile).unlink()
        else:
            logger.info(
                f"--force was specified even though the output file {outfile} does not already exist. Continuing"
            )
    else:
        # without --force an existing output file is an error
        if Path(outfile).exists():
            logger.error(
                f"Output file {outfile} already exists and force was not specified. Please specify -f or --force to overwrite the output file"
            )