Skip to content

Databases

calc_md5_sum(tarball_path, buffer_size=1024 * 1024)

Calculate the MD5 checksum of the given file.

Parameters:

Name Type Description Default
tarball_path Path

The path to the file for which the MD5 checksum needs to be calculated.

required
buffer_size int

The buffer size for reading the file.

1024 * 1024

Returns:

Name Type Description
str str

The MD5 checksum of the file.

Source code in src/baktfold/databases/db.py
def calc_md5_sum(tarball_path: Path, buffer_size: int = 1024 * 1024) -> str:
    """
    Compute the hexadecimal MD5 digest of a file, reading it in chunks.

    Args:
        tarball_path (Path): The file to hash.
        buffer_size (int): Number of bytes requested per read call.

    Returns:
        str: The hex-encoded MD5 digest of the file contents.
    """
    digest = hashlib.md5()
    with tarball_path.open("rb") as handle:
        # iter() with a sentinel keeps calling read() until it returns b"" (EOF)
        for chunk in iter(lambda: handle.read(buffer_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

check_db_installation(db_dir, foldseek_gpu)

Check if the baktfold database is installed.

Parameters:

Name Type Description Default
db_dir Path

The directory where the database is installed.

required
foldseek_gpu bool

Whether to install foldseek-gpu compatible baktfold db

required

Returns:

Name Type Description
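tuple tuple[bool, bool]

(downloaded_flag, gpu_flag): whether all core database files are present, and whether all Foldseek-GPU database files are present (always True when foldseek_gpu is False).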

Source code in src/baktfold/databases/db.py
def check_db_installation(db_dir: Path, foldseek_gpu: bool) -> "tuple[bool, bool]":
    """
    Check which parts of the baktfold database are installed.

    Args:
        db_dir (Path): The directory where the database is installed.
        foldseek_gpu (bool): Whether to also check for the Foldseek-GPU
            compatible database files.

    Returns:
        tuple[bool, bool]: ``(downloaded_flag, gpu_flag)``.
        ``downloaded_flag`` is True if every file in ``BAKTFOLD_DB_NAMES``
        exists in ``db_dir``. ``gpu_flag`` is True if ``foldseek_gpu`` is
        False, or if every file in ``baktfold_DB_FOLDSEEK_GPU_NAMES`` exists
        in ``db_dir``.
    """
    db_dir = Path(db_dir)

    downloaded_flag = True
    for file_name in BAKTFOLD_DB_NAMES:
        path = db_dir / file_name
        if not path.is_file():
            # only the first missing file is reported
            logger.warning(f"baktfold Database file {path} is missing")
            downloaded_flag = False
            break

    # the GPU files are only inspected when requested; otherwise the flag stays True
    gpu_flag = True
    if foldseek_gpu:
        for file_name in baktfold_DB_FOLDSEEK_GPU_NAMES:
            path = db_dir / file_name
            if not path.is_file():
                logger.warning(f"baktfold Foldseek-GPU Database file {path} is missing")
                gpu_flag = False
                break

    return downloaded_flag, gpu_flag

check_prostT5_download(model_dir, model_name)

Check whether the ProstT5 model files need to be downloaded.

Args: model_dir (Path): Directory where the model and tokenizer are stored. model_name (str): Name of the pre-trained T5 model.

Returns:

Name Type Description
bool bool

bool to tell baktfold whether to download ProstT5

Source code in src/baktfold/databases/db.py
def check_prostT5_download(model_dir: Path, model_name: str) -> bool:
    """
    Decide whether the ProstT5 model needs to be (re-)downloaded.

    Every file listed in ``PROSTT5_MD5_DICTIONARY`` is looked up under
    ``model_dir/models--Rostlab--ProstT5_fp16/<key>/<nested_key>`` and its MD5
    checksum is compared with the expected value.

    Args:
        model_dir (Path): Directory where the model and tokenizer are stored.
        model_name (str): Name of the pre-trained T5 model.

    Returns:
        bool: True if any expected file is missing or has a wrong checksum
        (i.e. ProstT5 should be downloaded), False otherwise.

    Raises:
        ValueError: If ``model_name`` is not ``"Rostlab/ProstT5_fp16"``, the
            only model for which checksums are known here.
    """
    # Previously an unknown model name fell through with the lookup variables
    # unbound and crashed with UnboundLocalError; fail with a clear message.
    if model_name != "Rostlab/ProstT5_fp16":
        raise ValueError(
            f"Unsupported ProstT5 model name '{model_name}': expected 'Rostlab/ProstT5_fp16'"
        )

    model_sub_dir = "models--Rostlab--ProstT5_fp16"
    expected_md5s = PROSTT5_MD5_DICTIONARY

    # assume the model has already been downloaded until a file proves otherwise
    download = False

    for key, files in expected_md5s.items():
        for nested_key, expected_md5 in files.items():
            file_path = Path(model_dir) / model_sub_dir / key / nested_key

            if not file_path.exists():
                logger.warning(f"Model file {file_path} does not exist.")
                download = True
                continue

            md5_sum = calc_md5_sum(file_path)
            if md5_sum != expected_md5:
                logger.warning(
                    f"Corrupt model file {file_path}! MD5 should be '{expected_md5}' but is '{md5_sum}'"
                )
                download = True

    return download

download(tarball_path, cache_dir)

Download the baktfold database tarball from the Hugging Face Hub.

Parameters:

Name Type Description Default
tarball_path Path

The path where the downloaded tarball should be saved.

required
Source code in src/baktfold/databases/db.py
def download(tarball_path: Path, cache_dir: Path) -> None:
    """
    Fetch the baktfold database tarball from the Hugging Face Hub.

    The file ``baktfold_db.tar.gz`` is requested from the
    ``gbouras13/baktfold-db`` dataset repository with ``cache_dir`` as the
    download cache, and is then moved to ``tarball_path``.

    Args:
        tarball_path (Path): The path where the downloaded tarball should be saved.
        cache_dir (Path): Directory handed to ``hf_hub_download`` as its cache directory.
    """
    cached_file = hf_hub_download(
        repo_id="gbouras13/baktfold-db",
        repo_type="dataset",
        filename="baktfold_db.tar.gz",
        cache_dir=str(cache_dir),
    )

    # the path returned for the cache entry may be a symlink: resolve it so the
    # actual file (not the link) is what gets moved
    resolved_source = Path(cached_file).resolve()

    destination_dir = tarball_path.parent
    destination_dir.mkdir(parents=True, exist_ok=True)
    shutil.move(resolved_source, tarball_path)

    logger.info(f"Tarball saved to {tarball_path}")

download_requests(db_url, tarball_path)

Downloads a file from a given URL using the requests library.

Parameters:

Name Type Description Default
db_url str

The URL of the file to download.

required
tarball_path Path

The path to save the downloaded file.

required

Returns:

Type Description

None

Examples:

>>> download_requests("https://zenodo.org/records/17347516/files/baktfold_db.tar.gz", Path("baktfold_db.tar.gz"))
Source code in src/baktfold/databases/db.py
def download_requests(db_url: str, tarball_path: Path):
    """
    Downloads a file from a given URL using the requests library.

    The response is streamed to ``tarball_path`` in 1 MiB chunks while a
    progress bar is shown. Failures are logged via ``logger.error`` rather
    than raised.

    Args:
      db_url (str): The URL of the file to download.
      tarball_path (Path): The path to save the downloaded file.

    Returns:
      None

    Examples:
      >>> download_requests("https://zenodo.org/records/17347516/files/baktfold_db.tar.gz", Path("baktfold_db.tar.gz"))
    """

    headers = {
        "User-Agent": f"baktfold/{CURRENT_DB_VERSION} (contact: george.bouras@adelaide.edu.au)"
    }

    try:
        with tarball_path.open("wb") as fh_out, requests.get(
            db_url, stream=True, headers=headers
        ) as resp:
            # treat 4xx/5xx as a failure instead of saving the error page as the tarball
            resp.raise_for_status()

            content_length = resp.headers.get("content-length")
            # without a content-length header the bar runs with an unknown total
            total_length = int(content_length) if content_length is not None else None

            with alive_bar(total=total_length, scale="SI") as bar:
                for data in resp.iter_content(chunk_size=1024 * 1024):
                    fh_out.write(data)
                    bar(count=len(data))
    except Exception as err:
        # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate
        logger.error(
            f"ERROR: Could not download file from Zenodo! url={db_url}, path={tarball_path}, error={err}"
        )

download_zenodo_prostT5(model_dir, logdir, threads)

Download the ProstT5 model from Zenodo

Parameters:

Name Type Description Default
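model_dir Path

The directory into which the ProstT5 model backup tarball is downloaded and extracted.

required
logdir Path

Log directory (not used by this function).

required
threads int

Number of threads (not used by this function).

required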
Source code in src/baktfold/databases/db.py
def download_zenodo_prostT5(model_dir, logdir, threads):
    """
    Download the ProstT5 model backup tarball and extract it into ``model_dir``.

    The URL, expected MD5 and tarball name are read from
    ``VERSION_DICTIONARY[CURRENT_DB_VERSION]``. The tarball is downloaded with
    ``download_requests``, its checksum is compared with the expected value,
    it is extracted into ``model_dir`` and finally deleted.

    Args:
        model_dir: Directory in which the tarball is saved and extracted.
        logdir: Not used by this function.
        threads: Not used by this function.
    """

    version_info = VERSION_DICTIONARY[CURRENT_DB_VERSION]
    db_url = version_info["prostt5_backup_url"]
    requiredmd5 = version_info["prostt5_backup_md5"]

    logger.info(f"Downloading ProstT5 model backup from {db_url}")

    tarball_path = Path(model_dir) / version_info["prostt5_backup_tarball"]
    download_requests(db_url, tarball_path)

    md5_sum = calc_md5_sum(tarball_path)

    if md5_sum == requiredmd5:
        logger.info(f"ProstT5 model backup file download OK: {md5_sum}")
    else:
        logger.error(
            f"Error: corrupt file! MD5 should be '{requiredmd5}' but is '{md5_sum}'"
        )

    logger.info(
        f"Extracting ProstT5 model backup tarball: file={tarball_path}, output={model_dir}"
    )

    try:
        # NOTE(review): extractall() is called without an extraction filter, so
        # member paths in the archive are trusted - consider tarfile's
        # extraction filters where the supported Python versions allow it.
        with tarball_path.open("rb") as fh_in, tarfile.open(
            fileobj=fh_in, mode="r:gz"
        ) as tar_file:
            tar_file.extractall(path=str(model_dir))

    except OSError as err:
        # log the caught exception itself (the old code formatted the OSError class)
        logger.warning(f"Encountered OSError: {err}")
        logger.error(f"Could not extract {tarball_path} to {model_dir}")

    tarball_path.unlink()

foldseek_makepaddedseqdb(db_dir)

Runs the Foldseek makepaddedseqdb command on a given database directory.

Parameters:

Name Type Description Default
db_dir Path

The path to the database directory.

required

Returns:

Type Description
None

None

Examples:

>>> foldseek_makepaddedseqdb(Path("baktfold_db"))
Source code in src/baktfold/databases/db.py
def foldseek_makepaddedseqdb(db_dir: Path) -> None:
    """
    Run ``foldseek makepaddedseqdb`` for each database in a database directory.

    For every database name (AFDBClusters, pdb, cath, swissprot) the command
    is given ``<db_dir>/<name>`` and ``<db_dir>/<name>_gpu`` as its two
    arguments, with ``<db_dir>/logdir`` passed as the log directory.

    Args:
      db_dir (Path): The path to the database directory.

    Returns:
      None

    Examples:
      >>> foldseek_makepaddedseqdb(Path("baktfold_db"))
    """

    base_dir = Path(db_dir)
    logdir = base_dir / "logdir"

    for db_name in ("AFDBClusters", "pdb", "cath", "swissprot"):
        source_db = base_dir / db_name
        padded_db = base_dir / f"{db_name}_gpu"

        # one tool invocation per database
        padding_tool = ExternalTool(
            tool="foldseek",
            input="",
            output="",
            params=f"makepaddedseqdb {source_db} {padded_db}",
            logdir=logdir,
        )
        ExternalTool.run_tool(padding_tool)

install_database(db_dir, foldseek_gpu, threads)

Install the baktfold database.

Parameters:

Name Type Description Default
db_dir Path

The directory where the database should be installed.

required
foldseek_gpu bool

Whether to install foldseek-gpu compatible baktfold db

required
threads int

Number of threads available (makes downloading faster)

required
Source code in src/baktfold/databases/db.py
def install_database(db_dir: Path, foldseek_gpu: bool, threads: int) -> None:
    """
    Install the baktfold database.

    If any core database file is missing, the database tarball is downloaded
    (Hugging Face first, falling back to ``download_requests`` with the
    configured ``db_url``), its MD5 checksum is compared with the expected
    values, and it is extracted into ``db_dir`` and then deleted. If
    ``foldseek_gpu`` is set and the Foldseek-GPU files are missing, they are
    created with ``foldseek_makepaddedseqdb``.

    Args:
        db_dir (Path): The directory where the database should be installed.
        foldseek_gpu (bool): Whether to install foldseek-gpu compatible baktfold db
        threads (int): Number of threads available (not used by this function).
    """

    # check the database is installed
    logger.info(f"Checking baktfold database installation in {db_dir}.")
    downloaded_flag, gpu_flag = check_db_installation(db_dir, foldseek_gpu)
    if downloaded_flag:
        logger.info("All baktfold databases files are present")
    else:
        logger.info("Some baktfold databases files are missing")

        DICT = VERSION_DICTIONARY
        version_info = DICT[CURRENT_DB_VERSION]
        db_url = version_info["db_url"]
        logger.info("Downloading baktfold DB")

        requiredmd5s = version_info["md5"]
        tarball_path = Path(db_dir) / version_info["tarball"]

        # make sure the target directory exists so the requests fallback can
        # open the tarball for writing even if the first attempt created nothing
        Path(db_dir).mkdir(parents=True, exist_ok=True)

        try:
            logger.info("Downloading from HuggingFace")
            download(tarball_path, db_dir)
        except Exception as err:
            # `Exception` (not a bare except) so Ctrl-C aborts the install
            # instead of silently starting the fallback download
            logger.warning(
                f"Could not download file from HuggingFace: path={tarball_path}, error={err}"
            )
            logger.warning("Trying now with requests")
            download_requests(db_url, tarball_path)

        md5_sum = calc_md5_sum(tarball_path)

        if md5_sum in requiredmd5s:
            logger.info(f"baktfold database file download OK: {md5_sum}")
        else:
            logger.error(
                f"Error: corrupt database file! MD5 should be '{requiredmd5s}' but is '{md5_sum}'"
            )

        logger.info(
            f"Extracting baktfold database tarball: file={tarball_path}, output={db_dir}"
        )
        untar(tarball_path, db_dir, DICT)
        tarball_path.unlink()

    if foldseek_gpu:
        if gpu_flag:
            logger.info("All baktfold database files compatible with Foldseek-GPU are present")
        else:
            logger.info("Some baktfold database files compatible with Foldseek-GPU are missing")
            logger.info("Creating them")
            foldseek_makepaddedseqdb(db_dir)

    logger.info("Database download and processing complete")

untar(tarball_path, output_path, DICT)

Extract the tarball to the output path.

Parameters:

Name Type Description Default
tarball_path Path

The path to the tarball file.

required
output_path Path

The path where the contents of the tarball should be extracted.

required
DICT dict

version dictionary

required
Source code in src/baktfold/databases/db.py
def untar(tarball_path: Path, output_path: Path, DICT: dict) -> None:
    """
    Extract the tarball to the output path and flatten its top-level directory.

    After extraction, every regular file found directly inside
    ``output_path / DICT[CURRENT_DB_VERSION]["dir_name"]`` is moved up into
    ``output_path`` and that directory is removed with ``remove_directory``.
    An ``OSError`` during any of these steps is logged, not raised.

    Args:
        tarball_path (Path): The path to the tarball file.
        output_path (Path): The path where the contents of the tarball should be extracted.
        DICT (dict): version dictionary
    """
    # accept str as well as Path, consistently for every use below
    output_dir = Path(output_path)

    try:
        with tarball_path.open("rb") as fh_in, tarfile.open(
            fileobj=fh_in, mode="r:gz"
        ) as tar_file:
            tar_file.extractall(path=str(output_dir))

        tarpath = output_dir / DICT[CURRENT_DB_VERSION]["dir_name"]

        # only regular files directly inside the extracted directory are moved
        files_to_move = [f for f in tarpath.iterdir() if f.is_file()]

        for file_path in files_to_move:
            shutil.move(file_path, output_dir / file_path.name)

        # remove the directory
        remove_directory(tarpath)

    except OSError as err:
        # log the caught exception itself (the old code formatted the OSError class)
        logger.warning(f"Encountered OSError: {err}")
        logger.error(f"Could not extract {tarball_path} to {output_path}")

validate_db(database, default_dir, foldseek_gpu)

Validates the baktfold database is installed.

Parameters:

Name Type Description Default
database str

The directory where the database is installed.

required
default_dir str

Default DB location

required
foldseek_gpu bool

Whether to install foldseek-gpu compatible baktfold db

required

Returns:

Name Type Description
Path Path

The database directory that was checked: database if it was given, otherwise default_dir.

Source code in src/baktfold/databases/db.py
def validate_db(database: str, default_dir: str, foldseek_gpu: bool) -> Path:
    """
    Validates the baktfold database is installed.

    Missing files are reported through ``logger.error`` together with the
    ``baktfold install`` command to run.

    Args:
        database (str): The directory where the database is installed, or
            None to use ``default_dir``.
        default_dir (str): Default DB location
        foldseek_gpu (bool): Whether to also check for the foldseek-gpu
            compatible baktfold db

    Returns:
        Path: The database directory that was checked (``database`` if given,
        otherwise ``default_dir``).
    """
    # set default DB if not specified
    db_path = Path(database) if database is not None else Path(default_dir)

    # check the database is installed
    logger.info(f"Checking baktfold database installation in {db_path}")
    downloaded_flag, gpu_flag = check_db_installation(db_path, foldseek_gpu)

    if downloaded_flag:
        logger.info("All baktfold databases files are present")
    elif db_path == Path(default_dir):  # default
        logger.error(
            "baktfold database not found. Please run baktfold install to download and install the baktfold database"
        )
    else:  # specific
        logger.error(
            f"baktfold database not found. Please run baktfold install -d {db_path} to download and install the baktfold database"
        )

    if foldseek_gpu:
        if gpu_flag:
            logger.info("All baktfold database files compatible with Foldseek-GPU are present")
        else:
            logger.error(
                f"baktfold database files compatible with Foldseek-GPU not found. Please run baktfold install -d {db_path} --foldseek-gpu"
            )

    return db_path