Skip to content

Index

calc_md5_sum(tarball_path, buffer_size=1024 * 1024)

Calculate the MD5 checksum of the given file.

Parameters:

Name Type Description Default
tarball_path Path

The path to the file for which the MD5 checksum needs to be calculated.

required
buffer_size int

The buffer size for reading the file.

1024 * 1024

Returns:

Name Type Description
str str

The MD5 checksum of the file.

Source code in src/baktfold/databases/db.py
def calc_md5_sum(tarball_path: Path, buffer_size: int = 1024 * 1024) -> str:
    """
    Compute the hexadecimal MD5 digest of a file, reading it in chunks.

    Args:
        tarball_path (Path): File whose contents are hashed.
        buffer_size (int): Number of bytes requested per read.

    Returns:
        str: Hex-encoded MD5 digest of the file contents.
    """

    digest = hashlib.md5()
    with tarball_path.open("rb") as handle:
        # read() returns b"" at end of file, which terminates the iterator
        for chunk in iter(lambda: handle.read(buffer_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

check_db_installation(db_dir, foldseek_gpu)

Check if the baktfold database is installed.

Parameters:

Name Type Description Default
db_dir Path

The directory where the database is installed.

required
foldseek_gpu bool

Whether to install foldseek-gpu compatible baktfold db

required

Returns:

Name Type Description
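tuple Tuple[bool, bool]

A pair (downloaded_flag, gpu_flag): downloaded_flag is True if all required database files are present; gpu_flag is True if all Foldseek-GPU database files are present (it is always True when foldseek_gpu is False, because the check is skipped).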

Source code in src/baktfold/databases/db.py
def check_db_installation(db_dir: Path, foldseek_gpu: bool) -> "tuple[bool, bool]":
    """
    Check if the baktfold database is installed.

    Args:
        db_dir (Path): The directory where the database is installed.
        foldseek_gpu (bool): Whether to also check for the Foldseek-GPU
            compatible database files.

    Returns:
        tuple[bool, bool]: ``(downloaded_flag, gpu_flag)``.
            ``downloaded_flag`` is True if every file in BAKTFOLD_DB_NAMES
            exists in ``db_dir``. ``gpu_flag`` is True if every Foldseek-GPU
            file exists; it stays True when ``foldseek_gpu`` is False because
            the GPU files are not checked in that case.
    """
    # The annotation is quoted so it is never evaluated at runtime and works
    # on interpreters that predate subscriptable builtin generics.
    downloaded_flag = True
    for file_name in BAKTFOLD_DB_NAMES:
        path = Path(db_dir) / file_name
        if not path.is_file():
            logger.warning(f"baktfold Database file {path} is missing")
            downloaded_flag = False
            # only the first missing file is reported
            break

    gpu_flag = True
    if foldseek_gpu:
        # NOTE(review): the lower-case prefix differs from BAKTFOLD_DB_NAMES -
        # confirm the constant is really defined under this exact name.
        for file_name in baktfold_DB_FOLDSEEK_GPU_NAMES:
            path = Path(db_dir) / file_name
            if not path.is_file():
                logger.warning(f"baktfold Foldseek-GPU Database file {path} is missing")
                gpu_flag = False
                break

    return downloaded_flag, gpu_flag

check_prostT5_download(model_dir, model_name)

Args: model_dir (Path): Directory where the model and tokenizer are stored. model_name (str): Name of the pre-trained T5 model.

Returns:

Name Type Description
bool bool

bool to tell baktfold whether to download ProstT5

Source code in src/baktfold/databases/db.py
def check_prostT5_download(model_dir: Path, model_name: str) -> bool:
    """
    Decide whether the ProstT5 model files need to be (re-)downloaded.

    Every file listed in PROSTT5_MD5_DICTIONARY is looked up under
    ``{model_dir}/models--Rostlab--ProstT5_fp16/{key}/{nested_key}`` and its
    MD5 checksum is compared with the expected value.

    Args:
        model_dir (Path): Directory where the model and tokenizer are stored.
        model_name (str): Name of the pre-trained T5 model. Only
            "Rostlab/ProstT5_fp16" has a checksum table.

    Returns:
        bool: True if any expected file is missing or has the wrong checksum
        (i.e. ProstT5 should be downloaded), False otherwise.

    Raises:
        ValueError: If ``model_name`` is not a supported model. Previously
            this case crashed with an UnboundLocalError.
    """

    if model_name != "Rostlab/ProstT5_fp16":
        raise ValueError(
            f"Unsupported ProstT5 model '{model_name}': only 'Rostlab/ProstT5_fp16' is supported"
        )

    model_sub_dir = "models--Rostlab--ProstT5_fp16"
    expected_md5s = PROSTT5_MD5_DICTIONARY

    # assumes already has been downloaded
    download = False

    for key, files in expected_md5s.items():
        for nested_key, expected_md5 in files.items():
            file_path = Path(f"{model_dir}/{model_sub_dir}/{key}/{nested_key}")

            if not file_path.exists():
                logger.warning(f"Model file {file_path} does not exist.")
                download = True
                continue

            md5_sum = calc_md5_sum(file_path)
            if md5_sum != expected_md5:
                logger.warning(
                    f"Corrupt model file {file_path}! MD5 should be '{expected_md5}' but is '{md5_sum}'"
                )
                download = True

    return download

download(tarball_path, cache_dir)

Download the database tarball from the Hugging Face Hub (dataset repository gbouras13/baktfold-db) and move it to tarball_path.

Parameters:

Name Type Description Default
tarball_path Path

The path where the downloaded tarball should be saved.

required
Source code in src/baktfold/databases/db.py
def download(tarball_path: Path, cache_dir: Path) -> None:
    """
    Fetch ``baktfold_db.tar.gz`` from the Hugging Face Hub and move it to ``tarball_path``.

    Args:
        tarball_path (Path): Final location of the downloaded tarball. Its
            parent directory is created if it does not exist.
        cache_dir (Path): Directory passed to ``hf_hub_download`` as its cache.
    """

    cached_file = hf_hub_download(
        repo_id="gbouras13/baktfold-db",
        repo_type="dataset",
        filename="baktfold_db.tar.gz",
        cache_dir=f"{cache_dir}",
    )

    # the path returned from the cache may be a symlink, so resolve it and
    # move the actual file rather than the link
    source = Path(cached_file).resolve()

    destination_dir = tarball_path.parent
    destination_dir.mkdir(parents=True, exist_ok=True)
    shutil.move(source, tarball_path)

    logger.info(f"Tarball saved to {tarball_path}")

download_requests(db_url, tarball_path)

Downloads a file from a given URL using the requests library.

Parameters:

Name Type Description Default
db_url str

The URL of the file to download.

required
tarball_path Path

The path to save the downloaded file.

required

Returns:

Type Description

None

Examples:

>>> download_requests("https://zenodo.org/records/17347516/files/baktfold_db.tar.gz", Path("baktfold_db.tar.gz"))
Source code in src/baktfold/databases/db.py
def download_requests(db_url: str, tarball_path: Path) -> None:
    """
    Downloads a file from a given URL using the requests library.

    The response is streamed to ``tarball_path`` in 1 MiB chunks while a
    progress bar is shown. Any failure is logged, not re-raised.

    Args:
      db_url (str): The URL of the file to download.
      tarball_path (Path): The path to save the downloaded file.

    Returns:
      None

    Examples:
      >>> download_requests("https://zenodo.org/records/17347516/files/baktfold_db.tar.gz", Path("baktfold_db.tar.gz"))
    """

    headers = {
        "User-Agent": f"baktfold/{CURRENT_DB_VERSION} (contact: george.bouras@adelaide.edu.au)"
    }

    try:
        with tarball_path.open("wb") as fh_out, requests.get(
            db_url, stream=True, headers=headers
        ) as resp:
            # fail on 4xx/5xx instead of saving the error page as the tarball
            resp.raise_for_status()
            total_length = resp.headers.get("content-length")
            if total_length is not None:  # content length header is set
                total_length = int(total_length)
            with alive_bar(total=total_length, scale="SI") as bar:
                for data in resp.iter_content(chunk_size=1024 * 1024):
                    fh_out.write(data)
                    bar(count=len(data))
    except Exception as err:
        # `Exception` rather than a bare except, so KeyboardInterrupt and
        # SystemExit are no longer swallowed. The cause is logged first so it
        # is visible even if the error-level handler terminates the program.
        logger.warning(f"Download failed: {err}")
        logger.error(
            f"ERROR: Could not download file from Zenodo! url={db_url}, path={tarball_path}"
        )

download_zenodo_prostT5(model_dir, logdir, threads)

Download the ProstT5 model from Zenodo

Parameters:

Name Type Description Default
model_dir

Directory into which the ProstT5 backup tarball is downloaded and extracted.

required
logdir

Accepted but not used by the function body.

required
threads

Accepted but not used by the function body.

required
Source code in src/baktfold/databases/db.py
def download_zenodo_prostT5(model_dir, logdir, threads):
    """
    Download the ProstT5 model backup tarball and extract it into ``model_dir``.

    The URL, expected MD5 and tarball file name are read from
    ``VERSION_DICTIONARY[CURRENT_DB_VERSION]``. The tarball is deleted after
    extraction has been attempted.

    Args:
        model_dir: Directory the tarball is downloaded to and extracted into.
        logdir: Accepted for interface compatibility; not used here.
        threads: Accepted for interface compatibility; not used here.
    """

    version_info = VERSION_DICTIONARY[CURRENT_DB_VERSION]
    db_url = version_info["prostt5_backup_url"]
    requiredmd5 = version_info["prostt5_backup_md5"]

    logger.info(f"Downloading ProstT5 model backup from {db_url}")

    tarball = version_info["prostt5_backup_tarball"]
    tarball_path = Path(f"{model_dir}/{tarball}")
    download_requests(db_url, tarball_path)

    md5_sum = calc_md5_sum(tarball_path)

    if md5_sum == requiredmd5:
        logger.info(f"ProstT5 model backup file download OK: {md5_sum}")
    else:
        logger.error(
            f"Error: corrupt file! MD5 should be '{requiredmd5}' but is '{md5_sum}'"
        )

    logger.info(
        f"Extracting ProstT5 model backup tarball: file={tarball_path}, output={model_dir}"
    )

    try:
        # NOTE(review): extractall() applies no member filtering; safe only
        # if the archive is trusted. Confirm that logger.error above stops
        # execution on a checksum mismatch.
        with tarball_path.open("rb") as fh_in, tarfile.open(
            fileobj=fh_in, mode="r:gz"
        ) as tar_file:
            tar_file.extractall(path=str(model_dir))

    except (OSError, tarfile.TarError) as err:
        # TarError covers truncated or corrupt archives, which are not
        # OSErrors. Log the caught exception, not the exception class.
        logger.warning("Encountered {}: {}".format(type(err).__name__, err))
        logger.error(f"Could not extract {tarball_path} to {model_dir}")

    tarball_path.unlink()

foldseek_makepaddedseqdb(db_dir)

Runs the Foldseek makepaddedseqdb command on a given database directory.

Parameters:

Name Type Description Default
db_dir Path

The path to the database directory.

required

Returns:

Type Description
None

None

Examples:

>>> foldseek_makepaddedseqdb(Path("baktfold_db"))
Source code in src/baktfold/databases/db.py
def foldseek_makepaddedseqdb(db_dir: Path) -> None:
    """
    Run ``foldseek makepaddedseqdb`` for each sub-database in ``db_dir``.

    For every name in AFDBClusters, pdb, cath and swissprot, the command
    reads ``<db_dir>/<name>`` and writes ``<db_dir>/<name>_gpu``. Logs go to
    ``<db_dir>/logdir``.

    Args:
      db_dir (Path): The path to the database directory.

    Returns:
      None

    Examples:
      >>> foldseek_makepaddedseqdb(Path("baktfold_db"))
    """

    base_dir = Path(db_dir)
    logdir = base_dir / "logdir"

    for db_name in ("AFDBClusters", "pdb", "cath", "swissprot"):
        source_db = base_dir / db_name
        padded_db = base_dir / f"{db_name}_gpu"

        # named `tool` so the local does not shadow this function's own name
        tool = ExternalTool(
            tool="foldseek",
            input="",
            output="",
            params=f"makepaddedseqdb {source_db} {padded_db}",
            logdir=logdir,
        )

        ExternalTool.run_tool(tool)

install_database(db_dir, foldseek_gpu, threads)

Install the baktfold database.

Parameters:

Name Type Description Default
db_dir Path

The directory where the database should be installed.

required
foldseek_gpu bool

Whether to install foldseek-gpu compatible baktfold db

required
threads int

Number of threads available (makes downloading faster)

required
Source code in src/baktfold/databases/db.py
def install_database(db_dir: Path, foldseek_gpu: bool, threads: int) -> None:
    """
    Install the baktfold database.

    If any required database file is missing, the tarball is downloaded
    (HuggingFace first, then the ``db_url`` fallback via requests), its MD5
    is checked, and it is extracted into ``db_dir`` and deleted. If
    ``foldseek_gpu`` is set and GPU files are missing, they are created with
    ``foldseek_makepaddedseqdb``.

    Args:
        db_dir (Path): The directory where the database should be installed.
        foldseek_gpu (bool): Whether to install foldseek-gpu compatible baktfold db
        threads (int): Accepted for interface compatibility; not used here.
    """

    # check the database is installed
    logger.info(f"Checking baktfold database installation in {db_dir}.")
    downloaded_flag, gpu_flag = check_db_installation(db_dir, foldseek_gpu)
    if downloaded_flag:
        logger.info("All baktfold databases files are present")
    else:
        logger.info("Some baktfold databases files are missing")

        DICT = VERSION_DICTIONARY
        db_url = DICT[CURRENT_DB_VERSION]["db_url"]
        logger.info("Downloading baktfold DB")

        requiredmd5s = DICT[CURRENT_DB_VERSION]["md5"]
        tarball = DICT[CURRENT_DB_VERSION]["tarball"]

        tarball_path = Path(f"{db_dir}/{tarball}")

        # the requests fallback opens tarball_path directly, so the target
        # directory must exist even if the HuggingFace attempt never made it
        Path(db_dir).mkdir(parents=True, exist_ok=True)

        try:
            logger.info("Downloading from HuggingFace")
            download(tarball_path, db_dir)
        except Exception as err:
            # `Exception` rather than a bare except, so Ctrl-C / SystemExit
            # abort the install instead of starting the fallback download
            logger.warning(
                f"Could not download file from HuggingFace: path={tarball_path}"
            )
            logger.warning(f"Reason: {err}")
            logger.warning("Trying now with requests")
            download_requests(db_url, tarball_path)

        md5_sum = calc_md5_sum(tarball_path)

        # NOTE(review): `in` assumes requiredmd5s is a collection of accepted
        # checksums; on a plain str this is a substring test - confirm.
        if md5_sum in requiredmd5s:
            logger.info(f"baktfold database file download OK: {md5_sum}")
        else:
            logger.error(
                f"Error: corrupt database file! MD5 should be '{requiredmd5s}' but is '{md5_sum}'"
            )

        logger.info(
            f"Extracting baktfold database tarball: file={tarball_path}, output={db_dir}"
        )
        untar(tarball_path, db_dir, DICT)
        tarball_path.unlink()

    if foldseek_gpu:
        if gpu_flag:
            logger.info("All baktfold database files compatible with Foldseek-GPU are present")
        else:
            logger.info("Some baktfold database files compatible with Foldseek-GPU are missing")
            logger.info("Creating them")
            foldseek_makepaddedseqdb(db_dir)

    logger.info("Database download and processing complete")

untar(tarball_path, output_path, DICT)

Extract the tarball to the output path.

Parameters:

Name Type Description Default
tarball_path Path

The path to the tarball file.

required
output_path Path

The path where the contents of the tarball should be extracted.

required
DICT dict

version dictionary

required
Source code in src/baktfold/databases/db.py
def untar(tarball_path: Path, output_path: Path, DICT: dict) -> None:
    """
    Extract the tarball to the output path and flatten its top-level directory.

    After extraction, every regular file directly inside
    ``output_path / DICT[CURRENT_DB_VERSION]["dir_name"]`` is moved up into
    ``output_path`` and that directory is removed. Extraction and filesystem
    errors are logged, not raised.

    Args:
        tarball_path (Path): The path to the tarball file.
        output_path (Path): The path where the contents of the tarball should be extracted.
        DICT (dict): version dictionary
    """
    # normalise once so a str output_path works for the `/` joins below
    output_dir = Path(output_path)

    try:
        with tarball_path.open("rb") as fh_in, tarfile.open(
            fileobj=fh_in, mode="r:gz"
        ) as tar_file:
            tar_file.extractall(path=str(output_dir))

        tarpath = output_dir / DICT[CURRENT_DB_VERSION]["dir_name"]

        # Get a list of all files in the directory
        files_to_move = [f for f in tarpath.iterdir() if f.is_file()]

        # Move each file to the destination directory
        for file_path in files_to_move:
            shutil.move(file_path, output_dir / file_path.name)

        # remove the directory
        remove_directory(tarpath)

    except (OSError, tarfile.TarError) as err:
        # TarError covers truncated or corrupt archives, which are not
        # OSErrors. Log the caught exception, not the exception class.
        logger.warning("Encountered {}: {}".format(type(err).__name__, err))
        logger.error(f"Could not extract {tarball_path} to {output_path}")

validate_db(database, default_dir, foldseek_gpu)

Validates the baktfold database is installed.

Parameters:

Name Type Description Default
database str

The directory where the database is installed.

required
default_dir str

Default DB location

required
foldseek_gpu bool

Whether to install foldseek-gpu compatible baktfold db

required

Returns:

Name Type Description
Path Path

The database directory that was checked: database if it was given, otherwise default_dir.

Source code in src/baktfold/databases/db.py
def validate_db(database: str, default_dir: str, foldseek_gpu: bool) -> Path:
    """
    Validates the baktfold database is installed.

    Missing files are reported through ``logger.error`` together with the
    ``baktfold install`` command that would fix them.

    Args:
        database (str): The directory where the database is installed, or
            None to use ``default_dir``.
        default_dir (str): Default DB location
        foldseek_gpu (bool): Whether to also check for the foldseek-gpu
            compatible baktfold db files.

    Returns:
        Path: The database directory that was checked (``database`` if given,
        otherwise ``default_dir``).
    """
    default_path = Path(default_dir)

    # set default DB if not specified
    db_path = Path(database) if database is not None else default_path

    # check the database is installed
    logger.info(f"Checking baktfold database installation in {db_path}")
    downloaded_flag, gpu_flag = check_db_installation(db_path, foldseek_gpu)
    if downloaded_flag:
        logger.info("All baktfold databases files are present")
    elif db_path == default_path:  # default
        logger.error(
            "baktfold database not found. Please run baktfold install to download and install the baktfold database"
        )
    else:  # specific
        logger.error(
            f"baktfold database not found. Please run baktfold install -d {db_path} to download and install the baktfold database"
        )

    if foldseek_gpu:
        if gpu_flag:
            logger.info("All baktfold database files compatible with Foldseek-GPU are present")
        else:
            logger.error(
                f"baktfold database files compatible with Foldseek-GPU not found. Please run baktfold install -d {db_path} --foldseek-gpu"
            )

    return db_path

subcommand_predict(hypotheticals, cds_dict, output, prefix, cpu, omit_probs, model_dir, model_name, checkpoint_path, batch_size, save_per_residue_embeddings, save_per_protein_embeddings, threads, mask_threshold, has_duplicate_locus, gpus=None)

Wrapper command for baktfold predict. Predicts embeddings using ProstT5 encoder + CNN prediction head.

Parameters:

Name Type Description Default
hypotheticals Dict[str, any]

feature dict for all Bakta hypothetical proteins

required
cds_dict Dict[str, any]

id:aa dictionary

required
output str

Output directory path.

required
prefix str

Prefix for output file names.

required
cpu bool

Flag indicating whether to use CPU for prediction.

required
omit_probs bool

Flag indicating whether to omit prediction probabilities from ProstT5.

required
model_dir str

Directory containing the ProstT5 model.

required
model_name str

Name of the ProstT5 model.

required
checkpoint_path Path

Path to ProstT5 CNN checkpoint.

required
batch_size int

Batch size for prediction.

required
proteins_flag bool

True if baktfold proteins-predict, false otherwise

required
save_per_residue_embeddings bool

Whether to save per residue embeddings to h5 file. Defaults to False.

required
save_per_protein_embeddings bool

Whether to save mean per protein embeddings to h5 file. Defaults to False.

required

Returns:

Name Type Description
hypotheticals Dict[str, any]

feature dict for all Bakta hypothetical proteins. Updated with ProstT5 3Di strings (unmasked)

Source code in src/baktfold/subcommands/predict.py
def subcommand_predict(
    hypotheticals: dict,
    cds_dict: dict,
    output: Path,
    prefix: str,
    cpu: bool,
    omit_probs: bool,
    model_dir: Path,
    model_name: str,
    checkpoint_path: Path,
    batch_size: int,
    save_per_residue_embeddings: bool,
    save_per_protein_embeddings: bool,
    threads: int,
    mask_threshold: float,
    has_duplicate_locus: bool,
    gpus: Optional[str] = None,
) -> dict:
    """
    Wrapper command for baktfold predict. Predicts embeddings using ProstT5 encoder + CNN prediction head.

    Calls ``get_embeddings`` and then writes ``{prefix}_aa.fasta`` in
    ``output``, containing every sequence of ``cds_dict`` after
    ``mask_low_confidence_aa`` has been applied. Sequences with no usable
    prediction are written fully masked as 'X'.

    Args:
        hypotheticals (Dict[str, any]): feature dict for all Bakta hypothetical proteins
        cds_dict (Dict[str, any]): id:aa dictionary
        output (Path): Output directory path.
        prefix (str): Prefix for output file names.
        cpu (bool): Flag indicating whether to use CPU for prediction. Half precision is disabled when this is True.
        omit_probs (bool): Flag indicating whether to omit prediction probabilities from ProstT5.
        model_dir (Path): Directory containing the ProstT5 model.
        model_name (str): Name of the ProstT5 model.
        checkpoint_path (Path): Path to ProstT5 CNN checkpoint.
        batch_size (int): Batch size for prediction (passed as ``max_batch``).
        save_per_residue_embeddings (bool): Whether to save per residue embeddings to h5 file.
        save_per_protein_embeddings (bool): Whether to save mean per protein embeddings to h5 file.
        threads (int): Forwarded to ``get_embeddings``.
        mask_threshold (float): Confidence threshold as a percentage; divided by 100 before masking.
        has_duplicate_locus (bool): Forwarded to ``get_embeddings``.
        gpus (Optional[str]): Forwarded to ``get_embeddings``.

    Returns:
        hypotheticals (Dict[str, any]): the ``hypotheticals`` object that was
        passed in. This function does not modify it itself; whether
        ``get_embeddings`` updates it in place is not visible here.
    """

    logger.info('Predicting 3Di sequences using ProstT5')

    output_dir = Path(output)
    fasta_aa: Path = output_dir / f"{prefix}_aa.fasta"

    ############
    # prostt5
    ############

    fasta_3di: Path = output_dir / f"{prefix}_3di.fasta"
    # embeddings h5 - will only be generated if flag is true
    output_h5_per_residue: Path = output_dir / f"{prefix}_embeddings_per_residue.h5"
    output_h5_per_protein: Path = output_dir / f"{prefix}_embeddings_per_protein.h5"

    # half precision is used unless cpu is exactly True (identity test kept
    # from the original branch)
    half_precision = cpu is not True
    output_probs = not omit_probs

    prediction_dict = get_embeddings(
        hypotheticals,
        cds_dict,
        output,
        prefix,
        model_dir,
        model_name,
        checkpoint_path,
        fasta_3di,
        output_h5_per_residue,
        output_h5_per_protein,
        half_precision=half_precision,
        max_residues=5000,
        max_seq_len=1000,
        max_batch=batch_size,
        cpu=cpu,
        output_probs=output_probs,
        save_per_residue_embeddings=save_per_residue_embeddings,
        save_per_protein_embeddings=save_per_protein_embeddings,
        threads=threads,
        mask_threshold=mask_threshold,
        has_duplicate_locus=has_duplicate_locus,
        gpus=gpus,
    )

    # mask_threshold is a percentage; the masking helper takes a proportion
    mask_prop_threshold = mask_threshold / 100

    # check all the lengths of the predictions are >0 in case of OOMs and filter out those that arent
    prediction_dict = {k: v for k, v in prediction_dict.items() if len(v[0]) > 0}

    ########
    ## write the AA CDS to file
    ######

    with open(fasta_aa, "w+") as out_f:
        for cds_id, prot_seq in cds_dict.items():
            out_f.write(f">{cds_id}\n")

            try:
                # element [2] holds the ProstT5 confidence scores from 0-1,
                # converted to a list for masking. This fails if ProstT5
                # failed for this CDS (OOM or some other reason).
                prot_seq = mask_low_confidence_aa(
                    prot_seq,
                    prediction_dict[cds_id][2].tolist(),
                    threshold=mask_prop_threshold,
                )
            except (KeyError, IndexError):
                # in that case, just return 'X' aka masked proteins
                prot_seq = "X" * len(prot_seq)

            out_f.write(f"{prot_seq}\n")

    return hypotheticals

subcommand_compare(hypotheticals, output, threads, evalue, sensitivity, database, prefix, predictions_dir, structures, structure_dir, logdir, proteins_flag, max_seqs, ultra_sensitive, extra_foldseek_params, custom_db, foldseek_gpu, custom_annotations, has_duplicate_locus, fast, gpus=None)

Compare 3Di or PDB structures to the baktfold DB

Parameters:

Name Type Description Default
hypotheticals Dict

hypothetical features dictionary

required
output Path

Path to the output directory.

required
threads int

Number of threads to use.

required
evalue float

E-value threshold.

required
card_vfdb_evalue float

E-value threshold for CARD and VFDB databases.

required
sensitivity float

Sensitivity threshold.

required
database Path

Path to the reference database.

required
prefix str

Prefix for output files.

required
predictions_dir Optional[Path]

Path to the directory containing predictions.

required
structures bool

Flag indicating whether structures files are used.

required
structure_dir Optional[Path]

Path to the directory containing structures (.pdb or .cif) files.

required
logdir Path

Path to the directory for log files.

required
proteins_flag bool

Flag indicating whether proteins are used.

required
max_seqs int

Maximum results per query sequence allowed to pass the prefilter for foldseek.

required
ultra_sensitive bool

Whether to skip foldseek prefilter for maximum sensitivity

required
extra_foldseek_params str

Extra foldseek search parameters

required
custom_db str

Custom foldseek database

required
foldseek_gpu bool

Use Foldseek-GPU acceleration and ungappedprefilter

required
custom_annotations Optional[Path]

Path to the tsv containing the custom_db annotations, 2 columns

required
has_duplicate_locus bool

If same locus tag has multiple annots (can happen in some euks)

required
fast bool

If true, skips AFDB search

required

Returns:

Name Type Description
bool bool

True if sub-databases are created successfully, False otherwise.

Source code in src/baktfold/subcommands/compare.py
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
def subcommand_compare(
    hypotheticals: Dict,
    output: Path,
    threads: int,
    evalue: float,
    sensitivity: float,
    database: Path,
    prefix: str,
    predictions_dir: Optional[Path],
    structures: bool,
    structure_dir: Optional[Path],
    logdir: Path,
    proteins_flag: bool,
    max_seqs: int,
    ultra_sensitive: bool,
    extra_foldseek_params: str,
    custom_db: str,
    foldseek_gpu: bool,
    custom_annotations: Optional[Path],
    has_duplicate_locus: bool,
    fast: bool,
    gpus: Optional[str] = None,
) -> bool:
    """
    Compare 3Di or PDB structures to the baktfold DB

    Runs Foldseek searches of the query proteins against the swissprot,
    AFDBClusters (unless ``fast``), pdb and cath databases found under
    ``database`` (and optionally against ``custom_db``), writes the per-database
    tophit tables into ``output`` and attaches the hits to the input features.

    Parameters:
        hypotheticals (Dict): hypothetical features. Annotated as Dict, but it is iterated
            as a sequence of feature dictionaries that are read via the keys 'id', 'locus' and 'aa'.
        output (Path): Path to the output directory.
        threads (int): Number of threads to use.
        evalue (float): E-value threshold.
        sensitivity (float): Sensitivity threshold.
        database (Path): Path to the reference database.
        prefix (str): Prefix for output files.
        predictions_dir (Optional[Path]): Path to the directory containing predictions.
            Read only when ``structures`` is False.
        structures (bool): Flag indicating whether structures files are used.
        structure_dir (Optional[Path]): Path to the directory containing structures (.pdb or .cif) files.
        logdir (Path): Path to the directory for log files.
        proteins_flag (bool): Flag indicating whether proteins are used.
        max_seqs (int): Maximum results per query sequence allowed to pass the prefilter for foldseek.
        ultra_sensitive (bool): Whether to skip foldseek prefilter for maximum sensitivity
        extra_foldseek_params (str): Extra foldseek search parameters
        custom_db (str): Custom foldseek database
        foldseek_gpu (bool): Use Foldseek-GPU acceleration and ungappedprefilter
        custom_annotations (Optional[Path]): Path to the tsv containing the custom_db annotations, 2 columns
        has_duplicate_locus (bool): If same locus tag has multiple annots (can happen in some euks)
        fast (bool): If true, skips AFDB search
        gpus (Optional[str]): Forwarded unchanged to run_foldseek_search as ``gpus``.
    Returns:
        The input features after the hits of every searched database have been attached
        with pstc.parse and described with pstc.lookup_sql (plus pstc.lookup_custom when
        ``custom_annotations`` is given).
        NOTE(review): the ``-> bool`` annotation does not match this; it is left untouched
        so the signature stays unchanged - confirm and correct.
    """

    fasta_aa: Path = Path(output) / f"{prefix}_aa.fasta"
    fasta_3di: Path = Path(output) / f"{prefix}_3di.fasta"

    if structures is False:
        # ProstT5 mode: the AA and 3Di FASTAs come from the prediction directory.
        # They are copied into the output directory unless they are already there
        # (which is the case when prediction and comparison share a directory).
        fasta_aa_input: Path = Path(predictions_dir) / f"{prefix}_aa.fasta"
        fasta_3di_input: Path = Path(predictions_dir) / f"{prefix}_3di.fasta"

        if fasta_3di_input.exists():
            logger.info(
                f"Checked that the 3Di CDS file {fasta_3di_input} exists from baktfold predict"
            )
            if fasta_3di.exists() is False:
                shutil.copyfile(fasta_3di_input, fasta_3di)
        else:
            logger.error(
                f"The 3Di CDS file {fasta_3di_input} does not exist. Please run baktfold predict and/or check the prediction directory {predictions_dir}"
            )

        if fasta_aa_input.exists():
            logger.info(
                f"Checked that the AA CDS file {fasta_aa_input} exists from baktfold predict."
            )
            if fasta_aa.exists() is False:
                shutil.copyfile(fasta_aa_input, fasta_aa)
        else:
            logger.error(
                f"The AA CDS file {fasta_aa_input} does not exist. Please run baktfold predict and/or check the prediction directory {predictions_dir}"
            )
    else:
        # Structure mode: there is no prediction directory to copy from, so the
        # AA sequences are written straight from the features.
        logger.info(f"Writing the AAs to file {fasta_aa}.")

        with open(fasta_aa, "w+") as out_f:
            for entry in hypotheticals:
                # the CDS id is used as header when locus tags are not unique
                if has_duplicate_locus:
                    header = f">{entry['id']}\n"
                else:
                    header = f">{entry['locus']}\n"
                seq = f"{entry['aa']}\n"
                out_f.write(header)
                out_f.write(seq)

    ############
    # create foldseek db
    ############

    foldseek_query_db_path: Path = Path(output) / "foldseek_db"
    foldseek_query_db_path.mkdir(parents=True, exist_ok=True)

    if structures is True:
        logger.info("Creating a foldseek query database from structures.")

        generate_foldseek_db_from_structures(
            fasta_aa,
            foldseek_query_db_path,
            structure_dir,
            logdir,
            prefix,
            proteins_flag,
        )
    else:
        generate_foldseek_db_from_aa_3di(
            fasta_aa, fasta_3di, foldseek_query_db_path, logdir, prefix
        )

    ############
    # foldseek searches
    ############

    short_db_name = prefix

    # These paths are identical for every search below.
    query_db: Path = Path(foldseek_query_db_path) / short_db_name
    result_db_base: Path = Path(output) / "result_db"
    temp_db: Path = Path(output) / "temp_db"

    def _run_search(target_db: Path, result_db: Path) -> None:
        # Single place that forwards the shared search settings to Foldseek.
        run_foldseek_search(
            query_db,
            target_db,
            result_db,
            temp_db,
            threads,
            logdir,
            evalue,
            sensitivity,
            max_seqs,
            ultra_sensitive,
            extra_foldseek_params,
            foldseek_gpu,
            structures,
            gpus=gpus,
        )

    def _search_bundled_db(database_name: str, result_db_name: str):
        # Search one database shipped under `database`; returns (target_db, result_db).
        if short_db_name == database_name:
            logger.error(
                f"Please choose a different -p {prefix} as this conflicts with the {database_name}"
            )

        target_db: Path = Path(database) / database_name

        # make result and temp dirs
        result_db_base.mkdir(parents=True, exist_ok=True)
        result_db: Path = Path(result_db_base) / result_db_name
        temp_db.mkdir(parents=True, exist_ok=True)

        _run_search(target_db, result_db)
        return target_db, result_db

    def _export_tophit(target_db: Path, result_db: Path, result_tsv: Path):
        # Convert a result db to tsv and read its tophits (non-CATH layout).
        create_result_tsv(query_db, target_db, result_db, result_tsv, logdir, foldseek_gpu, structures, threads)
        return get_tophit(result_tsv, structures, cath=False)

    #####
    # foldseek search swissprot
    #####

    target_db, result_db = _search_bundled_db("swissprot", "result_db")
    swissprot_df = _export_tophit(
        target_db, result_db, Path(output) / "foldseek_results_swissprot.tsv"
    )

    #####
    # foldseek search AFDB Clusters
    # run by default, skipped when fast is set
    #####

    if not fast:
        target_db, result_db = _search_bundled_db("AFDBClusters", "result_afdb_db")
        afdbclusters_df = _export_tophit(
            target_db, result_db, Path(output) / "foldseek_results_afdb_clusters.tsv"
        )
    else:
        logger.info("Skipping AFDB Clusters search as --fast specified.")

    #####
    # foldseek search pdb
    #####

    target_db, result_db = _search_bundled_db("pdb", "result_pdb_db")
    pdb_df = _export_tophit(
        target_db, result_db, Path(output) / "foldseek_results_pdb.tsv"
    )

    #####
    # foldseek search cath
    #####

    target_db, result_db = _search_bundled_db("cath", "result_cath_db")
    result_db_greedy_best_hits: Path = Path(result_db_base) / "result_cath_db_greedy_best_hits"

    result_tsv: Path = Path(output) / "foldseek_results_cath.tsv"
    result_greedy_tsv: Path = Path(output) / "foldseek_results_cath_greedy_tophit"

    # this keeps the greedy best hits for cath
    # we actually don't keep the single tophit - multidomain/fold proteins should have multiple non-overlapping CATH hits
    # this is equivalent to using --greedy-best-hits with foldseek easy-search
    summarise_hits(result_db, result_db_greedy_best_hits, logdir, threads)

    # saves all CATH hits first
    create_result_tsv(query_db, target_db, result_db, result_tsv, logdir, foldseek_gpu, structures, threads)
    # save greedy CATH tophits
    create_result_tsv(query_db, target_db, result_db_greedy_best_hits, result_greedy_tsv, logdir, foldseek_gpu, structures, threads)

    # this just reads it in with appropriate headers
    cath_df = get_tophit(result_greedy_tsv, structures, cath=True)

    #####
    # write tophits
    #####

    swissprot_tophit_path: Path = Path(output) / "baktfold_swissprot_tophit.tsv"
    io.write_foldseek_tophit(swissprot_df, swissprot_tophit_path)

    if not fast:
        afdb_tophit_path: Path = Path(output) / "baktfold_afdbclusters_tophit.tsv"
        io.write_foldseek_tophit(afdbclusters_df, afdb_tophit_path)

    pdb_tophit_path: Path = Path(output) / "baktfold_pdb_tophit.tsv"
    io.write_foldseek_tophit(pdb_df, pdb_tophit_path)

    cath_tophit_path: Path = Path(output) / "baktfold_cath_tophit.tsv"
    io.write_foldseek_tophit(cath_df, cath_tophit_path)
    # remove result_greedy_tsv (identical to tophit, will make it confusing)
    remove_file(result_greedy_tsv)

    #####
    # custom db
    #####

    # Tracks whether custom tophits were obtained. Without it a failed custom
    # search would leave `custom_df` unbound and the lookup section below
    # would die with a NameError instead of the message logged here.
    custom_hits_available = False
    custom_df = None

    if custom_db:

        try:

            logger.info(f"Foldseek will also be run against your custom database {custom_db}")
            result_db_custom: Path = Path(result_db_base) / "result_db_custom"
            result_tsv_custom: Path = Path(output) / "foldseek_results_custom.tsv"

            _run_search(Path(custom_db), result_db_custom)

            custom_df = _export_tophit(Path(custom_db), result_db_custom, result_tsv_custom)
            custom_hits_available = True

            custom_db_tophit_path: Path = Path(output) / "baktfold_custom_db_tophit.tsv"
            io.write_foldseek_tophit(custom_df, custom_db_tophit_path)

        except Exception:
            # `Exception` rather than a bare except so that KeyboardInterrupt and
            # SystemExit are not swallowed by this best-effort step.
            logger.error(f"Foldseek failed to run against your custom database {custom_db}. Please check that it is formatted correctly as a Foldseek database")

    ####
    # convert pipe back - proteins
    # in theory, from JSON, the pipe shouldn't happen as we only take the ID.
    #
    # "~PIPE~" is the placeholder standing in for "|" in the query headers
    # (see the warning text below). Presumably this is needed because
    # Foldseek/MMseqs2 treats "|"-delimited database prefixes such as "sp|",
    # "tr|" or "gi|" specially when parsing identifiers - TODO confirm against
    # https://github.com/steineggerlab/foldseek/blob/8dc75c74ad0eddab73cfd905963d13bf74dc012b/lib/mmseqs/src/commons/Util.cpp#L142
    ####

    if proteins_flag:

        pipe_warning_logged = False

        new_hypotheticals = []
        for record in hypotheticals:
            rid = record["id"]
            if not pipe_warning_logged and "~PIPE~" in rid:
                pipe_warning_logged = True
                logger.warning("At least one input protein has | in the header. Note these will be replaced with ~PIPE~ in the raw foldseek output if you chose --keep-tmp-files")
            record["id"] = record["id"].replace("~PIPE~", "|")
            record["locus"] = record["locus"].replace("~PIPE~", "|")
            new_hypotheticals.append(record)

        hypotheticals = new_hypotheticals
        del new_hypotheticals

        # overwrite the output aa and 3dis with | if they were in the input
        if pipe_warning_logged:
            replace_pipe_in_fasta(fasta_aa)
            if not structures:
                replace_pipe_in_fasta(fasta_3di)

    ####
    # lookup
    ####

    if proteins_flag:  # baktfold proteins

        # note aas passed as hypotheticals to the overall function - so in and out as aas

        aas = pstc.parse(hypotheticals, swissprot_df, 'swissprot', has_duplicate_locus=False)
        if not fast:
            aas = pstc.parse(aas, afdbclusters_df, 'afdb', has_duplicate_locus=False)
        aas = pstc.parse(aas, pdb_df, 'pdb', has_duplicate_locus=False)
        aas = pstc.parse(aas, cath_df, 'cath', has_duplicate_locus=False)
        if custom_db and custom_hits_available:
            aas = pstc.parse(aas, custom_df, 'custom_db', has_duplicate_locus=False)

        # get the lookup descriptions for each of them
        # this requires the DB
        aas = pstc.lookup_sql(aas, Path(database), threads)
        # add the custom annotations if it is provided
        if custom_annotations:
            aas = pstc.lookup_custom(aas, Path(database), custom_annotations)

        return aas

    else:  # baktfold run

        # add the Swissprot and AFDB and PDB tophits to the json
        hypotheticals = pstc.parse(hypotheticals, swissprot_df, 'swissprot', has_duplicate_locus)
        if not fast:
            hypotheticals = pstc.parse(hypotheticals, afdbclusters_df, 'afdb', has_duplicate_locus)
        hypotheticals = pstc.parse(hypotheticals, pdb_df, 'pdb', has_duplicate_locus)
        hypotheticals = pstc.parse(hypotheticals, cath_df, 'cath', has_duplicate_locus)
        if custom_db and custom_hits_available:
            hypotheticals = pstc.parse(hypotheticals, custom_df, 'custom_db', has_duplicate_locus)

        # get the lookup descriptions for each of them
        hypotheticals = pstc.lookup_sql(hypotheticals, Path(database), threads)
        if custom_annotations:
            hypotheticals = pstc.lookup_custom(hypotheticals, Path(database), custom_annotations)

        return hypotheticals

move_product_to_note_if_exists(qualifiers)

If a 'product' qualifier exists, append it to 'note' and remove 'product'.

Designed for the eukaryotic entries

Parameters

dict

Feature qualifiers dictionary (values are usually lists).

Returns

None Modifies qualifiers in place.

Source code in src/baktfold/io/insdc.py
def move_product_to_note_if_exists(qualifiers):
    """
    If a 'product' qualifier exists, append it to 'note' and remove 'product'.

    Designed for the eukaryotic entries

    Parameters
    ----------
    qualifiers : dict
        Feature qualifiers dictionary (values are usually lists).
        A missing or ``None`` 'note' is treated as empty; a 'note' holding a
        single value (e.g. a plain string) or a tuple is converted to a list
        first so that the product can be appended to it.

    Returns
    -------
    None
        Modifies qualifiers in place. Nothing is changed when 'product' is
        missing or empty.
    """
    product = qualifiers.get("product")
    if not product:
        return

    # Ensure note exists and really is a list. An existing list is kept as the
    # same object so that other references to it observe the appended product.
    note = qualifiers.get("note")
    if note is None:
        note = []
    elif isinstance(note, tuple):
        note = list(note)
    elif not isinstance(note, list):
        note = [note]
    qualifiers["note"] = note

    if isinstance(product, list):
        note.extend(product)
    else:
        note.append(product)

    qualifiers.pop("product", None)

revise_dbxref_insdc(dbxrefs)

Remove INSDC non-compliant DbXrefs.

Source code in src/baktfold/io/insdc.py
def revise_dbxref_insdc(dbxrefs: Sequence[str]) -> Tuple[Sequence[str], Sequence[str]]:
    """Split DbXrefs into INSDC compliant and non-compliant ones.

    A DbXref is kept as compliant when the text before its first ':' is one of
    the UniProtKB, GO, Pfam or Rfam database names defined in ``bc``.

    Returns a ``(compliant, non_compliant)`` pair of lists; both preserve the
    input order.
    """
    accepted_databases = [bc.DB_XREF_UNIPROTKB, bc.DB_XREF_GO, bc.DB_XREF_PFAM, bc.DB_XREF_RFAM]
    compliant = []
    non_compliant = []
    for entry in dbxrefs:
        database_name = entry.split(':', 1)[0]
        bucket = compliant if database_name in accepted_databases else non_compliant
        bucket.append(entry)
    return compliant, non_compliant

revise_product_insdc(product)

Revise product name for INSDC compliant submissions

Source code in src/baktfold/io/insdc.py
def revise_product_insdc(product: str):
    """Return *product* revised for INSDC compliant submissions.

    Three fixes are applied in order, each one logged when it changes the name:
    'uncharacterized'/'uncharacterised' (any case) becomes 'putative', then all
    round parentheses are stripped if their open/close counts differ, then all
    square brackets are stripped if their open/close counts differ.
    """
    # replace putative synonyms
    revised, n_synonyms = re.subn(r'(uncharacteri[sz]ed)', 'putative', product, flags=re.IGNORECASE)
    if n_synonyms:
        logger.info('fix product: replace putative synonyms. new=%s, old=%s', revised, product)

    # remove unbalanced parentheses / brackets
    # ToDo: find and replace only legend parentheses / brackets
    delimiter_fixes = (
        ('(', ')', 'fix product: remove unbalanced parantheses. new=%s, old=%s'),
        ('[', ']', 'fix product: remove unbalanced brackets. new=%s, old=%s'),
    )
    for opening, closing, log_message in delimiter_fixes:
        if revised.count(opening) == revised.count(closing):
            continue
        before = revised
        revised = before.replace(opening, '').replace(closing, '')
        logger.info(log_message, revised, before)

    return revised

parse_json_input(input_path, faa_path, all_proteins, protein_json_flag)

Parses genome annotations from input JSON file.

Parameters:

Name Type Description Default
input_path str

Path to input JSON file.

required
faa_path str

Path to output file for hypothetical proteins.

required
all_proteins bool

Whether to keep all proteins or only hypothetical ones.

required
protein_json_flag bool

Whether input is protein JSON

required

Returns:

Name Type Description
tuple

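For genome JSON input, a tuple of (data, features, has_duplicate_locus, translation_table, prokka, other_genbank, version). For protein JSON input (protein_json_flag is True), a tuple of (features, hypotheticals, version).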

Examples:

>>> parse_json_input('input.json', 'hypotheticals.faa', False, False)
(data, features, has_duplicate_locus, translation_table, prokka, other_genbank, version)
Source code in src/baktfold/io/json_in.py
def parse_json_input(input_path, faa_path, all_proteins, protein_json_flag):
    """
    Parses genome annotations from input JSON file.

    Args:
      input_path (str): Path to input JSON file.
      faa_path (Path): Path to output FASTA file for the selected proteins. It is opened
        with ``faa_path.open``, so it must be a path object rather than a plain string.
        Not written when ``protein_json_flag`` is True.
      all_proteins (bool): Whether to keep all proteins or only hypothetical ones.
      protein_json_flag (bool): Whether input is protein JSON

    Returns:
      tuple: When ``protein_json_flag`` is True, ``(features, hypotheticals, version)``.
        Otherwise ``(data, features, has_duplicate_locus, translation_table, prokka,
        other_genbank, version)``, where ``translation_table`` is an int, a str (when the
        JSON value is not an integer) or None (when it could not be found).

    Examples:
      >>> parse_json_input('input.json', Path('hypotheticals.faa'), False, False)
      (data, features, has_duplicate_locus, translation_table, prokka, other_genbank, version)
    """

    ############################################################################
    # Checks and configurations
    # - check that the annotation file is given, readable and non-empty
    ############################################################################

    # Bound before the try block so that the error message can always be
    # formatted, including when the empty-path check fails before the path
    # has been resolved.
    annotation_path = input_path
    try:
        if input_path == '':
            raise ValueError('File path argument must be non-empty')
        annotation_path = Path(input_path).resolve()
        cfg.check_readability('annotation', annotation_path)
        cfg.check_content_size('annotation', annotation_path)
    except Exception:
        logger.error(f'ERROR: annotation file {annotation_path} not valid!')

    logger.info(f'Parsing annotations from input: {annotation_path}')
    with xopen(str(annotation_path), threads=0) as fh:
        data = json.load(fh)

    features = data['features']

    # keep all CDS, or only those carrying a 'hypothetical' key
    if all_proteins:
        hypotheticals = [feat for feat in features if feat['type'] == bc.FEATURE_CDS]
    else:
        hypotheticals = [feat for feat in features if feat['type'] == bc.FEATURE_CDS and 'hypothetical' in feat]

    if protein_json_flag:  # this will also be only hypotheticals if protein mode (or else why not just run with the FASTA)
        version = data.get("version", {})
        return features, hypotheticals, version

    # check if dupe locus tags (euks can have multiple CDS same locus tag e.g. Cladocopium goreaui CAMXCT020000001.1)
    seen_loci = set()
    has_duplicate_locus = False

    for feat in hypotheticals:
        locus = feat['locus']
        if locus in seen_loci:
            has_duplicate_locus = True
            logger.warning("Multiple CDS per locus tag were detected in your input JSON.")
            logger.warning("CDS id (which is unique) rather than locus tag will be used for ProstT5+Foldseek searches.")
            break
        seen_loci.add(locus)

    # Write the selected proteins to file. The locus tag is the header in almost
    # all cases; the id (guaranteed to exist and be unique) replaces it when
    # locus tags are duplicated.
    header_key = 'id' if has_duplicate_locus else 'locus'
    with faa_path.open('wt') as fh:
        for feat in hypotheticals:
            fh.write(f">{feat[header_key]}\n{feat['aa']}\n")

    # none of this is relevant for proteins
    # Default to None so the value is defined on every path below, including
    # the one where the genome block has no 'translation_table' entry.
    translation_table = None
    try:
        genome_block = data.get("genome")

        if genome_block is None:
            logger.error("No 'genome' block found in input JSON. Please check.")
        elif "translation_table" not in genome_block:
            logger.error("No translation table found in input JSON. Please check your input.")
        else:
            raw_value = genome_block["translation_table"]

            try:
                translation_table = int(raw_value)
                logger.info(
                    f"Translation table {translation_table} detected from input JSON"
                )

            except (ValueError, TypeError):
                translation_table = str(raw_value)
                logger.warning(
                    f"Translation table '{raw_value}' is not an integer. "
                    f"Parsing it as a string."
                )

    except Exception as e:
        logger.exception(
            f"Unexpected error while parsing translation table: {e}"
        )
        translation_table = None

    # input detection: based on which tool names appear in the 'version' block

    version = data.get("version", {})

    prokka = "prokka" in version
    if prokka:
        logger.info("Prokka input detected")
    other_genbank = "prokka" not in version and "bakta" not in version

    logger.info('Parsing complete')

    return data, features, has_duplicate_locus, translation_table, prokka, other_genbank, version

Module for manipulating GenBank files. Some of the code is taken from Phynteny (https://github.com/susiegriggo/Phynteny).

get_genbank(genbank)

Convert a GenBank file to a dictionary.

This function reads a GenBank file and converts it into a dictionary.

Parameters:

Name Type Description Default
genbank Path

Path to the GenBank file.

required

Returns:

Name Type Description
dict dict

A dictionary representation of the GenBank file.

Raises:

Type Description
ValueError

If the provided file is not a GenBank file.

Source code in src/baktfold/io/handle_genbank.py
def get_genbank(genbank: Path) -> tuple:
    """
    Convert a GenBank file to a dictionary and detect its annotation style.

    This function reads a (optionally gzip-compressed) GenBank file and converts it
    into a dictionary of records keyed by record id. The style is detected from the
    first CDS feature of the first record.

    Args:
        genbank (Path): Path to the GenBank file. ``.strip()`` is called on it, so a
            string path is what the code actually expects.

    Returns:
        tuple: ``(gb_dict, method)``. ``gb_dict`` maps record ids to records and
        ``method`` is one of "Bakta", "NCBI" or "Pharokka".
        ``({}, None)`` is returned when the file cannot be opened or parsed, contains
        no records, or its style cannot be detected. ``(gb_dict, None)`` is returned
        when the records contain no CDS feature.
    """

    logger.info(f"Checking if input {genbank} is a Genbank format file")
    logger.info("If so, also detecting the likely input style out of Pharokka, Bakta and NCBI Refseq style.")

    def parse_records(handle):
        """
        Parse the open GenBank handle and detect the annotation style.

        Args:
          handle: Text handle of the GenBank file.

        Returns:
          tuple: ``(gb_dict, method)`` as described for ``get_genbank``.
        """
        try:
            records = list(SeqIO.parse(handle, "gb"))
            if not records:
                return {}, None
            gb_dict = {record.id: record for record in records}
            first_record = records[0]

            comment = first_record.annotations.get("comment", "")
            cds_feature = next((f for f in first_record.features if f.type == "CDS"), None)

            if cds_feature is None:
                logger.error(f"{genbank} appears to be a Genbank formatted file but no CDS was found. Please check your input.")
                return gb_dict, None

            qualifiers = cds_feature.qualifiers

            # Check if 'Bakta' appears in the Comment - will appear there
            if "Bakta" in comment and "locus_tag" in qualifiers:
                logger.info("Detected Bakta style input Genbank. Using locus_tag qualifier from Bakta as the CDS IDs for Phold.")
                method = "Bakta"
            elif "phrog" not in qualifiers and "protein_id" in qualifiers:
                logger.info("Detected NCBI Refseq style input Genbank. Using protein_id qualifier as the CDS IDs for Phold.")
                method = "NCBI"
            elif "phrog" in qualifiers and "ID" in qualifiers:
                logger.info("Detected Pharokka style input Genbank. Using ID qualifier from Pharokka as the CDS IDs for Phold.")
                method = "Pharokka"
            else:
                logger.error(
                    f"Feature {cds_feature} could not be parsed. Therefore, the input style format for {genbank} could not be detected. Please check your input."
                )
                # Explicit empty result. Previously `method` was simply unbound
                # here and this same value was only produced because the
                # resulting error was caught by the handler below.
                return {}, None
            return identify_long_ids(gb_dict), method
        except Exception:
            logger.warning(f"{genbank} is not a genbank file")
            return {}, None

    try:
        genbank_path = genbank.strip()
        # choose the opener once; both are used in text mode
        opener = gzip.open if is_gzip_file(genbank_path) else open
        with opener(genbank_path, "rt") as handle:
            return parse_records(handle)
    except Exception:
        logger.warning(f"{genbank} is not a genbank file")
        return {}, None

get_proteins(fasta)

Convert an Amino Acid FASTA file to a dictionary.

This function reads a AA FASTA file and converts it into a dictionary.

Parameters:

Name Type Description Default
fasta Path

Path to the FASTA file.

required

Returns:

Name Type Description
dict dict

A dictionary representation of the FASTA file.

Raises:

Type Description
ValueError

If the provided file is not a FASTA file.

Source code in src/baktfold/io/handle_genbank.py
def get_proteins(fasta: Path) -> dict:
    """
    Convert an Amino Acid FASTA file to a dictionary.

    This function reads a (optionally gzip-compressed) AA FASTA file and converts it
    into a dictionary mapping each header (the full line after ">") to its sequence,
    with the sequence lines joined together.

    Args:
        fasta (Path): Path to the FASTA file. ``.strip()`` is called on it, so a string
            path is what the code actually expects.

    Returns:
        dict: A dictionary representation of the FASTA file. Text before the first
        header and records with an empty header are not included.

    Raises:
        ValueError: Any ValueError raised while opening or reading the file is logged
            and re-raised (UnicodeDecodeError is a ValueError subclass).
    """

    fasta_path = fasta.strip()
    gzipped = is_gzip_file(fasta_path)

    fasta_dict = {}
    try:
        # Undecodable bytes are ignored for plain-text input only; gzip input
        # is decoded strictly.
        if gzipped:
            opened = gzip.open(fasta_path, "rt")
        else:
            opened = open(fasta_path, "rt", errors="ignore")

        with opened as handle:
            sequence_id = ""
            # collect the lines of one record and join once (no repeated str +=)
            sequence_parts = []
            for line in handle:
                line = line.strip()
                if line.startswith(">"):
                    if sequence_id:
                        fasta_dict[sequence_id] = "".join(sequence_parts)
                    sequence_id = line[1:]
                    sequence_parts = []
                else:
                    sequence_parts.append(line)
            # flush the last record
            if sequence_id:
                fasta_dict[sequence_id] = "".join(sequence_parts)
    except ValueError:
        logger.error(f"{fasta.strip()} is not a FASTA file!")
        raise

    return fasta_dict

identify_long_ids(gb_dict)

Checks the ID of every feature in gb_dict. If an ID is 54 characters or longer, any spaces are removed from it: such long IDs wrap onto a second line in Pharokka GBK files, and Biopython then reads the line break as a space.

Parameters:

Name Type Description Default
dict

A dictionary representation of the GenBank file.

required

Returns:

Name Type Description
dict dict

A dictionary representation of the GenBank file.

Source code in src/baktfold/io/handle_genbank.py
def identify_long_ids(gb_dict: dict) -> dict:
    """
    Remove spaces from long feature IDs in a parsed GenBank dictionary.

    For every feature of every record the first value of the "ID" qualifier is read.
    If it is 54 characters or longer, a warning is logged and all spaces are removed
    from it in place. Very long CDS IDs written by Pharokka wrap onto a second line in
    the GBK file and are read back with a space inserted at the line break.

    Features without a usable "ID" qualifier (e.g. GenBank/NCBI formatted records)
    are skipped.

    Args:
        gb_dict (dict): Maps record IDs to records exposing a ``features`` iterable.

    Returns:
        dict: The same ``gb_dict`` object; long IDs are cleaned in place.
    """

    for record in gb_dict.values():
        for cds_feature in record.features:
            try:
                # if pharokka > 54 char IDs/locus tags, phold/biopython will parse with a space
                # because the ID runs over onto a second line
                # (noticed on the Mgnify contigs annotated with Pharokka)
                cds_id = cds_feature.qualifiers["ID"][0]
                if len(cds_id) >= 54:
                    logger.warning(
                        f"The CDS ID is {cds_id} is longer than 54 characters. It is recommended that you use short contig headers (which will therefore lead to shorter CDS ids)."
                    )
                    cds_feature.qualifiers["ID"][0] = cds_id.replace(" ", "")
            except (AttributeError, KeyError, IndexError, TypeError):
                # No "ID" qualifier (or not in the expected list-of-strings shape):
                # the record is GenBank/NCBI formatted and needs no clean-up.
                # Only these lookup/shape errors are tolerated so that interrupts and
                # genuine programming errors are no longer silently swallowed.
                continue

    return gb_dict

is_gzip_file(f)

Method copied from Phispy see https://github.com/linsalrob/PhiSpy/blob/master/PhiSpyModules/helper_functions.py

This is an elegant solution to test whether a file is gzipped by reading the first two characters. I also use a version of this in fastq_pair if you want a C version :) See https://stackoverflow.com/questions/3703276/how-to-tell-if-a-file-is-gzip-compressed for inspiration

Parameters:

Name Type Description Default
f Path

The file to test.

required

Returns:

Name Type Description
bool bool

True if the file is gzip compressed, otherwise False.

Source code in src/baktfold/io/handle_genbank.py
def is_gzip_file(f: Path) -> bool:
    """
    Report whether a file starts with the gzip magic number.

    Method copied from Phispy, see
    https://github.com/linsalrob/PhiSpy/blob/master/PhiSpyModules/helper_functions.py
    (inspired by https://stackoverflow.com/questions/3703276/how-to-tell-if-a-file-is-gzip-compressed).

    Only the first two bytes of the file are read and compared against ``1f8b``.

    Args:
        f (Path): The file to test.

    Returns:
        bool: True if the file is gzip compressed, otherwise False.
    """
    with open(f, "rb") as stream:
        leading_bytes = stream.read(2)
    return binascii.hexlify(leading_bytes) == b"1f8b"

open_protein_fasta_file(input_file)

Open a fasta file, whether it is gzipped or plain text.

input_file (str): The path to the fasta file, either gzipped or plain.

Union[IO[str], gzip.GzipFile]: A file handle to the opened fasta file.

Source code in src/baktfold/io/handle_genbank.py
def open_protein_fasta_file(input_file: str) -> Union[IO[str], gzip.GzipFile]:
    """
    Open a fasta file for text reading, whether it is gzipped or plain text.

    Compression is detected from the file's first two bytes (the gzip magic number
    ``1f 8b``) rather than from the file name, so gzipped files without a ``.gz``
    suffix and plain files that happen to be named ``*.gz`` are both opened correctly.

    Parameters:
    input_file (str): The path to the fasta file, either gzipped or plain.

    Returns:
    Union[IO[str], gzip.GzipFile]: A text-mode file handle to the opened fasta file.
        The caller is responsible for closing it.
    """
    input_file = Path(input_file)

    # Sniff the content instead of trusting the extension.
    with open(input_file, "rb") as probe:
        gzipped = probe.read(2) == b"\x1f\x8b"

    if gzipped:
        return gzip.open(input_file, "rt")
    return open(input_file, "r")

add_optional_qualifiers(entry, qualifiers, single_valued=None, multi_valued=None)

Add optional INSDC qualifiers to a feature entry dict in Bakta style.

Parameters

dict

The feature dictionary being built.

dict

The qualifiers dictionary from Bio.SeqFeature.

set or list

Qualifiers expected to be single-valued (take the first if multiple).

set or list

Qualifiers that can have multiple values (keep as list if >1, else single value).

Source code in src/baktfold/io/eukaryotic_to_json.py
def add_optional_qualifiers(entry, qualifiers, single_valued=None, multi_valued=None):
    """
    Copy optional INSDC qualifiers into a Bakta-style feature entry (in place).

    Parameters
    ----------
    entry : dict
        The feature dictionary being built; it is modified in place.
    qualifiers : dict
        The qualifiers dictionary from Bio.SeqFeature (qualifier name -> list of values).
    single_valued : set or list
        Qualifiers expected to be single-valued; only the first value is kept.
        A "locus_tag" qualifier is stored under the key "locus".
    multi_valued : set or list
        Qualifiers that can have multiple values; stored as the list when there is
        more than one value, otherwise as the single value.

    Qualifiers that are absent or have no values are not added to ``entry``.
    """

    # Multi-valued qualifiers are handled first, then single-valued ones
    for name in (multi_valued or set()):
        values = qualifiers.get(name)
        if not values:
            continue
        entry[name] = values[0] if len(values) <= 1 else values

    for name in (single_valued or set()):
        values = qualifiers.get(name)
        if not values:
            continue
        # Bakta stores the locus tag under "locus"
        target_key = "locus" if name == "locus_tag" else name
        entry[target_key] = values[0]

build_bakta_sequence_entry(rec)

Convert a SeqRecord into a Bakta-style sequence entry. Missing fields are filled with None.

Source code in src/baktfold/io/eukaryotic_to_json.py
def build_bakta_sequence_entry(rec):
    """
    Convert a SeqRecord into a Bakta-style sequence entry.

    The record's "source" feature (if any) supplies organism, mol_type, strain,
    db_xref, note, plasmid, chromosome and completeness qualifiers. Topology,
    sequence type, completeness and genetic code are inferred from the record
    annotations and those qualifiers:

    * topology: ``rec.annotations["topology"]`` if "linear"/"circular", else "linear"
    * type: "plasmid", "chromosome" or "contig"
    * complete: True for circular records, a "complete" completeness qualifier, or a
      source note containing "complete genome"
    * gcode: ``genetic_code``/``gcode`` annotation, then the source ``transl_table``
      qualifier, else 1

    Parameters:
        rec: Bio.SeqRecord
            Record providing ``seq``, ``id``, ``features`` and ``annotations``.

    Returns:
        dict: Bakta-style sequence entry. The organism/mol_type/strain/db_xref/note
        keys are only present when the source feature provides them.
    """

    def first_value(qualifier_dict, key):
        # Qualifier values are lists; a missing key or an empty list both mean "absent".
        values = qualifier_dict.get(key)
        return values[0] if values else None

    seq = str(rec.seq)

    # -----------------------------------------
    # Extract source feature qualifiers - genbank always has source field
    # -----------------------------------------
    source_feat = next((f for f in rec.features if f.type == "source"), None)
    q = source_feat.qualifiers if source_feat else {}

    mol_type = first_value(q, "mol_type")
    organism = first_value(q, "organism")
    strain = first_value(q, "strain")
    note = first_value(q, "note")

    plasmid = first_value(q, "plasmid")
    chromosome = first_value(q, "chromosome")
    completeness_hint = first_value(q, "completeness")

    # db_xref stays a list when several cross-references are present
    db_xref = None
    if "db_xref" in q:
        val = q["db_xref"]
        db_xref = val[0] if len(val) == 1 else val

    # -----------------------------------------
    # Infer topology
    # -----------------------------------------
    topology = rec.annotations.get("topology")
    if topology not in {"linear", "circular"}:
        topology = "linear"

    # -----------------------------------------
    # Infer type
    # -----------------------------------------
    if plasmid is not None or "plasmid" in rec.annotations:
        seq_type = "plasmid"
    elif chromosome is not None or "chromosome" in rec.annotations:
        seq_type = "chromosome"
    else:
        seq_type = "contig"

    # -----------------------------------------
    # Infer completeness (conservative)
    # -----------------------------------------
    complete = (
        topology == "circular"
        or (completeness_hint is not None and completeness_hint.lower() == "complete")
        or bool(note and "complete genome" in note.lower())
    )

    # -----------------------------------------
    # Infer genetic code for description
    # -----------------------------------------
    gcode = None

    if "genetic_code" in rec.annotations:
        gcode = rec.annotations["genetic_code"]
    elif "gcode" in rec.annotations:
        gcode = rec.annotations["gcode"]
    elif "transl_table" in q:
        gcode = first_value(q, "transl_table")

    # Conservative fallback to 1 for euks
    if gcode is None:
        gcode = 1

    description = " ".join([f"[gcode={gcode}]", f"[topology={topology}]"])

    # -----------------------------------------
    # Build entry
    # -----------------------------------------
    # this is from bakta
    # "id": "contig_1",
    # "description": "[gcode=11] [topology=linear]",
    # "nt": "AT"
    # "length": 5165988,
    # "complete": false,
    # "type": "contig",
    # "topology": "linear",
    # "simple_id": "contig_1",
    # "orig_id": "GCF_002368115_000000000001",
    # "orig_description": ""
    entry = {
        "id": rec.id,
        "description": description,
        "nt": seq,
        "length": len(seq),
        "complete": complete,
        "type": seq_type,
        "topology": topology,
        "simple_id": rec.id,
        "orig_id": rec.id,
        "orig_description": None,
    }

    # -----------------------------------------
    # Add source qualifiers only if they exist
    # -----------------------------------------
    optional_fields = (
        ("organism", organism),
        ("mol_type", mol_type),
        ("strain", strain),
        ("db_xref", db_xref),
        ("note", note),
    )
    for key, value in optional_fields:
        if value is not None:
            entry[key] = value

    return entry

calc_genome_stats(records)

Compute correct genome stats (size, GC, N-ratio, N50, N90) for records from a multi-contig GenBank file.

Source code in src/baktfold/io/eukaryotic_to_json.py
def calc_genome_stats(records):
    """
    Compute genome statistics (size, GC, N-ratio, N50, N90) across all records of a
    multi-contig GenBank file.

    Parameters:
        records: sequence of records exposing a ``seq`` attribute.

    Returns:
        dict: keys "size", "gc", "n_ratio", "n50", "n90" and "coding_ratio"
        ("coding_ratio" is always None here).

    Raises:
        ValueError: if ``records`` is empty.
    """

    if not records:
        raise ValueError("No GenBank records found.")

    # per-contig lengths and overall genome size
    lengths = [len(rec.seq) for rec in records]
    genome_size = sum(lengths)

    # GC and N content are computed over the concatenation of all contigs
    concatenated = "".join(str(rec.seq) for rec in records)

    # GC as fraction (Bakta wants 0–1)
    gc_value = gc_fraction(concatenated)

    # fraction of the genome made of "N"
    n_fraction = concatenated.count("N") / genome_size

    # ---------- N50 / N90 ----------
    lengths_desc = sorted(lengths, reverse=True)

    def length_at_fraction(fraction):
        """
        Return the contig length at which the cumulative length (largest contigs
        first) reaches ``fraction`` of the genome: 0.5 gives N50, 0.9 gives N90.
        """
        target = genome_size * fraction
        covered = 0
        for contig_len in lengths_desc:
            covered += contig_len
            if covered >= target:
                return contig_len
        return lengths_desc[-1]  # fallback (should not happen)

    stats = {"size": genome_size, "gc": gc_value, "n_ratio": n_fraction}
    stats["n50"] = length_at_fraction(0.5)
    stats["n90"] = length_at_fraction(0.9)
    stats["coding_ratio"] = None
    return stats

convert_assembly_gap_feature(feature, rec, id)

Convert a GenBank assembly_gap feature to a simplified Bakta-style 'gap' feature.

Parameters:

Name Type Description Default
feature

Bio.SeqFeature The assembly_gap feature from the GBK.

required
rec

Bio.SeqRecord The full GenBank record containing the sequence.

required

Returns:

Name Type Description
dict

Simplified Bakta-style gap feature.

Source code in src/baktfold/io/eukaryotic_to_json.py
def convert_assembly_gap_feature(feature, rec, id):
    """
    Convert a GenBank assembly_gap feature to a simplified Bakta-style 'gap' feature.

    Parameters:
        feature: Bio.SeqFeature
            The assembly_gap feature from the GBK.
        rec: Bio.SeqRecord
            The full GenBank record containing the sequence; only ``rec.id`` is used.
        id: str
            Unique feature ID stored under "id".

    Returns:
        dict: Simplified Bakta-style gap feature. "length" is the integer
        /estimated_length qualifier when one is given, otherwise the span of the
        feature coordinates.
    """

    # Coordinates (1-based)
    strand = "."  # bakta uses "." for strand on gaps
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop = int(feature.location.end)          # already inclusive after conversion

    # /estimated_length is either an integer or the literal "unknown" in INSDC
    # records, so a non-numeric (or missing/empty) value falls back to the
    # coordinate span instead of raising.
    est_values = feature.qualifiers.get("estimated_length")
    est_len = est_values[0] if est_values else None
    try:
        length = int(est_len)
    except (TypeError, ValueError):
        length = stop - start + 1  # fallback from coordinates

    # no need to add estimated length separately - it is covered by length in the json
    gap_entry = {
        "type": "gap",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "length": length,
        "id": id,
    }

    return gap_entry

convert_cds_feature(feature, seq_record, translation_table, id)

Convert a Prokka CDS Biopython SeqFeature to a Bakta CDS JSON entry.

Source code in src/baktfold/io/eukaryotic_to_json.py
def convert_cds_feature(feature, seq_record, translation_table, id):
    """
    Convert a Prokka CDS Biopython SeqFeature to a Bakta CDS JSON entry.

    Parameters:
        feature: Bio.SeqFeature
            The CDS feature. Its location and qualifiers are read; the qualifier
            lists are not modified.
        seq_record: Bio.SeqRecord
            The record holding the feature; supplies the sequence ID and the
            nucleotides the feature is extracted from.
        translation_table:
            Genetic code table passed to ``translate`` when the feature carries no
            /translation qualifier.
        id: str
            Unique feature ID stored under "id".

    Returns:
        dict: Bakta-style CDS feature. "starts"/"stops" are lists of per-part
        coordinates for joined locations and None otherwise. "hypothetical" is only
        present (True) when there is no product or it contains "hypothetical protein".
    """

    qualifiers = feature.qualifiers

    # ----------- Location info -----------
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop = int(feature.location.end)          # already inclusive after conversion

    # Handle CompoundLocation (join): record every part's 1-based coordinates
    starts = None
    stops = None
    if feature.location.__class__.__name__ == "CompoundLocation":
        starts = [int(part.start) + 1 for part in feature.location.parts]
        stops = [int(part.end) for part in feature.location.parts]

    # frame: Bakta uses 1/2/3; Prokka codon_start is ["1","2","3"]
    frame = int(qualifiers.get("codon_start", ["1"])[0])

    # ----------- Basic qualifiers -----------
    gene = qualifiers.get("gene", [None])[0]
    product = qualifiers.get("product", [None])[0]

    # fall back to a generated start/stop based tag if there is no locus tag
    if 'locus_tag' in qualifiers and qualifiers['locus_tag']:
        locus_tag = qualifiers['locus_tag'][0]
    else:
        logger.warning(f"No locus_tag found for feature {id}")
        locus_tag = f"{GENOME_RANDOM_BACKUP_LOCUSTAG_STR}_{start}_{stop}"
        logger.warning(f"Generating a locus_tag: {locus_tag}")

    note = qualifiers.get("note", [None])[0]
    protein_id = qualifiers.get("protein_id", [None])[0]

    # ----------- Extract nucleotides -----------
    nt_seq = feature.extract(seq_record.seq)
    nt = str(nt_seq)

    # ----------- Extract amino acids -----------
    aa = qualifiers.get("translation", [""])[0]

    # Compute translation if Prokka didn't provide it; an untranslatable CDS
    # deliberately yields an empty protein rather than aborting the conversion
    if not aa:
        try:
            aa = str(nt_seq.translate(table=translation_table, cds=True))
        except Exception:
            aa = ""

    # ----------- aa MD5 hexdigest -----------
    aa_hexdigest = hashlib.md5(aa.encode()).hexdigest()

    # ----------- Hypothetical? -----------
    hypothetical = product is None or "hypothetical protein" in product.lower()

    # ----------- Compute protein stats -----------
    seq_stats = None
    if aa:
        try:
            analysed = ProteinAnalysis(aa)
            seq_stats = {
                "molecular_weight": analysed.molecular_weight(),
                "isoelectric_point": analysed.isoelectric_point()
            }
        except Exception:
            seq_stats = None

    # Copy the existing db_xref values so that adding the SO term does not
    # mutate the qualifier list of the input feature
    db_xref = list(qualifiers.get("db_xref", []))
    if so.SO_CDS.id not in db_xref:
        db_xref.append(so.SO_CDS.id)

    # ----------- Make Bakta-format dict -----------
    bakta_cds = {
        "type": "cds",
        "sequence": seq_record.id,
        "start": start,
        "stop": stop,
        "starts": starts,
        "stops": stops,
        "strand": strand,
        "frame": frame,
        "gene": gene,
        "product": product,
        "db_xrefs": db_xref,
        "nt": nt,
        "aa": aa,
        "aa_hexdigest": aa_hexdigest,
        "start_type": None,
        "rbs_motif": None,
        "genes": [],
        "note": note,
        "seq_stats": seq_stats,
        "id": id,
        "locus": locus_tag,
        "protein_id": protein_id
    }

    # INSDC feature key "CDS": coding sequence; sequence of nucleotides that
    # corresponds with the sequence of amino acids in a protein (location includes
    # stop codon); feature includes amino acid conceptual translation.
    #
    # Optional qualifiers   /allele="text"
    #                       /artificial_location="[artificial_location_value]"
    #                       /circular_RNA
    #                       /codon_start=<1 or 2 or 3>
    #                       /db_xref="<database>:<identifier>"
    #                       /EC_number="text"
    #                       /exception="[exception_value]"
    #                       /experiment="[CATEGORY:]text"
    #                       /function="text"
    #                       /gene="text"
    #                       /gene_synonym="text"
    #                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
    #                       /locus_tag="text" (single token)
    #                       /map="text"
    #                       /note="text"
    #                       /number=unquoted text (single token)
    #                       /old_locus_tag="text" (single token)
    #                       /operon="text"
    #                       /product="text"
    #                       /protein_id="<identifier>"
    #                       /pseudo
    #                       /pseudogene="TYPE"
    #                       /ribosomal_slippage
    #                       /standard_name="text"
    #                       /translation="text"
    #                       /transl_except=(pos:<location>,aa:<amino_acid>)
    #                       /transl_table =<integer>
    #                       /trans_splicing

    multi_valued = {"EC_number", "exception", "experiment", "function", "gene_synonym", "inference"}
    single_valued = {"allele", "artificial_location", "map", "number", "old_locus_tag", "operon", "phenotype", "pseudogene", "standard_name", "transl_except", "transl_table"}

    add_optional_qualifiers(bakta_cds, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers): presence of the qualifier means True
    for flag in ["circular_RNA", "pseudo", "ribosomal_slippage", "trans_splicing"]:
        if flag in qualifiers:
            bakta_cds[flag] = True

    if hypothetical:
        bakta_cds["hypothetical"] = True

    return bakta_cds

convert_exon_feature(feature, rec, id)

Convert a GenBank exon feature to a simplified Bakta-style 'exon' feature.

Parameters

Bio.SeqFeature

The exon feature from the GenBank record.

Bio.SeqRecord

The full GenBank record.

str

Unique feature ID.

Returns

dict Bakta-style exon feature.

Source code in src/baktfold/io/eukaryotic_to_json.py
def convert_exon_feature(feature, rec, id):
    """
    Build a simplified Bakta-style 'exon' entry from a GenBank exon feature.

    Parameters
    ----------
    feature : Bio.SeqFeature
        The exon feature from the GenBank record.
    rec : Bio.SeqRecord
        The full GenBank record; only ``rec.id`` is used.
    id : str
        Unique feature ID.

    Returns
    -------
    dict
        Bakta-style exon feature with 1-based inclusive coordinates, the feature's
        db_xref list (empty if absent) and any optional qualifiers that are present.
    """

    location = feature.location
    qualifiers = feature.qualifiers

    # Optional INSDC qualifiers for exon:
    #   /allele /db_xref /EC_number /experiment /function /gene /gene_synonym
    #   /inference /locus_tag /map /note /number /old_locus_tag /product
    #   /pseudo /pseudogene /standard_name /trans_splicing

    exon_entry = {
        "type": "exon",
        "sequence": rec.id,
        # Biopython start is 0-based, Bakta wants 1-based inclusive
        "start": int(location.start) + 1,
        # the exclusive 0-based end equals the inclusive 1-based stop
        "stop": int(location.end),
        "strand": "-" if location.strand != 1 else "+",
        "id": id,
        "db_xrefs": qualifiers.get("db_xref", []),
    }

    add_optional_qualifiers(
        exon_entry,
        qualifiers,
        {"allele", "gene", "locus_tag", "map", "number", "old_locus_tag", "operon", "pseudogene", "standard_name"},
        {"EC_number", "experiment", "function", "gene_synonym", "inference", "note"},
    )

    # Boolean-like qualifiers: their mere presence sets the flag
    exon_entry.update(
        {flag: True for flag in ("pseudo", "trans_splicing") if flag in qualifiers}
    )

    return exon_entry

convert_gene_feature(feature, rec, id)

Convert a Funannotate GenBank gene feature to Bakta-style JSON.

Parameters:

Name Type Description Default
feature

Bio.SeqFeature The gene feature from the GBK.

required
rec

str The record from the GBK.

required

Returns:

Name Type Description
dict

Bakta-style gene feature

Source code in src/baktfold/io/eukaryotic_to_json.py
def convert_gene_feature(feature, rec, id):
    """
    Convert a Funannotate GenBank gene feature to Bakta-style JSON.

    Parameters:
        feature: Bio.SeqFeature
            The gene feature from the GBK.
        rec: Bio.SeqRecord
            The record from the GBK; only ``rec.id`` is used.
        id: str
            Unique feature ID stored under "id".
    Returns:
        dict: Bakta-style gene feature. "locus" holds the feature's locus_tag, or a
        generated tag built from the start/stop coordinates when none is present.
    """

    location = feature.location
    qualifiers = feature.qualifiers

    # 1-based inclusive coordinates, as used by Bakta
    start = int(location.start) + 1
    stop = int(location.end)
    strand = "-" if location.strand != 1 else "+"

    # use the locus tag when there is one, otherwise generate a start/stop based tag
    tags = qualifiers.get("locus_tag")
    if not tags:
        logger.warning(f"No locus_tag found for feature {id}")
        locus_tag = f"{GENOME_RANDOM_BACKUP_LOCUSTAG_STR}_{start}_{stop}"
        logger.warning(f"Generating a locus_tag: {locus_tag}")
    else:
        locus_tag = tags[0]

    gene_entry = {
        "type": "gene",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "db_xrefs": [so.SO_GENE.id],
        "id": id,
        "locus": locus_tag,
    }

    # INSDC feature key "gene": region of biological interest identified as a gene
    # and for which a name has been assigned. The feature describes the interval of
    # DNA that corresponds to a genetic trait or phenotype and is not strictly bound
    # to its end positions.
    #
    # Optional qualifiers: /allele /db_xref /experiment /function /gene /gene_synonym
    #   /inference /locus_tag /map /note /old_locus_tag /operon /product /pseudo
    #   /pseudogene /phenotype /standard_name /trans_splicing
    #
    # NOTE(review): /gene, /product, /pseudogene and /db_xref are not copied into the
    # entry here - confirm this is intentional.
    add_optional_qualifiers(
        gene_entry,
        qualifiers,
        {"allele", "map", "old_locus_tag", "operon", "phenotype", "standard_name"},
        {"experiment", "function", "gene_synonym", "inference", "note"},
    )

    # Boolean-like qualifiers: their presence sets the flag to True
    for flag in ("pseudo", "trans_splicing"):
        if flag in qualifiers:
            gene_entry[flag] = True

    return gene_entry

convert_mat_peptide_feature(feature, rec, id)

Convert a mat_peptide feature to a Bakta-style feature.

mus musculus chrom 1 NC_000067

Parameters:

Name Type Description Default
feature

Bio.SeqFeature The mat_peptide feature from the GBK.

required
rec

Bio.SeqRecord The record containing the sequence.

required

Returns:

Name Type Description
dict

Bakta-style mat_peptide feature.

Source code in src/baktfold/io/eukaryotic_to_json.py
def convert_mat_peptide_feature(feature, rec, id):
    """
    Convert a mat_peptide feature to a Bakta-style feature.

    Example input: mus musculus chrom 1 NC_000067

    Parameters:
        feature: Bio.SeqFeature
            The mat_peptide feature from the GBK. Its qualifier lists are not modified.
        rec: Bio.SeqRecord
            The record containing the feature; only ``rec.id`` is used.
        id: str
            Unique feature ID stored under "id".

    Returns:
        dict: Bakta-style mat_peptide feature. "starts"/"stops" are lists of
        per-part coordinates for joined locations and None otherwise.
    """

    # NOTE: the record sequence is intentionally not materialised here - it is not
    # needed and copying a whole chromosome per feature is expensive.

    qualifiers = feature.qualifiers

    # Extract location
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop = int(feature.location.end)          # already inclusive after conversion

    # Handle CompoundLocation (join): record every part's 1-based coordinates
    starts = None
    stops = None
    if feature.location.__class__.__name__ == "CompoundLocation":
        starts = [int(part.start) + 1 for part in feature.location.parts]
        stops = [int(part.end) for part in feature.location.parts]

    so_code = so.SO_MAT_PEPTIDE.id

    # Copy the existing db_xref values so that adding the SO term does not
    # mutate the qualifier list of the input feature
    db_xrefs = list(qualifiers.get("db_xref", []))
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)

    mat_peptide_entry = {
        "type": "mat_peptide",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        # Join support
        "starts": starts,
        "stops": stops,
        "strand": strand,
        "id": id,
        "db_xrefs": db_xrefs
    }

    # INSDC feature key "mat_peptide": mature peptide or protein coding sequence;
    # coding sequence for the mature or final peptide or protein product following
    # post-translational modification; the location does not include the stop codon
    # (unlike the corresponding CDS).
    #
    # Optional qualifiers   /allele="text"
    #                       /db_xref="<database>:<identifier>"
    #                       /EC_number="text"
    #                       /experiment="[CATEGORY:]text"
    #                       /function="text"
    #                       /gene="text"
    #                       /gene_synonym="text"
    #                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
    #                       /locus_tag="text" (single token)
    #                       /map="text"
    #                       /note="text"
    #                       /old_locus_tag="text" (single token)
    #                       /product="text"
    #                       /pseudo
    #                       /pseudogene="TYPE"
    #                       /standard_name="text"

    multi_valued = {"EC_number", "experiment", "function", "gene_synonym", "inference", "note"}
    single_valued = {"allele", "gene", "locus_tag", "map", "number", "old_locus_tag", "operon", "pseudogene", "standard_name"}

    add_optional_qualifiers(mat_peptide_entry, qualifiers, single_valued, multi_valued)

    # No boolean-like flags are exported for mat_peptide.

    # Example feature:
    #  mat_peptide     complement(join(194724303..194724321,194744661..194744721,
    #                  194746996..194747031,194750435..194750476,
    #                  194757818..194757865,194759962..194760144,
    #                  194764890..194765087,194765856..194765944,
    #                  194767641..194767743,194768400..194768583))
    #                  /gene="Cd46"
    #                  /gene_synonym="Mcp"
    #                  /product="Membrane cofactor protein. /id=PRO_0000238971"
    #                  /note="propagated from UniProtKB/Swiss-Prot (O88174.1)"

    return mat_peptide_entry

convert_misc_feature(feature, rec, id)

Convert a misc feature to a Bakta-style feature.

Parameters:

Name Type Description Default
feature

Bio.SeqFeature The misc_feature feature from the GBK.

required
rec

Bio.SeqRecord The record containing the sequence.

required

Returns:

Name Type Description
dict

Bakta-style misc_feature feature.

Source code in src/baktfold/io/eukaryotic_to_json.py
def convert_misc_feature(feature, rec, id):
    """
    Convert a GenBank misc_feature feature to a Bakta-style feature.

    Parameters:
        feature: Bio.SeqFeature
            The misc_feature feature from the GBK.
        rec: Bio.SeqRecord
            The record containing the feature (only ``rec.id`` is read).
        id:
            Identifier stored under the "id" key of the returned entry.

    Returns:
        dict: Bakta-style misc_feature feature.
    """

    # INSDC feature table reference
    #
    # Feature Key           misc_feature
    #
    # Definition            region of biological interest which cannot be described
    #                       by any other feature key; a new or rare feature;
    #
    # Optional qualifiers   /allele="text"
    #                       /db_xref="<database>:<identifier>"
    #                       /experiment="[CATEGORY:]text"
    #                       /function="text"
    #                       /gene="text"
    #                       /gene_synonym="text"
    #                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
    #                       /locus_tag="text" (single token)
    #                       /map="text"
    #                       /note="text"
    #                       /number=unquoted text (single token)
    #                       /old_locus_tag="text" (single token)
    #                       /phenotype="text"
    #                       /product="text"
    #                       /pseudo
    #                       /pseudogene="TYPE"
    #                       /standard_name="text"
    #
    # Comment               this key should not be used when the need is merely to
    #                       mark a region in order to comment on it or to use it in
    #                       another feature's location
    #
    # Examples:
    #  misc_feature    join(78488668..78488692,78499322..78499359)
    #                  /gene="Mogat1"
    #                  /gene_synonym="0610030A14Rik; 1110064N14Rik; Dgat2l;
    #                  Dgat2l1; mDC2; MGAT1; WI1-2612I11.1"
    #                  /note="propagated from UniProtKB/Swiss-Prot (Q91ZV4.2);
    #                  transmembrane region"
    #
    #  misc_feature    78179419..78180585
    #                  /standard_name="Pax3 upstream hypaxial enhancer"
    #                  /note="Region: biological region; Derived by automated
    #                  computational analysis using gene prediction method:
    #                  RefSeqFE."
    #                  /function="regulatory_interactions: LOC107980439 | Pax3"
    #                  /db_xref="GeneID:107980442"

    location = feature.location
    qualifiers = feature.qualifiers

    # Extract location. Any strand other than 1 (including None) becomes "-".
    strand = "+" if location.strand == 1 else "-"
    start = int(location.start) + 1   # Bakta uses 1-based inclusive
    stop = int(location.end)          # already inclusive after conversion

    # Handle CompoundLocation (join): per-part coordinates, otherwise None
    starts = None
    stops = None
    if location.__class__.__name__ == "CompoundLocation":
        starts = [int(part.start) + 1 for part in location.parts]
        stops = [int(part.end) for part in location.parts]

    so_code = so.SO_MISC_REGION.id

    # Work on a copy of the db_xref qualifier so that adding the SO term
    # never mutates the qualifiers of the caller's feature.
    db_xrefs = list(qualifiers.get("db_xref", []))
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)

    misc_feature_entry = {
        "type": "misc_feature",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "id": id,

        # Join support
        "starts": starts,
        "stops": stops,

        # Multi-valued
        "db_xrefs": db_xrefs,
    }

    multi_valued = {"experiment", "function", "gene_synonym", "inference", "note", "phenotype"}
    single_valued = {"allele", "gene", "locus_tag", "map", "old_locus_tag", "operon", "product", "standard_name", "pseudogene"}

    add_optional_qualifiers(misc_feature_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers): only recorded when present
    if "pseudo" in qualifiers:
        misc_feature_entry["pseudo"] = True

    return misc_feature_entry

convert_misc_rna_feature(feature, rec, id)

Convert a GenBank misc_rna feature to a simplified Bakta-style 'misc_rna' feature.

Parameters:

Name Type Description Default
feature

Bio.SeqFeature The misc_RNA feature from the GBK.

required
rec

Bio.SeqRecord The full GenBank record containing the sequence.

required

Returns:

Name Type Description
dict

Simplified Bakta-style misc_RNA feature.

Source code in src/baktfold/io/eukaryotic_to_json.py
def convert_misc_rna_feature(feature, rec, id):
    """
    Convert a GenBank misc_RNA feature to a simplified Bakta-style feature.

    Parameters:
        feature: Bio.SeqFeature
            The misc_RNA feature from the GBK.
        rec: Bio.SeqRecord
            The full GenBank record containing the feature (only ``rec.id`` is read).
        id:
            Identifier stored under the "id" key of the returned entry.

    Returns:
        dict: Simplified Bakta-style misc_RNA feature.

    """

    # INSDC feature table reference
    #
    # Feature Key           misc_RNA
    #
    # Definition            any transcript or RNA product that cannot be defined by
    #                       other RNA keys (prim_transcript, precursor_RNA, mRNA,
    #                       5'UTR, 3'UTR, exon, CDS, sig_peptide, transit_peptide,
    #                       mat_peptide, intron, polyA_site, ncRNA, rRNA and tRNA);
    #
    # Optional qualifiers   /allele="text"
    #                       /db_xref="<database>:<identifier>"
    #                       /experiment="[CATEGORY:]text"
    #                       /function="text"
    #                       /gene="text"
    #                       /gene_synonym="text"
    #                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
    #                       /locus_tag="text" (single token)
    #                       /map="text"
    #                       /note="text"
    #                       /old_locus_tag="text" (single token)
    #                       /operon="text"
    #                       /product="text"
    #                       /pseudo
    #                       /pseudogene="TYPE"
    #                       /standard_name="text"
    #                       /trans_splicing
    #
    # Example (from Ensembl Genomes):
    #   misc_RNA        complement(437333..442742)
    #               /gene="YPL060C-A"
    #               /note="transposable_element"
    #               /standard_name="YPL060C-A"

    loc = feature.location
    quals = feature.qualifiers

    # Coordinates: 1-based inclusive start, end used unchanged as the stop.
    # Any strand other than 1 (including None) becomes "-".
    entry = {
        "type": "misc_RNA",
        "sequence": rec.id,
        "start": int(loc.start) + 1,
        "stop": int(loc.end),
        "strand": "+" if loc.strand == 1 else "-",
        # first /gene value, or None when the qualifier is absent
        "gene": quals.get("gene", [None])[0],
        "id": id,
    }

    list_keys = {"experiment", "function", "gene_synonym", "inference", "note"}
    scalar_keys = {"allele", "gene", "locus_tag", "map", "old_locus_tag", "operon", "product", "phenotype", "standard_name"}

    add_optional_qualifiers(entry, quals, scalar_keys, list_keys)

    # Flags (boolean-like qualifiers): only recorded when present
    for flag_name in ("pseudo", "trans_splicing"):
        if flag_name in quals:
            entry[flag_name] = True

    return entry

convert_mobile_element_feature(feature, rec, id)

Convert a GenBank mobile_element feature to a Bakta-style feature.

Source code in src/baktfold/io/eukaryotic_to_json.py
def convert_mobile_element_feature(feature, rec, id):
    """
    Convert a GenBank mobile_element feature to a Bakta-style feature.

    Parameters:
        feature: Bio.SeqFeature
            The mobile_element feature from the GBK.
        rec: Bio.SeqRecord
            The record containing the feature (only ``rec.id`` is read).
        id:
            Identifier stored under the "id" key of the returned entry.

    Returns:
        dict: Bakta-style mobile_element feature.

    Raises:
        ValueError: If the mandatory /mobile_element_type qualifier is missing.
    """

    # INSDC feature table reference
    #
    # Feature Key           mobile_element
    #
    # Definition            region of genome containing mobile elements;
    #
    # Mandatory qualifiers  /mobile_element_type="<mobile_element_type>
    #                       [:<mobile_element_name>]"
    #
    # Optional qualifiers   /allele="text"
    #                       /db_xref="<database>:<identifier>"
    #                       /experiment="[CATEGORY:]text"
    #                       /function="text"
    #                       /gene="text"
    #                       /gene_synonym="text"
    #                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
    #                       /locus_tag="text" (single token)
    #                       /map="text"
    #                       /note="text"
    #                       /old_locus_tag="text" (single token)
    #                       /rpt_family="text"
    #                       /rpt_type=<repeat_type>
    #                       /standard_name="text"
    #
    # Example:
    #  mobile_element  57369551..57369723
    #                  /note="Derived by automated computational analysis using
    #                  gene prediction method: RefSeqFE."
    #                  /mobile_element_type="SINE:AmnSINE1"
    #                  /db_xref="GeneID:106707176"

    location = feature.location
    qualifiers = feature.qualifiers

    # Extract location. Any strand other than 1 (including None) becomes "-".
    strand = "+" if location.strand == 1 else "-"
    start = int(location.start) + 1   # Bakta uses 1-based inclusive
    stop = int(location.end)          # already inclusive after conversion

    # Mandatory qualifier check (INSDC requirement)
    mobile_element_type = qualifiers.get("mobile_element_type", [None])[0]
    if mobile_element_type is None:
        raise ValueError(
            f"mobile_element feature {id} is missing mandatory "
            "/mobile_element_type qualifier"
        )

    so_code = so.SO_MOBILE_ELEMENT.id

    # Work on a copy of the db_xref qualifier so that adding the SO term
    # never mutates the qualifiers of the caller's feature.
    db_xrefs = list(qualifiers.get("db_xref", []))
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)

    mobile_element_entry = {
        "type": "mobile_element",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "id": id,
        "db_xrefs": db_xrefs,
        # Mandatory
        "mobile_element_type": mobile_element_type,
    }

    multi_valued = {"experiment", "function", "gene_synonym", "inference", "note"}
    single_valued = {"allele", "gene", "locus_tag", "map", "old_locus_tag", "standard_name", "rpt_family", "rpt_type"}

    add_optional_qualifiers(mobile_element_entry, qualifiers, single_valued, multi_valued)

    # No flag (boolean-like) qualifiers are recorded for mobile_element.

    return mobile_element_entry

convert_mrna_feature(feature, rec, id)

Convert a funannotate mrna feature to a Bakta-style feature.

Parameters:

Name Type Description Default
feature

Bio.SeqFeature The mRNA feature from the GBK.

required
rec

Bio.SeqRecord The record containing the sequence.

required

Returns:

Name Type Description
dict

Bakta-style mRNA feature.

Source code in src/baktfold/io/eukaryotic_to_json.py
def convert_mrna_feature(feature, rec, id):
    """
    Convert a GenBank mRNA feature (e.g. from funannotate) to a Bakta-style feature.

    Parameters:
        feature: Bio.SeqFeature
            The mRNA feature from the GBK.
        rec: Bio.SeqRecord
            The record containing the feature (only ``rec.id`` is read).
        id:
            Identifier stored under the "id" key of the returned entry.

    Returns:
        dict: Bakta-style mRNA feature.
    """

    # INSDC feature table reference
    #
    # Feature Key           mRNA
    #
    # Definition            messenger RNA; includes 5'untranslated region (5'UTR),
    #                       coding sequences (CDS, exon) and 3'untranslated region
    #                       (3'UTR);
    #
    # Optional qualifiers   /allele="text"
    #                       /artificial_location="[artificial_location_value]"
    #                       /circular_RNA
    #                       /db_xref="<database>:<identifier>"
    #                       /experiment="[CATEGORY:]text"
    #                       /function="text"
    #                       /gene="text"
    #                       /gene_synonym="text"
    #                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
    #                       /locus_tag="text" (single token)
    #                       /map="text"
    #                       /note="text"
    #                       /old_locus_tag="text" (single token)
    #                       /operon="text"
    #                       /product="text"
    #                       /pseudo
    #                       /pseudogene="TYPE"
    #                       /standard_name="text"
    #                       /trans_splicing

    loc = feature.location

    # Coordinates: 1-based inclusive start, end used unchanged as the stop.
    # Any strand other than 1 (including None) becomes "-".
    strand = "+" if loc.strand == 1 else "-"
    start = int(loc.start) + 1
    stop = int(loc.end)

    # Join support: per-part coordinates for a CompoundLocation, else None
    is_join = loc.__class__.__name__ == "CompoundLocation"
    starts = [int(part.start) + 1 for part in loc.parts] if is_join else None
    stops = [int(part.end) for part in loc.parts] if is_join else None

    quals = feature.qualifiers

    # Use the first locus_tag when one is present and non-empty; otherwise
    # synthesise one from the backup prefix and the coordinates.
    if quals.get("locus_tag") if "locus_tag" in quals else False:
        locus_tag = quals['locus_tag'][0]
    else:
        logger.warning(f"No locus_tag found for feature {id}")
        locus_tag = f"{GENOME_RANDOM_BACKUP_LOCUSTAG_STR}_{start}_{stop}"
        logger.warning(f"Generating a locus_tag: {locus_tag}")

    entry = {
        "type": "mRNA",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "starts": starts,
        "stops": stops,
        "strand": strand,
        "db_xrefs": [so.SO_MRNA.id],
        "id": id,
        "locus": locus_tag,
    }

    list_keys = {"experiment", "function", "gene_synonym", "inference", "note"}
    scalar_keys = {"allele", "artificial_location", "gene", "locus_tag", "map", "old_locus_tag", "operon", "phenotype", "product", "standard_name"}

    add_optional_qualifiers(entry, quals, scalar_keys, list_keys)

    # Flags (boolean-like qualifiers): only recorded when present
    for flag_name in ("circular_RNA", "pseudo", "trans_splicing"):
        if flag_name in quals:
            entry[flag_name] = True

    return entry

convert_ncrna_feature(feature, rec, id)

Convert a ncrna feature to a Bakta-style feature.

Parameters:

Name Type Description Default
feature

Bio.SeqFeature The ncRNA feature from the GBK.

required
rec

Bio.SeqRecord The record containing the sequence.

required

Returns:

Name Type Description
dict

Bakta-style ncRNA feature.

Source code in src/baktfold/io/eukaryotic_to_json.py
def convert_ncrna_feature(feature, rec, id):
    """
    Convert a GenBank ncRNA feature to a Bakta-style feature.

    Parameters:
        feature: Bio.SeqFeature
            The ncRNA feature from the GBK.
        rec: Bio.SeqRecord
            The record containing the feature (only ``rec.id`` is read).
        id:
            Identifier stored under the "id" key of the returned entry.

    Returns:
        dict: Bakta-style ncRNA feature.

    Raises:
        ValueError: If the mandatory /ncRNA_class qualifier is missing.
    """

    # INSDC feature table reference
    #
    # Feature Key           ncRNA
    #
    # Definition            a non-protein-coding gene, other than ribosomal RNA and
    #                       transfer RNA, the functional molecule of which is the RNA
    #                       transcript;
    #
    # Mandatory qualifiers  /ncRNA_class="TYPE"
    #
    # Optional qualifiers   /allele="text"
    #                       /db_xref="<database>:<identifier>"
    #                       /experiment="[CATEGORY:]text"
    #                       /function="text"
    #                       /gene="text"
    #                       /gene_synonym="text"
    #                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
    #                       /locus_tag="text" (single token)
    #                       /map="text"
    #                       /note="text"
    #                       /old_locus_tag="text" (single token)
    #                       /operon="text"
    #                       /product="text"
    #                       /pseudo
    #                       /pseudogene="TYPE"
    #                       /standard_name="text"
    #                       /trans_splicing
    #
    # Example               /ncRNA_class="miRNA"
    #                       /ncRNA_class="siRNA"
    #                       /ncRNA_class="scRNA"
    #
    # Comment               the ncRNA feature is not used for ribosomal and transfer
    #                       RNA annotation, for which the rRNA and tRNA feature keys
    #                       should be used, respectively;
    #
    # Example:
    #  ncRNA           join(189791085..189791793,189798997..189799081,
    #                  189819873..189820364,189821703..189822337)
    #                  /ncRNA_class="lncRNA"
    #                  /gene="Gm30446"
    #                  /product="predicted gene, 30446, transcript variant X6"
    #                  /note="Derived by automated computational analysis using
    #                  gene prediction method: Gnomon. Supporting evidence
    #                  includes similarity to: 100% coverage of the annotated
    #                  genomic feature by RNAseq alignments, including 2 samples
    #                  with support for all annotated introns"
    #                  /transcript_id="XR_001779629.1"
    #                  /db_xref="GeneID:102632350"
    #                  /db_xref="MGI:MGI:5589605"

    location = feature.location
    qualifiers = feature.qualifiers

    # Extract location. Any strand other than 1 (including None) becomes "-".
    strand = "+" if location.strand == 1 else "-"
    start = int(location.start) + 1   # Bakta uses 1-based inclusive
    stop = int(location.end)          # already inclusive after conversion

    # Handle CompoundLocation (join): per-part coordinates, otherwise None
    starts = None
    stops = None
    if location.__class__.__name__ == "CompoundLocation":
        starts = [int(part.start) + 1 for part in location.parts]
        stops = [int(part.end) for part in location.parts]

    so_code = so.SO_NCRNA_GENE.id

    # Work on a copy of the db_xref qualifier so that adding the SO term
    # never mutates the qualifiers of the caller's feature (previously the
    # input was modified even when the function went on to raise below).
    db_xrefs = list(qualifiers.get("db_xref", []))
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)

    # Mandatory qualifier (INSDC requirement)
    ncrna_class = qualifiers.get("ncRNA_class", [None])[0]
    if ncrna_class is None:
        raise ValueError(
            f"ncRNA feature {id} is missing mandatory /ncRNA_class qualifier"
        )

    ncrna_entry = {
        "type": "ncRNA",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "id": id,

        # Join support
        "starts": starts,
        "stops": stops,

        # Mandatory
        "ncRNA_class": ncrna_class,

        # Multi-valued qualifiers
        "db_xrefs": db_xrefs,
    }

    multi_valued = {"experiment", "function", "gene_synonym", "inference", "note"}
    single_valued = {"allele", "gene", "locus_tag", "map", "old_locus_tag", "operon", "product", "standard_name", "pseudogene"}

    add_optional_qualifiers(ncrna_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers): only recorded when present
    for flag in ("pseudo", "trans_splicing"):
        if flag in qualifiers:
            ncrna_entry[flag] = True

    return ncrna_entry

convert_precursor_rna_feature(feature, rec, id)

Convert a GenBank precursor_RNA feature to a Bakta-style feature.

Source code in src/baktfold/io/eukaryotic_to_json.py
def convert_precursor_rna_feature(feature, rec, id):
    """
    Convert a GenBank precursor_RNA feature to a Bakta-style feature.

    Parameters:
        feature: Bio.SeqFeature
            The precursor_RNA feature from the GBK.
        rec: Bio.SeqRecord
            The record containing the feature (only ``rec.id`` is read).
        id:
            Identifier stored under the "id" key of the returned entry.

    Returns:
        dict: Bakta-style precursor_RNA feature.
    """

    # INSDC feature table reference
    #
    # Feature Key           precursor_RNA
    #
    # Definition            any RNA species that is not yet the mature RNA product;
    #                       may include ncRNA, rRNA, tRNA, 5' untranslated region
    #                       (5'UTR), coding sequences (CDS, exon), intervening
    #                       sequences (intron) and 3' untranslated region (3'UTR);
    #
    # Optional qualifiers   /allele="text"
    #                       /db_xref="<database>:<identifier>"
    #                       /experiment="[CATEGORY:]text"
    #                       /function="text"
    #                       /gene="text"
    #                       /gene_synonym="text"
    #                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
    #                       /locus_tag="text" (single token)
    #                       /map="text"
    #                       /note="text"
    #                       /old_locus_tag="text" (single token)
    #                       /operon="text"
    #                       /product="text"
    #                       /standard_name="text"
    #                       /trans_splicing
    #
    # Example:
    #  precursor_RNA   194719348..194719428
    #                  /gene="Mir29b-2"
    #                  /gene_synonym="mir-29b-2; Mirn29b-2"
    #                  /product="microRNA 29b-2"
    #                  /note="Derived by automated computational analysis using
    #                  gene prediction method: BestRefSeq."
    #                  /transcript_id="NR_029809.1"
    #                  /db_xref="GeneID:723963"
    #                  /db_xref="MGI:MGI:3619047"
    #                  /db_xref="miRBase:MI0000712"

    location = feature.location
    qualifiers = feature.qualifiers

    # Extract location. Any strand other than 1 (including None) becomes "-".
    strand = "+" if location.strand == 1 else "-"
    start = int(location.start) + 1   # Bakta uses 1-based inclusive
    stop = int(location.end)          # already inclusive after conversion

    so_code = so.SO_PRECURSOR_RNA.id

    # Work on a copy of the db_xref qualifier so that adding the SO term
    # never mutates the qualifiers of the caller's feature.
    db_xrefs = list(qualifiers.get("db_xref", []))
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)

    precursor_rna_entry = {
        "type": "precursor_RNA",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "db_xrefs": db_xrefs,
        "id": id,
    }

    multi_valued = {"experiment", "function", "gene_synonym", "inference", "note"}
    single_valued = {"allele", "gene", "locus_tag", "map", "operon", "old_locus_tag", "product", "standard_name"}

    add_optional_qualifiers(precursor_rna_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers): only recorded when present
    if "trans_splicing" in qualifiers:
        precursor_rna_entry["trans_splicing"] = True

    return precursor_rna_entry

convert_proprotein_propeptide_feature(feature, rec, id)

Convert a proprotein or propeptide feature to a Bakta-style feature.

Parameters:

Name Type Description Default
feature

Bio.SeqFeature The proprotein/propeptide feature from the GBK.

required
rec

Bio.SeqRecord The record containing the sequence.

required

Returns:

Name Type Description
dict

Bakta-style proprotein feature.

Source code in src/baktfold/io/eukaryotic_to_json.py
def convert_proprotein_propeptide_feature(feature, rec, id):
    """
    Convert a proprotein or propeptide feature to a Bakta-style feature.

    Parameters:
        feature: Bio.SeqFeature
            The proprotein/propeptide feature from the GBK.
        rec: Bio.SeqRecord
            The record containing the feature (only ``rec.id`` is read).
        id:
            Identifier stored under the "id" key of the returned entry.

    Returns:
        dict: Bakta-style feature with type "propeptide".
    """

    # INSDC feature table reference
    #
    # Feature Key           propeptide
    #
    # Definition            propeptide coding sequence; coding sequence for the domain of a
    #                       proprotein that is cleaved to form the mature protein product.
    #
    # Optional qualifiers   /allele="text"
    #                       /db_xref="<database>:<identifier>"
    #                       /experiment="[CATEGORY:]text"
    #                       /function="text"
    #                       /gene="text"
    #                       /gene_synonym="text"
    #                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
    #                       /locus_tag="text" (single token)
    #                       /map="text"
    #                       /note="text"
    #                       /old_locus_tag="text" (single token)
    #                       /product="text"
    #                       /pseudo
    #                       /pseudogene="TYPE"
    #                       /standard_name="text"
    #
    # Example:
    #  proprotein      join(171053237..171053367,171053712..171053832)
    #                  /gene="Apoa2"
    #                  /gene_synonym="Alp-2; Apo-AII; Apoa-2; ApoA-II; ApoAII;
    #                  Hdl-1"
    #                  /product="apolipoprotein A-II proprotein"

    location = feature.location
    qualifiers = feature.qualifiers

    # Extract location. Any strand other than 1 (including None) becomes "-".
    strand = "+" if location.strand == 1 else "-"
    start = int(location.start) + 1   # Bakta uses 1-based inclusive
    stop = int(location.end)          # already inclusive after conversion

    # Handle CompoundLocation (join): per-part coordinates, otherwise None
    starts = None
    stops = None
    if location.__class__.__name__ == "CompoundLocation":
        starts = [int(part.start) + 1 for part in location.parts]
        stops = [int(part.end) for part in location.parts]

    so_code = so.SO_PROPEPTIDE.id

    # Work on a copy of the db_xref qualifier so that adding the SO term
    # never mutates the qualifiers of the caller's feature.
    db_xrefs = list(qualifiers.get("db_xref", []))
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)

    propeptide_entry = {
        "type": "propeptide",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "id": id,

        # Join support
        "starts": starts,
        "stops": stops,

        # Multi-valued. Use the list built above: previously the raw qualifier
        # was stored here, so the SO term was lost whenever the feature had
        # no /db_xref qualifier.
        "db_xrefs": db_xrefs,
    }

    multi_valued = {"experiment", "function", "gene_synonym", "inference", "note"}
    single_valued = {"allele", "gene", "locus_tag", "map", "old_locus_tag", "product", "standard_name", "pseudogene"}

    add_optional_qualifiers(propeptide_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers): only recorded when present
    if "pseudo" in qualifiers:
        propeptide_entry["pseudo"] = True

    return propeptide_entry

convert_protein_bind_feature(feature, rec, id)

Convert a GenBank protein_bind feature to a Bakta-style feature.

Source code in src/baktfold/io/eukaryotic_to_json.py
def convert_protein_bind_feature(feature, rec, id):
    """
    Convert a GenBank protein_bind feature to a Bakta-style feature.

    Parameters:
        feature: Bio.SeqFeature
            The protein_bind feature from the GBK.
        rec: Bio.SeqRecord
            The record containing the feature (only ``rec.id`` is read).
        id:
            Identifier stored under the "id" key of the returned entry.

    Returns:
        dict: Bakta-style protein_bind feature.

    Raises:
        ValueError: If the mandatory /bound_moiety qualifier is missing.
    """

    # INSDC feature table reference
    #
    # Feature Key           protein_bind
    #
    # Definition            non-covalent protein binding site on nucleic acid;
    #
    # Mandatory qualifiers  /bound_moiety="text"
    #
    # Optional qualifiers   /allele="text"
    #                       /db_xref="<database>:<identifier>"
    #                       /experiment="[CATEGORY:]text"
    #                       /function="text"
    #                       /gene="text"
    #                       /gene_synonym="text"
    #                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
    #                       /locus_tag="text" (single token)
    #                       /map="text"
    #                       /note="text"
    #                       /old_locus_tag="text" (single token)
    #                       /operon="text"
    #                       /standard_name="text"
    #
    # Comment               note that feature key regulatory with /regulatory_class="ribosome_binding_site"
    #                       should be used for ribosome binding sites.

    location = feature.location
    qualifiers = feature.qualifiers

    # Extract location. Any strand other than 1 (including None) becomes "-".
    strand = "+" if location.strand == 1 else "-"
    start = int(location.start) + 1   # Bakta uses 1-based inclusive
    stop = int(location.end)          # already inclusive after conversion

    # Mandatory qualifier
    bound_moiety = qualifiers.get("bound_moiety", [None])[0]
    if bound_moiety is None:
        raise ValueError(
            f"protein_bind feature {id} is missing mandatory /bound_moiety qualifier"
        )

    so_code = so.SO_PROTEINBIND.id

    # Work on a copy of the db_xref qualifier so that adding the SO term
    # never mutates the qualifiers of the caller's feature.
    db_xrefs = list(qualifiers.get("db_xref", []))
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)

    protein_bind_entry = {
        "type": "protein_bind",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "bound_moiety": bound_moiety,
        "db_xrefs": db_xrefs,
        "id": id,
    }

    multi_valued = {"experiment", "function", "gene_synonym", "inference", "note"}
    single_valued = {"allele", "gene", "locus_tag", "map", "operon", "old_locus_tag", "product", "standard_name"}

    add_optional_qualifiers(protein_bind_entry, qualifiers, single_valued, multi_valued)

    # No flag (boolean-like) qualifiers are recorded for protein_bind.

    return protein_bind_entry

convert_regulatory_feature(feature, rec, id)

Convert a GenBank regulatory feature to a Bakta-style feature.

Source code in src/baktfold/io/eukaryotic_to_json.py
def convert_regulatory_feature(feature, rec, id):
    """
    Convert a GenBank regulatory feature to a Bakta-style feature.

    Parameters:
        feature: Bio.SeqFeature
            The regulatory feature from the GBK.
        rec: Bio.SeqRecord
            The record containing the feature (only ``rec.id`` is read).
        id:
            Identifier stored under the "id" key of the returned entry.

    Returns:
        dict: Bakta-style regulatory feature.

    Raises:
        ValueError: If the mandatory /regulatory_class qualifier is missing.
    """

    # INSDC feature table reference
    #
    # Feature Key           regulatory
    #
    # Definition            any region of sequence that functions in the regulation of
    #                       transcription, translation, replication, recombination, or chromatin structure;
    #
    # Mandatory qualifiers  /regulatory_class="TYPE"
    #
    # Optional qualifiers   /allele="text"
    #                       /bound_moiety="text"
    #                       /db_xref="<database>:<identifier>"
    #                       /experiment="[CATEGORY:]text"
    #                       /function="text"
    #                       /gene="text"
    #                       /gene_synonym="text"
    #                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
    #                       /locus_tag="text" (single token)
    #                       /map="text"
    #                       /note="text"
    #                       /old_locus_tag="text" (single token)
    #                       /operon="text"
    #                       /phenotype="text"
    #                       /pseudo
    #                       /pseudogene="TYPE"
    #                       /standard_name="text"
    #
    # Comment               This feature has replaced the following Feature Keys on 15-DEC-2014:
    #                       enhancer, promoter, CAAT_signal, TATA_signal, -35_signal, -10_signal,
    #                       RBS, GC_signal, polyA_signal, attenuator, terminator, misc_signal.
    #
    # Example:
    #  regulatory      195030925..195032349
    #                  /regulatory_class="enhancer"
    #                  /experiment="EXISTENCE:reporter gene assay evidence
    #                  [ECO:0000049][PMID:32912294]"
    #                  /note="C2 STARR-seq-only enhancer starr_03508"
    #                  /function="activates a minimal SCP1 promoter by STARR-seq
    #                  in ground-state (2iL) and metastable (SL) mouse embryonic
    #                  stem cells {active_cell/tissue: mESC(E14 +2i+LIF or
    #                  +serum+LIF)}"
    #                  /db_xref="GeneID:131296982"

    location = feature.location
    qualifiers = feature.qualifiers

    # Extract location. Any strand other than 1 (including None) becomes "-".
    strand = "+" if location.strand == 1 else "-"
    start = int(location.start) + 1   # Bakta uses 1-based inclusive
    stop = int(location.end)          # already inclusive after conversion

    # Mandatory qualifier
    regulatory_class = qualifiers.get("regulatory_class", [None])[0]
    if regulatory_class is None:
        raise ValueError(
            f"regulatory feature {id} is missing mandatory /regulatory_class qualifier"
        )

    so_code = so.SO_REGULATORY_REGION.id

    # Work on a copy of the db_xref qualifier so that adding the SO term
    # never mutates the qualifiers of the caller's feature.
    db_xrefs = list(qualifiers.get("db_xref", []))
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)

    regulatory_entry = {
        "type": "regulatory",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "regulatory_class": regulatory_class,
        "db_xrefs": db_xrefs,
        "id": id,
    }

    multi_valued = {"experiment", "function", "gene_synonym", "inference", "note"}
    single_valued = {"allele", "bound_moiety", "gene", "locus_tag", "map", "operon", "old_locus_tag", "phenotype", "product", "pseudogene", "standard_name"}

    add_optional_qualifiers(regulatory_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers): only recorded when present
    if "pseudo" in qualifiers:
        regulatory_entry["pseudo"] = True

    return regulatory_entry

convert_repeat_region_feature(feature, rec, id)

Convert a Prokka GenBank repeat_region (CRISPR) feature to a simplified Bakta-style feature.

Parameters:

Name Type Description Default
feature

Bio.SeqFeature The repeat_region feature (crispr) from the Prokka GBK.

required
rec

Bio.SeqRecord The full GenBank record containing the sequence.

required

Returns:

Name Type Description
dict

Simplified Bakta-style CRISPR feature.

Source code in src/baktfold/io/eukaryotic_to_json.py
def convert_repeat_region_feature(feature, rec, id):
    """
    Build a simplified Bakta-style entry from a GenBank repeat_region feature.

    The entry is always unstranded (strand ".") and its nucleotide sequence is
    sliced from the forward strand of the record.

    Parameters:
        feature: Bio.SeqFeature
            The repeat_region feature (e.g. a CRISPR array) from the GBK.
        rec: Bio.SeqRecord
            The full GenBank record containing the sequence.
        id:
            Identifier stored under the "id" key of the entry.

    Returns:
        dict: Simplified Bakta-style repeat_region feature.

    Note:
        When the feature already carries a db_xref list, the repeat SO term is
        appended to that same list, i.e. the feature's qualifiers are updated
        in place.
    """

    quals = feature.qualifiers

    def first_value(key):
        # First value of a qualifier, or None when the qualifier is absent.
        return quals.get(key, [None])[0]

    # GenBank locations are 0-based half-open; Bakta uses 1-based inclusive.
    begin = int(feature.location.start) + 1
    end = int(feature.location.end)

    product = first_value("note")
    family = first_value("rpt_family")
    repeat_type = first_value("rpt_type")
    repeat_unit = first_value("rpt_unit_seq")

    # Forward-strand slice of the record covering the repeat region.
    region_nt = str(rec.seq)[begin - 1:end]

    # INSDC optional qualifiers for repeat_region:
    #   /allele /db_xref /experiment /function /gene /gene_synonym /inference
    #   /locus_tag /map /note /old_locus_tag /rpt_family /rpt_type
    #   /rpt_unit_range /rpt_unit_seq /satellite /standard_name

    # Reuse the feature's own db_xref list when present (falling back to a
    # fresh list holding just the repeat SO term) and make sure the SO term
    # is part of it.
    repeat_so = so.SO_REPEAT.id
    xrefs = quals["db_xref"] if "db_xref" in quals else [repeat_so]
    if repeat_so not in xrefs:
        xrefs.append(repeat_so)

    # Minimal Bakta-like structure; key order is kept stable for the writers.
    entry = {
        "type": "repeat_region",
        "sequence": rec.id,
        "start": begin,
        "stop": end,
        "strand": ".",  # matches Bakta and is required
        "family": family,  # e.g. "LINE1"
        "rpt_type": repeat_type,
        "repeat_unit": repeat_unit,  # consensus repeat if CRISPR
        "product": product,  # taken from /note; needed for GFF writing
        "nt": region_nt,  # needed for the .ffn write-out
        "id": id,
        "db_xrefs": xrefs,
    }

    optional_multi = {"experiment", "function", "gene_synonym", "inference", "note"}
    optional_single = {
        "satellite", "gene", "locus_tag", "map", "old_locus_tag",
        "operon", "phenotype", "product", "standard_name",
    }
    # Helper defined elsewhere in this module; presumably copies the listed
    # qualifiers onto the entry when present.
    add_optional_qualifiers(entry, quals, optional_single, optional_multi)

    return entry

convert_rrna_feature(feature, rec, id)

Convert a GenBank rRNA feature to a Bakta-style feature.

Source code in src/baktfold/io/eukaryotic_to_json.py
def convert_rrna_feature(feature, rec, id):
    """
    Build a Bakta-style rRNA entry from a GenBank rRNA feature.

    Parameters:
        feature: Bio.SeqFeature
            The rRNA feature from the GBK.
        rec: Bio.SeqRecord
            The record the feature belongs to; only its id is used.
        id:
            Identifier stored under the "id" key of the entry.

    Returns:
        dict: Bakta-style rRNA feature.

    Note:
        When the feature already carries a db_xref list, the rRNA SO term is
        appended to that same list, i.e. the feature's qualifiers are updated
        in place.
    """

    quals = feature.qualifiers

    # Anything other than an explicit forward strand is reported as "-".
    if feature.location.strand == 1:
        strand_symbol = "+"
    else:
        strand_symbol = "-"

    # GenBank locations are 0-based half-open; Bakta uses 1-based inclusive.
    begin = int(feature.location.start) + 1
    end = int(feature.location.end)

    # Reuse the feature's own db_xref list when present and make sure the
    # rRNA SO term is part of it.
    rrna_so = so.SO_RRNA.id
    xrefs = quals["db_xref"] if "db_xref" in quals else [rrna_so]
    if rrna_so not in xrefs:
        xrefs.append(rrna_so)

    # INSDC optional qualifiers for rRNA:
    #   /allele /db_xref /experiment /function /gene /gene_synonym /inference
    #   /locus_tag /map /note /old_locus_tag /operon /product /pseudo
    #   /pseudogene /standard_name
    # INSDC comment: rRNA sizes should be annotated with the /product qualifier.

    entry = {
        "type": "rRNA",
        "sequence": rec.id,
        "start": begin,
        "stop": end,
        "strand": strand_symbol,
        "db_xrefs": xrefs,
        "id": id,
    }

    optional_multi = {"experiment", "function", "gene_synonym", "inference", "note"}
    optional_single = {
        "allele", "gene", "locus_tag", "map", "operon",
        "old_locus_tag", "product", "pseudogene", "standard_name",
    }
    # Helper defined elsewhere in this module; presumably copies the listed
    # qualifiers onto the entry when present.
    add_optional_qualifiers(entry, quals, optional_single, optional_multi)

    # /pseudo is a value-less qualifier: its mere presence is the signal.
    if "pseudo" in quals:
        entry["pseudo"] = True

    # Example input:
    #  rRNA            46413357..46413475
    #                  /gene="n-R5s211"
    #                  /product="5S ribosomal RNA"
    #                  /inference="COORDINATES: profile:INFERNAL:1.1.1"
    #                  /db_xref="GeneID:115487577"
    #                  /db_xref="RFAM:RF00001"

    return entry

convert_sig_peptide_feature(feature, rec, id)

Convert a sig_peptide feature to a Bakta-style feature.

mus musculus chrom 1 NC_000067

Parameters:

Name Type Description Default
feature

Bio.SeqFeature The sig_peptide feature from the GBK.

required
rec

Bio.SeqRecord The record containing the sequence.

required

Returns:

Name Type Description
dict

Bakta-style sig_peptide feature.

Source code in src/baktfold/io/eukaryotic_to_json.py
def convert_sig_peptide_feature(feature, rec, id):
    """
    Convert a sig_peptide feature to a Bakta-style feature.

    Example source: mus musculus chrom 1 NC_000067

    Parameters:
        feature: Bio.SeqFeature
            The sig_peptide feature from the GBK.
        rec: Bio.SeqRecord
            The record containing the sequence; only its id is used.
        id:
            Identifier stored under the "id" key of the entry.

    Returns:
        dict: Bakta-style sig_peptide feature. "starts"/"stops" hold the
        1-based inclusive coordinates of every part of a joined location and
        are None for a simple location. "db_xrefs" always contains the
        signal-peptide SO term.
    """

    location = feature.location
    qualifiers = feature.qualifiers

    # Anything other than an explicit forward strand is reported as "-".
    strand = "+" if location.strand == 1 else "-"
    # GenBank locations are 0-based half-open; Bakta uses 1-based inclusive.
    start = int(location.start) + 1
    stop = int(location.end)

    # Handle CompoundLocation (join): record every part separately.
    starts = None
    stops = None
    if location.__class__.__name__ == "CompoundLocation":
        starts = [int(part.start) + 1 for part in location.parts]
        stops = [int(part.end) for part in location.parts]

    # Work on a copy so the input feature's qualifiers are left untouched,
    # and make sure the signal-peptide SO term is always present (previously
    # it was lost whenever the feature had no db_xref qualifier).
    so_code = so.SO_SIGNAL_PEPTIDE.id
    db_xrefs = list(qualifiers.get("db_xref", []))
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)

    sig_peptide_entry = {
        "type": "sig_peptide",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "id": id,

        # Join support
        "starts": starts,
        "stops": stops,

        # Multi-valued
        "db_xrefs": db_xrefs,
    }

    # INSDC optional qualifiers for sig_peptide:
    #   /allele /db_xref /experiment /function /gene /gene_synonym /inference
    #   /locus_tag /map /note /old_locus_tag /product /pseudo /pseudogene
    #   /standard_name

    multi_valued = {"experiment", "function", "gene_synonym", "inference", "note"}
    single_valued = {
        "allele", "gene", "locus_tag", "map", "operon", "old_locus_tag",
        "phenotype", "product", "pseudogene", "standard_name",
    }
    # Helper defined elsewhere in this module; presumably copies the listed
    # qualifiers onto the entry when present.
    add_optional_qualifiers(sig_peptide_entry, qualifiers, single_valued, multi_valued)

    # /pseudo is a value-less qualifier: its mere presence is the signal.
    if "pseudo" in qualifiers:
        sig_peptide_entry["pseudo"] = True

    # Example input:
    #  sig_peptide     complement(join(194768584..194768588,
    #                  194774407..194774533))
    #                  /gene="Cd46"
    #                  /gene_synonym="Mcp"
    #                  /inference="COORDINATES: ab initio prediction:SignalP:6.0"

    return sig_peptide_entry

convert_transit_peptide_feature(feature, rec, id)

Convert a transit_peptide feature to a Bakta-style feature.

mus musculus chrom 1 NC_000067

Parameters:

Name Type Description Default
feature

Bio.SeqFeature The transit_peptide feature from the GBK.

required
rec

Bio.SeqRecord The record containing the sequence.

required

Returns:

Name Type Description
dict

Bakta-style transit_peptide feature.

Source code in src/baktfold/io/eukaryotic_to_json.py
def convert_transit_peptide_feature(feature, rec, id):
    """
    Convert a transit_peptide feature to a Bakta-style feature.

    Example source: mus musculus chrom 1 NC_000067

    Parameters:
        feature: Bio.SeqFeature
            The transit_peptide feature from the GBK.
        rec: Bio.SeqRecord
            The record containing the sequence; only its id is used.
        id:
            Identifier stored under the "id" key of the entry.

    Returns:
        dict: Bakta-style transit_peptide feature. "starts"/"stops" hold the
        1-based inclusive coordinates of every part of a joined location and
        are None for a simple location. "db_xrefs" always contains the
        transit-peptide SO term.
    """

    location = feature.location
    qualifiers = feature.qualifiers

    # Anything other than an explicit forward strand is reported as "-".
    strand = "+" if location.strand == 1 else "-"
    # GenBank locations are 0-based half-open; Bakta uses 1-based inclusive.
    start = int(location.start) + 1
    stop = int(location.end)

    # Handle CompoundLocation (join): record every part separately.
    starts = None
    stops = None
    if location.__class__.__name__ == "CompoundLocation":
        starts = [int(part.start) + 1 for part in location.parts]
        stops = [int(part.end) for part in location.parts]

    # Work on a copy so the input feature's qualifiers are left untouched,
    # and make sure the transit-peptide SO term is always present (previously
    # it was lost whenever the feature had no db_xref qualifier).
    so_code = so.SO_TRANSIT_PEPTIDE.id
    db_xrefs = list(qualifiers.get("db_xref", []))
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)

    transit_peptide_entry = {
        "type": "transit_peptide",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "id": id,

        # Join support
        "starts": starts,
        "stops": stops,

        # Multi-valued
        "db_xrefs": db_xrefs,
    }

    # INSDC optional qualifiers for transit_peptide:
    #   /allele /db_xref /experiment /function /gene /gene_synonym /inference
    #   /locus_tag /map /note /old_locus_tag /product /pseudo /pseudogene
    #   /standard_name

    multi_valued = {"experiment", "function", "gene_synonym", "inference", "note"}
    single_valued = {
        "allele", "gene", "locus_tag", "map", "operon", "old_locus_tag",
        "phenotype", "product", "pseudogene", "standard_name",
    }
    # Helper defined elsewhere in this module; presumably copies the listed
    # qualifiers onto the entry when present.
    add_optional_qualifiers(transit_peptide_entry, qualifiers, single_valued, multi_valued)

    # /pseudo is a value-less qualifier: its mere presence is the signal.
    if "pseudo" in qualifiers:
        transit_peptide_entry["pseudo"] = True

    # Example input:
    #  transit_peptide complement(join(180006550..180006849,
    #                  180009627..180009803))
    #                  /gene="Coq8a"
    #                  /gene_synonym="4632432J16Rik; Adck3; Cabc1; mKIAA0451"
    #                  /note="Mitochondrion.
    #                  /evidence=ECO:0000250|UniProtKB:Q8NI60; propagated from
    #                  UniProtKB/Swiss-Prot (Q60936.2)"

    return transit_peptide_entry

convert_trna_feature(feature, seq_record, id)

Convert a funannotate tRNA SeqFeature to a Bakta tRNA JSON entry.

Source code in src/baktfold/io/eukaryotic_to_json.py
def convert_trna_feature(feature, seq_record, id):
    """
    Convert a funannotate tRNA SeqFeature to a Bakta tRNA JSON entry.

    Parameters:
        feature: Bio.SeqFeature
            The tRNA feature from the GBK.
        seq_record: Bio.SeqRecord
            The record containing the sequence; used for its id and to
            extract the feature's nucleotide sequence.
        id:
            Identifier stored under the "id" key of the entry (also used in
            the warning emitted when the feature has no locus_tag).

    Returns:
        dict: Bakta-style tRNA feature. "amino_acid" and "anti_codon" are
        None when they cannot be derived from /product or /note; in that
        case "gene" is "trn?" and no SO term is added to "db_xrefs".
    """

    qualifiers = feature.qualifiers

    # ------------ Location ------------
    # Anything other than an explicit forward strand is reported as "-".
    strand = "+" if feature.location.strand == 1 else "-"
    # GenBank locations are 0-based half-open; Bakta uses 1-based inclusive.
    start = int(feature.location.start) + 1
    stop = int(feature.location.end)

    # ------------ Extract nt sequence ------------
    nt = str(feature.extract(seq_record.seq))

    # ------------ Basic qualifiers ------------
    product = qualifiers.get("product", [None])[0]

    # fall back to start_stop if there is no locus tag
    if 'locus_tag' in qualifiers and qualifiers['locus_tag']:
        locus_tag = qualifiers['locus_tag'][0]
    else:
        logger.warning(f"No locus_tag found for feature {id}")
        locus_tag = f"{GENOME_RANDOM_BACKUP_LOCUSTAG_STR}_{start}_{stop}"
        logger.warning(f"Generating a locus_tag: {locus_tag}")

    # ------------ amino acid ------------
    # Product examples: "tRNA-Trp", "tRNA-Leu"
    amino_acid = None
    if product and product.startswith("tRNA-"):
        amino_acid = product.split("-")[1]

    # ------------ anticodon ------------
    # Anticodons are carried in notes, e.g. "tRNA-Ser(gga)". A matching note
    # overrides the amino acid derived from /product.
    anti_codon = None
    for note in qualifiers.get("note", []):
        compact = note.replace(" ", "")  # remove spaces for safety

        # Part inside the first pair of parentheses is the anticodon.
        if "(" in compact and ")" in compact:
            anti_codon = compact.split("(")[1].split(")")[0].lower()

        # tRNA-Ser(gga) -> "Ser(gga)" -> "Ser"
        if "tRNA-" in compact:
            amino_acid = compact.split("tRNA-")[1].split("(")[0]

    # ------------ score ------------
    # Not available from the source annotation.
    score = None

    # ------------ db_xrefs ------------
    # Copy so the input feature's qualifiers are not modified.
    db_xrefs = list(qualifiers.get("db_xref", []))

    # Look up the SO term only when an amino acid was actually found;
    # calling .lower() on None used to raise AttributeError here.
    so_term = None
    if amino_acid is not None:
        so_term = AMINO_ACID_DICT.get(amino_acid.lower(), ('', None))[1]
    if so_term and so_term.id not in db_xrefs:
        db_xrefs.append(so_term.id)

    # ------------ final Bakta-form dict ------------
    bakta_trna_entry = {
        "type": "tRNA",
        "sequence": seq_record.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "gene": "trn" + (amino_acid[0].lower() if amino_acid else "?"),
        "product": product,
        "amino_acid": amino_acid,
        "anti_codon": anti_codon,
        "score": score,
        "nt": nt,
        "db_xrefs": db_xrefs,
        # "anti_codon_pos" is deliberately not included: not in the output
        "locus": locus_tag,
        "id": id,
    }

    # INSDC optional qualifiers for tRNA:
    #   /allele /anticodon /circular_RNA /db_xref /experiment /function /gene
    #   /gene_synonym /inference /locus_tag /map /note /old_locus_tag /operon
    #   /product /pseudo /pseudogene /standard_name /trans_splicing

    multi_valued = {"experiment", "function", "gene_synonym", "inference", "note"}
    single_valued = {"allele", "map", "old_locus_tag", "standard_name"}
    # Helper defined elsewhere in this module; presumably copies the listed
    # qualifiers onto the entry when present.
    add_optional_qualifiers(bakta_trna_entry, qualifiers, single_valued, multi_valued)

    # Flags (value-less qualifiers): presence alone is the signal.
    for flag in ["circular_RNA", "pseudo", "trans_splicing"]:
        if flag in qualifiers:
            bakta_trna_entry[flag] = True

    return bakta_trna_entry

convert_utr_region_feature(feature, rec, id, three)

Convert a UTR GenBank feature to a simplified Bakta-style feature.

Parameters:

Name Type Description Default
feature

Bio.SeqFeature The UTR feature from the GBK.

required
rec

Bio.SeqRecord The full GenBank record containing the sequence.

required

Returns:

Name Type Description
dict

Simplified Bakta-style feature.

Source code in src/baktfold/io/eukaryotic_to_json.py
def convert_utr_region_feature(feature, rec, id, three):
    """
    Convert a UTR GenBank feature to a simplified Bakta-style feature.

    Parameters:
        feature: Bio.SeqFeature
            The UTR feature from the GBK.
        rec: Bio.SeqRecord
            The full GenBank record containing the sequence.
        id:
            Identifier stored under the "id" key of the entry (also used in
            the warning emitted when the feature has no locus_tag).
        three:
            Truthy for a 3'UTR feature, falsy for a 5'UTR feature. Selects
            both the entry "type" and the SO term added to "db_xrefs".

    Returns:
        dict: Simplified Bakta-style feature.
    """

    if three:
        feature_type = "3'UTR"
        so_code = so.SO_3UTR.id
    else:
        feature_type = "5'UTR"
        so_code = so.SO_5UTR.id

    qualifiers = feature.qualifiers

    # Anything other than an explicit forward strand is reported as "-".
    strand = "+" if feature.location.strand == 1 else "-"
    # GenBank locations are 0-based half-open; Bakta uses 1-based inclusive.
    start = int(feature.location.start) + 1
    stop = int(feature.location.end)

    note = qualifiers.get("note", [None])[0]

    # fall back to start_stop if there is no locus tag
    if 'locus_tag' in qualifiers and qualifiers['locus_tag']:
        locus_tag = qualifiers['locus_tag'][0]
    else:
        logger.warning(f"No locus_tag found for feature {id}")
        locus_tag = f"{GENOME_RANDOM_BACKUP_LOCUSTAG_STR}_{start}_{stop}"
        logger.warning(f"Generating a locus_tag: {locus_tag}")

    # always just take the positive strand to get the NT seq (UTR region)
    nt_seq = str(rec.seq)[start - 1:stop]

    # INSDC optional qualifiers for both 3'UTR and 5'UTR:
    #   /allele /db_xref /experiment /function /gene /gene_synonym /inference
    #   /locus_tag /map /note /old_locus_tag /standard_name /trans_splicing

    # Use the UTR-specific SO term selected above (it used to be overwritten
    # with the repeat SO term here). Copy the list so the input feature's
    # qualifiers are not modified.
    db_xrefs = list(qualifiers.get("db_xref", []))
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)

    # Minimal Bakta-like structure
    utr_entry = {
        "type": feature_type,
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,  # matches Bakta and is required
        "product": note,
        "nt": nt_seq,  # needed for the .ffn write-out
        "id": id,
        "db_xrefs": db_xrefs,
        "locus": locus_tag,
    }

    multi_valued = {"experiment", "function", "gene_synonym", "inference", "note"}
    single_valued = {
        "allele", "gene", "map", "old_locus_tag",
        "operon", "phenotype", "standard_name",
    }
    # Helper defined elsewhere in this module; presumably copies the listed
    # qualifiers onto the entry when present.
    add_optional_qualifiers(utr_entry, qualifiers, single_valued, multi_valued)

    # Flags (value-less qualifiers): presence alone is the signal.
    for flag in ["pseudo", "trans_splicing"]:
        if flag in qualifiers:
            utr_entry[flag] = True

    return utr_entry

get_bakta_style_id_from_locus_tag(records)

Gets 10 char bakta-style ID tag based off the 8 char locus tag in first CDS on the first record + 2 random chars

Assumes all records will have the same locus tag prefix

Will always add 2 chars to make ID unique vs locus tag

Source code in src/baktfold/io/eukaryotic_to_json.py
def get_bakta_style_id_from_locus_tag(records):
    """
    Derive a Bakta-style ID tag from the locus tag of the first CDS feature.

    The first CDS encountered while walking the records in order decides the
    result: if it has a locus tag longer than 7 characters, the last 7
    characters (e.g. "_000001") are trimmed and 2 random letters are appended,
    so an 8-character locus tag prefix yields a 10-character ID that differs
    from the locus tag itself. In every other case (no CDS at all, no locus
    tag, or a locus tag of 7 characters or fewer) a random 10-letter ID is
    returned.

    Assumes all records share the same locus tag prefix.

    Raises:
        ValueError: if `records` is empty.
    """

    if not records:
        raise ValueError("No GenBank records found.")

    # Only the very first CDS across all records is ever inspected.
    first_cds = next(
        (
            feat
            for record in records
            for feat in record.features
            if feat.type == "CDS"
        ),
        None,
    )

    # No CDS feature found at all (shouldn't happen)
    if first_cds is None:
        return random_n_letter_id(10)

    tags = first_cds.qualifiers.get("locus_tag")  # None if the qualifier is absent

    # fallback if locus_tag missing or too short
    if not tags or len(tags[0]) <= 7:
        return random_n_letter_id(10)

    prefix = tags[0][:-7]  # trims off _000001 from the CDS locus tag
    suffix = random_n_letter_id(2)
    return f"{prefix}{suffix}"

random_n_letter_id(n=4)

Generates an n-letter ID prefix.

Use n=2 to append to the locus tag so that the Bakta ID differs from it; use n=10 if the locus tag is somehow missing (should never happen).

Source code in src/baktfold/io/eukaryotic_to_json.py
def random_n_letter_id(n=4):
    """
    Return a string of `n` random uppercase ASCII letters.

    Used with n=2 to append to the locus tag so the Bakta ID differs from it,
    and with n=10 when the locus tag is somehow missing (should never happen).
    """
    alphabet = string.ascii_uppercase
    letters = random.choices(alphabet, k=n)
    return "".join(letters)

write_bakta_outputs(data, features, features_by_sequence, output, prefix, custom_db, euk, has_duplicate_locus, fast, translation_table, prokka, other_genbank, cds_program, trna_program, rrna_program, tmrna_program, ncrna_program, bakta_version)

Writes the bakta outputs to a given path.

Parameters:

Name Type Description Default
data dict

The dictionary containing the bakta outputs.

required
features Sequence[dict]

The sequence of dictionaries containing the features.

required
features_by_sequence Sequence[dict]

The sequence of dictionaries containing the features by sequence.

required
output Path

The path to save the bakta outputs to.

required
prefix str

The prefix to use for the bakta outputs.

required
custom_db bool

A boolean indicating whether a custom database is used.

required
euk bool

A boolean indicating whether the sequences are eukaryotic.

required
has_duplicate_locus bool

A boolean indicating whether there are duplicate loci.

required
fast bool

If True, skips AFDB step

required
translation_table str

Translation table inferred from input JSON

required
prokka bool

boolean indicating if prokka was used to do initial annotation

required
other_genbank bool

boolean indicating if other genbank (prokaryotic, genbank_to) was used to do initial annotation

required
bakta_version dict

Dictionary describing the Bakta version (or that of whichever other program was used) for the initial annotation

required

Returns:

Type Description

None.

Examples:

>>> write_bakta_outputs(data, features, features_by_sequence, output, prefix, custom_db, euk, has_duplicate_locus, fast, translation_table, prokka, other_genbank, bakta_version)
Source code in src/baktfold/io/io.py
def write_bakta_outputs(data: dict, features: Sequence[dict], features_by_sequence: dict,
                        output: Path, prefix: str, custom_db: bool, euk: bool, has_duplicate_locus: bool,
                        fast: bool, translation_table: int, prokka: bool, other_genbank: bool,
                        cds_program: str, trna_program: str, rrna_program: str, tmrna_program: str, ncrna_program: str, bakta_version: dict):
    """
    Writes the bakta outputs to a given path.

    The following files are written into ``output``, all named after ``prefix``:
    ``.tsv``, ``.gff3``, ``.gbff``, ``.embl``, ``.fna``, ``.ffn``, ``.faa``,
    ``.inference.tsv``, the summary text file and ``.json``.

    Args:
      data (dict): The dictionary containing the bakta outputs (``data['sequences']`` is read here).
      features (Sequence[dict]): The sequence of dictionaries containing the features.
      features_by_sequence (dict): Mapping of sequence id to the list of feature dictionaries on that sequence.
      output (Path): The path to save the bakta outputs to.
      prefix (str): The prefix to use for the bakta outputs.
      custom_db (bool): A boolean indicating whether a custom database is used (adds a ``Custom_DB`` column).
      euk (bool): A boolean indicating whether the sequences are eukaryotic.
      has_duplicate_locus (bool): A boolean indicating whether there are duplicate loci (adds an ``ID`` column).
      fast (bool): If True, skips AFDB step (drops the ``AFDBClusters`` column).
      translation_table (int): Translation table inferred from input JSON
      prokka (bool): boolean indicating if prokka was used to do initial annotation
      other_genbank (bool): boolean indicating if other genbank (prokaryotic, genbank_to) was used to do initial annotation
      cds_program (str): Forwarded to the GFF3 and INSDC writers.
      trna_program (str): Forwarded to the GFF3 and INSDC writers.
      rrna_program (str): Forwarded to the GFF3 and INSDC writers.
      tmrna_program (str): Forwarded to the GFF3 and INSDC writers.
      ncrna_program (str): Forwarded to the GFF3 and INSDC writers.
      bakta_version (dict): Dictionary describing the program version used for the initial annotation

    Returns:
      None.

    Examples:
      >>> write_bakta_outputs(data, features, features_by_sequence, output, prefix, custom_db, euk, has_duplicate_locus, fast, translation_table, prokka, other_genbank, cds_program, trna_program, rrna_program, tmrna_program, ncrna_program, bakta_version)
    """
    output_dir = Path(output)

    logger.info('writing human readable TSV...')
    tsv_path: Path = output_dir / f"{prefix}.tsv"
    tsv.write_features(data['sequences'], features_by_sequence, tsv_path)

    logger.info('writing GFF3...')
    gff3_path: Path = output_dir / f"{prefix}.gff3"
    # fix later prokka
    gff.write_features(data, features_by_sequence, gff3_path, prokka, euk, other_genbank, cds_program, trna_program, tmrna_program, rrna_program, ncrna_program)

    logger.info('writing INSDC GenBank & EMBL...')
    genbank_path: Path = output_dir / f"{prefix}.gbff"
    embl_path: Path = output_dir / f"{prefix}.embl"
    insdc.write_features(data, features, genbank_path, embl_path, prokka, euk, other_genbank, translation_table, cds_program, trna_program, tmrna_program, rrna_program, ncrna_program)

    logger.info('writing genome sequences...')
    fna_path: Path = output_dir / f"{prefix}.fna"
    fasta.export_sequences(data['sequences'], fna_path, description=True, wrap=True)

    logger.info('writing feature nucleotide sequences...')
    ffn_path: Path = output_dir / f"{prefix}.ffn"
    fasta.write_ffn(features, ffn_path)

    logger.info('writing translated CDS sequences...')
    faa_path: Path = output_dir / f"{prefix}.faa"
    fasta.write_faa(features, faa_path)

    # Per-protein inference table. The header is assembled in the same order in
    # which tsv.map_aa_columns() emits the row values, so header and rows always
    # have the same number of columns (the 'Length' value is written for every
    # row, also when the extra 'ID' column is present).
    annotations_path: Path = output_dir / f"{prefix}.inference.tsv"
    header_columns = ['Locus']
    if has_duplicate_locus:
        header_columns.append('ID')
    header_columns.extend(['Length', 'Product', 'Swissprot'])
    if not fast:
        # the AFDB clusters search is skipped in fast mode
        header_columns.append('AFDBClusters')
    header_columns.extend(['PDB', 'CATH'])
    if custom_db:
        header_columns.append('Custom_DB')

    # flatten all features across sequences
    all_features = [
        feat
        for seq_features in features_by_sequence.values()
        for feat in seq_features
    ]

    logger.info(f'Exporting annotations (TSV) to: {annotations_path}')

    # Only features flagged as hypothetical or carrying a baktfold entry are
    # reported. Distinct loop names are used on purpose: rebinding `features`
    # here would hand only the last sequence's features to the JSON writer below.
    selected_features = [
        feat
        for feat in all_features
        if 'hypothetical' in feat or 'baktfold' in feat
    ]

    tsv.write_protein_features(selected_features, header_columns, annotations_path, custom_db, has_duplicate_locus, fast=fast)

    # write summary file
    write_summary_txt_file(output, prefix, all_features)

    logger.info('write machine readable JSON...')
    json_path: Path = output_dir / f"{prefix}.json"
    json.write_json(data, features, json_path, bakta_version)

write_bakta_proteins_outputs(aas, output, prefix, custom_db, fast, bakta_version)

Writes the bakta protein outputs to a given path.

Parameters:

Name Type Description Default
aas Sequence[dict]

The sequence of dictionaries containing the amino acids.

required
output Path

The path to save the bakta protein outputs to.

required
prefix str

The prefix to use for the bakta protein outputs.

required
custom_db bool

A boolean indicating whether a custom database is used.

required
fast bool

If True, skips AFDB step

required
bakta_version dict

Original Bakta version

required

Returns:

Type Description

None.

Examples:

>>> write_bakta_proteins_outputs(aas, output, prefix, custom_db, fast, bakta_version)
Source code in src/baktfold/io/io.py
def write_bakta_proteins_outputs(aas: Sequence[dict], output: Path, prefix: str, custom_db: bool, fast: bool, bakta_version: dict):
    """
    Writes the bakta protein outputs to a given path.

    Written files (all inside ``output`` and named after ``prefix``): the
    annotation ``.tsv``, the full ``.json``, the ``.faa`` FASTA and the summary
    text file. The entries of ``aas`` are modified in place: the mocked
    coordinate fields are removed before anything is written.

    Args:
      aas (Sequence[dict]): The sequence of dictionaries containing the amino acids.
      output (Path): The path to save the bakta protein outputs to.
      prefix (str): The prefix to use for the bakta protein outputs.
      custom_db (bool): A boolean indicating whether a custom database is used.
      fast (bool): If True, skips AFDB step
      bakta_version (dict): Original Bakta version

    Returns:
      None.

    Examples:
      >>> write_bakta_proteins_outputs(aas, output, prefix, custom_db, fast, bakta_version)
    """
    out_dir = Path(output)

    # drop the fields that were only mocked to keep baktfold from crashing;
    # they are not part of the bakta protein JSON outputs
    mocked_fields = ('sequence', 'start', 'stop', 'strand', 'frame')
    for record in aas:
        for field_name in mocked_fields:
            record.pop(field_name, None)

    # column layout: the AFDB column is absent in fast mode, the custom column
    # is only present when a custom database was searched
    header_columns = ['ID', 'Length', 'Product', 'Swissprot']
    if not fast:
        header_columns.append('AFDBClusters')
    header_columns.extend(['PDB', 'CATH'])
    if custom_db:
        header_columns.append('Custom_DB')

    annotations_path: Path = out_dir / f"{prefix}.tsv"
    logger.info(f'Exporting annotations (TSV) to: {annotations_path}')
    tsv.write_protein_features(aas, header_columns, annotations_path, custom_db, has_duplicate_locus=False, fast=fast)

    # open question from the original author: combine the tophits tsvs, sort by
    # column, add a column for the db and put them out as one tsv?

    full_annotations_path: Path = out_dir / f"{prefix}.json"
    logger.info(f'Full annotations (JSON): {full_annotations_path}')
    json.write_json({'features': aas}, aas, full_annotations_path, bakta_version)

    # NOTE: a separate hypotheticals TSV is intentionally not written here.

    aa_output_path: Path = out_dir / f"{prefix}.faa"
    logger.info(f'Annotated sequences (Fasta): {aa_output_path}')
    fasta.write_faa(aas, aa_output_path)

    write_summary_txt_file(output, prefix, aas)

write_foldseek_tophit(tophit_df, pdb_tophit_path)

Writes the foldseek tophits to a given path.

Parameters:

Name Type Description Default
tophit_df pd.DataFrame

The dataframe containing the foldseek tophits.

required
pdb_tophit_path Path

The path to save the foldseek tophits to.

required

Returns:

Type Description

None.

Examples:

>>> write_foldseek_tophit(tophit_df, pdb_tophit_path)
Source code in src/baktfold/io/io.py
def write_foldseek_tophit(tophit_df: pd.DataFrame, pdb_tophit_path: Path):
    """
    Save the foldseek tophits table as a tab-separated file.

    Args:
      tophit_df (pd.DataFrame): The dataframe containing the foldseek tophits.
      pdb_tophit_path (Path): The path to save the foldseek tophits to.

    Returns:
      None.

    Examples:
      >>> write_foldseek_tophit(tophit_df, pdb_tophit_path)
    """
    destination = pdb_tophit_path
    logger.info(f"Saving foldseek tophits to {destination}")
    # the dataframe index is not written, only the columns
    tophit_df.to_csv(destination, index=False, sep="\t")

map_aa_columns(feat, custom_db, has_duplicate_locus, fast)

Maps amino acid columns.

Parameters:

Name Type Description Default
feat dict

The dictionary containing the features.

required
custom_db bool

A boolean indicating whether a custom database is used.

required
has_duplicate_locus bool

A boolean indicating whether there are duplicate loci.

required
fast bool

A boolean indicating whether AFDBclusters Foldseek search should be skipped

required

Returns:

Type Description
Sequence[str]

Sequence[str]: A sequence of strings containing the mapped amino acid columns.

Examples:

>>> map_aa_columns({'locus': 'ABC', 'length': 100, 'product': 'protein'}, False, False, False)
['ABC', '100', 'protein', '', '', '', '']
Source code in src/baktfold/io/tsv.py
def map_aa_columns(feat: dict, custom_db: bool, has_duplicate_locus: bool, fast: bool) -> Sequence[str]:
    """
    Build one TSV row for a protein feature.

    Column order: locus, [id], length, product, Swissprot, [AFDBClusters],
    PDB, CATH, [custom]. Bracketed columns depend on the flags below.
    If ``feat`` has no ``'length'`` key it is added in place, derived from
    the length of ``feat['nt']`` divided by three.

    Args:
      feat (dict): The dictionary containing the features.
      custom_db (bool): A boolean indicating whether a custom database is used (appends the custom column).
      has_duplicate_locus (bool): A boolean indicating whether there are duplicate loci (inserts ``feat['id']`` after the locus).
      fast (bool): A boolean indicating whether AFDBclusters Foldseek search should be skipped (omits that column).

    Returns:
      Sequence[str]: A sequence of strings containing the mapped amino acid columns.

    Examples:
      >>> map_aa_columns({'locus': 'ABC', 'length': 100, 'product': 'protein'}, False, False, False)
      ['ABC', '100', 'protein', '', '', '', '']
    """
    # Ensure length exists
    if 'length' not in feat:
        feat['length'] = int(len(feat['nt']) / 3)

    db_xrefs = feat.get('db_xrefs', [])

    def collect(marker: str, strip: str = None):
        """
        Comma-join every xref containing ``marker``, with ``strip`` removed.

        Args:
          marker (str): Substring an xref must contain to be selected.
          strip (str): Substring deleted from each selected xref. Defaults to ``marker``.

        Returns:
          str: The joined xrefs, or an empty string when nothing matches.
        """
        to_remove = marker if strip is None else strip
        hits = [xref.replace(to_remove, '') for xref in db_xrefs if marker in xref]
        return ','.join(hits)

    # all xref groups are resolved up front, whether or not they end up in the row
    swissprot, afdbclust, pdb, cath, custom_refs = (
        collect('swissprot', 'afdb_v6:'),
        collect('afdbclusters_', 'afdb_v6:'),
        collect('pdb:'),
        collect('cath:'),
        collect('custom:', 'custom:custom_'),
    )

    row = [feat['locus']]

    # add id if multiple CDS per Locus in that record (euks)
    if has_duplicate_locus:
        row.append(feat['id'])

    row += [str(feat['length']), feat['product'], swissprot]

    # AFDBClusters is only reported when the search was actually run
    if not fast:
        row.append(afdbclust)

    row += [pdb, cath]

    if custom_db:
        row.append(custom_refs)

    return row

write_feature_inferences(sequences, features_by_sequence, tsv_path)

Export feature inference statistics in TSV format.

Source code in src/baktfold/io/tsv.py
def write_feature_inferences(sequences: Sequence[dict], features_by_sequence: Dict[str, dict], tsv_path: Path):
    """Export feature inference statistics in TSV format.

    One row is written for every CDS/sORF feature and for every tRNA, rRNA,
    ncRNA and ncRNA-region feature; all other feature types are skipped.
    Statistics that are not available for a feature are written as ``-``.

    Args:
      sequences (Sequence[dict]): Sequence records; only ``seq['id']`` is read.
      features_by_sequence (Dict[str, dict]): Features grouped by sequence id.
      tsv_path (Path): Destination file (opened in text write mode).

    Returns:
      None.
    """
    logger.info('write tsv: path=%s', tsv_path)

    def fmt_score(value):
        """Format a score with one decimal, or '-' when it is missing."""
        return '-' if value is None else f"{value:0.1f}"

    def fmt_evalue(value):
        """Format an e-value in scientific notation ('0.0' for exactly 0), or '-' when missing."""
        if value is None:
            return '-'
        return '0.0' if value == 0 else f"{value:1.1e}"

    def fmt_fraction(value):
        """Format a coverage/identity fraction ('1.0' for exactly 1), or '-' when missing."""
        if value is None:
            return '-'
        return '1.0' if value == 1 else f"{value:0.3f}"

    protein_types = [bc.FEATURE_CDS, bc.FEATURE_SORF]
    rna_types = [bc.FEATURE_T_RNA, bc.FEATURE_R_RNA, bc.FEATURE_NC_RNA, bc.FEATURE_NC_RNA_REGION]

    with tsv_path.open('wt') as fh:
        fh.write('# Annotated with Baktfold\n')
        fh.write(f'# Software: v{cfg.version}\n')
        fh.write(f"# Database: v{cfg.version}\n") # fix later
        #fh.write(f"# Database: v{cfg.db_info['major']}.{cfg.db_info['minor']}, {cfg.db_info['type']}\n")
        fh.write(f'# DOI: {bc.BAKTFOLD_DOI}\n')
        fh.write(f'# URL: {bc.BAKTFOLD_URL}\n')
        fh.write('#Sequence Id\tType\tStart\tStop\tStrand\tLocus Tag\tScore\tEvalue\tQuery Cov\tSubject Cov\tId\tAccession\n')

        for seq in sequences:
            for feat in features_by_sequence[seq['id']]:
                feat_type = feat['type']
                if feat_type in protein_types:
                    score, evalue, query_cov, subject_cov, identity, accession = None, None, None, None, None, '-'
                    if 'ups' in feat or 'ips' in feat:
                        # exact sequence hit: perfect coverage/identity, no score available
                        query_cov = 1
                        subject_cov = 1
                        identity = 1
                        evalue = 0
                        accession = f"{bc.DB_XREF_UNIREF}:{feat['ips'][DB_IPS_COL_UNIREF100]}" if 'ips' in feat else f"{bc.DB_XREF_UNIPARC}:{feat['ups'][DB_UPS_COL_UNIPARC]}"
                    elif 'psc' in feat or 'pscc' in feat:
                        # 'psc' takes precedence over 'pscc' when both are present
                        psc_type = 'psc' if 'psc' in feat else 'pscc'
                        hit = feat[psc_type]
                        query_cov = hit['query_cov']
                        subject_cov = hit.get('subject_cov', -1)
                        identity = hit['identity']
                        score = hit.get('score', -1)
                        evalue = hit.get('evalue', -1)
                        accession = f"{bc.DB_XREF_UNIREF}:{feat['psc'][DB_PSC_COL_UNIREF90]}" if 'psc' in feat else f"{bc.DB_XREF_UNIREF}:{feat['pscc'][DB_PSCC_COL_UNIREF50]}"
                    locus = feat['locus']
                elif feat_type in rna_types:
                    # every statistic is optional for RNA features; absent ones become '-'
                    score = feat.get('score')
                    evalue = feat.get('evalue')
                    query_cov = feat.get('query_cov')
                    subject_cov = feat.get('subject_cov')
                    identity = feat.get('identity')
                    if feat_type == bc.FEATURE_T_RNA:
                        accession = '-'
                    else:
                        # first RFAM cross-reference, or '-' when the feature has none
                        accession = next((xref for xref in feat.get('db_xrefs', []) if bc.DB_XREF_RFAM in xref), '-')
                    locus = feat['locus'] if 'locus' in feat else '-'
                else:
                    continue

                fh.write('\t'.join(
                    [
                        feat['sequence'] if 'sequence' in feat else feat['contig'],  # <1.10.0 compatibility
                        feat_type,
                        str(feat['start']),
                        str(feat['stop']),
                        feat['strand'],
                        locus,
                        fmt_score(score),
                        fmt_evalue(evalue),
                        fmt_fraction(query_cov),
                        fmt_fraction(subject_cov),
                        fmt_fraction(identity),
                        accession
                    ])
                )
                fh.write('\n')

write_features(sequences, features_by_sequence, tsv_path)

Export features in TSV format.

Source code in src/baktfold/io/tsv.py
def write_features(sequences: Sequence[dict], features_by_sequence: Dict[str, dict], tsv_path: Path):
    """Export features in TSV format.

    Writes a commented header followed by one row per feature. Gap features
    are reported under their INSDC type (assembly gap when at least 100 long),
    and the product is prefixed with a pseudo/truncation marker where the
    feature carries one. For CRISPR features that provide both ``'repeats'``
    and ``'spacers'`` lists, the individual repeats and spacers are written as
    additional rows directly below the array.

    Args:
      sequences (Sequence[dict]): Sequence records; only ``seq['id']`` is read.
      features_by_sequence (Dict[str, dict]): Features grouped by sequence id.
      tsv_path (Path): Destination file (opened in text write mode).

    Returns:
      None.
    """
    logger.info(f'write feature tsv: path={tsv_path}')

    def s(x):
        """Render a value as text, mapping None to the empty string."""
        return '' if x is None else str(x)

    def crispr_repeat_row(seq_id, repeat):
        """Build the TSV line for a single CRISPR repeat."""
        return '\t'.join([seq_id, bc.FEATURE_CRISPR_REPEAT, str(repeat['start']), str(repeat['stop']), str(repeat['strand']), '', '', "CRISPR repeat", '']) + '\n'

    def crispr_spacer_row(seq_id, spacer):
        """Build the TSV line for a single CRISPR spacer."""
        return '\t'.join([seq_id, bc.FEATURE_CRISPR_SPACER, str(spacer['start']), str(spacer['stop']), str(spacer['strand']), '', '', f"CRISPR spacer, sequence {spacer['sequence']}", '']) + '\n'

    with tsv_path.open('wt') as fh:
        fh.write('# Annotated with Baktfold\n')
        fh.write(f'# Software: v{cfg.version}\n')
        fh.write(f"# Database: v{cfg.version}\n") # fix later
        #fh.write(f"# Database: v{cfg.db_info['major']}.{cfg.db_info['minor']}, {cfg.db_info['type']}\n")
        fh.write(f'# DOI: {bc.BAKTFOLD_DOI}\n')
        fh.write(f'# URL: {bc.BAKTFOLD_URL}\n')
        fh.write('#Sequence Id\tType\tStart\tStop\tStrand\tLocus Tag\tGene\tProduct\tDbXrefs\n')

        for seq in sequences:
            for feat in features_by_sequence[seq['id']]:
                seq_id = feat['sequence'] if 'sequence' in feat else feat['contig']  # <1.10.0 compatibility
                feat_type = feat['type']
                if feat_type == bc.FEATURE_GAP:
                    feat_type = bc.INSDC_FEATURE_ASSEMBLY_GAP if feat['length'] >= 100 else bc.INSDC_FEATURE_GAP

                gene = feat.get('gene') or ''
                product = feat.get('product', '')
                truncated = feat.get('truncated', '')
                if bc.PSEUDOGENE in feat:
                    product = f"(pseudo) {product}"
                elif truncated == bc.FEATURE_END_5_PRIME:
                    product = f"(5' truncated) {product}"
                elif truncated == bc.FEATURE_END_3_PRIME:
                    product = f"(3' truncated) {product}"
                elif truncated == bc.FEATURE_END_BOTH:
                    product = f"(partial) {product}"

                fh.write('\t'.join(
                    [
                        seq_id,
                        feat_type,
                        str(feat['start']),
                        str(feat['stop']),
                        str(feat['strand']),
                        s(feat.get('locus')),  # handles None -> ''
                        s(gene),
                        s(product),
                        ', '.join(sorted(feat.get('db_xrefs', [])))
                    ])
                )
                fh.write('\n')

                if feat_type == bc.FEATURE_CRISPR:
                    # spacers and repeats won't exist if Prokka input; the rows
                    # are also skipped when either entry is not a list
                    spacers = feat.get('spacers')
                    repeats = feat.get('repeats')
                    if not isinstance(spacers, list) or not isinstance(repeats, list):
                        continue
                    if not spacers or not repeats:
                        continue

                    # an array reads repeat, spacer, repeat, spacer, ..., repeat
                    for idx, spacer in enumerate(spacers):
                        if idx < len(repeats):
                            fh.write(crispr_repeat_row(seq_id, repeats[idx]))
                        fh.write(crispr_spacer_row(seq_id, spacer))
                    # closing repeat after the last spacer
                    if len(repeats) == len(spacers) + 1:
                        fh.write(crispr_repeat_row(seq_id, repeats[-1]))

write_hypotheticals(hypotheticals, tsv_path)

Export hypothetical information in TSV format.

Source code in src/baktfold/io/tsv.py
def write_hypotheticals(hypotheticals: Sequence[dict], tsv_path: Path):
    """Export hypothetical information in TSV format.

    Args:
      hypotheticals (Sequence[dict]): Hypothetical protein features; each must
        provide ``'seq_stats'``, ``'start'``, ``'stop'``, ``'strand'`` and either
        ``'sequence'`` or ``'contig'``.
      tsv_path (Path): Destination file (opened in text write mode).

    Returns:
      None.
    """
    logger.info('write hypothetical tsv: path=%s', tsv_path)

    with tsv_path.open('wt') as fh:
        fh.write(f'#Annotated with Baktfold v{cfg.version}, https://github.com/oschwengers/bakta\n')
        # (no database header line is written for now)
        fh.write('#Sequence Id\tStart\tStop\tStrand\tLocus Tag\tMol Weight [kDa]\tIso El. Point\tPfam hits\tDbxrefs\n')
        for hypo in hypotheticals:
            pfam_labels = [f"{pfam['id']}|{pfam['name']}" for pfam in hypo.get('pfams', [])]
            stats = hypo['seq_stats']

            # falsy statistics (missing or zero) are reported as 'NA'
            if stats['molecular_weight']:
                mol_weight = f"{(stats['molecular_weight']/1000):.1f}"
            else:
                mol_weight = 'NA'
            if stats['isoelectric_point']:
                iso_point = f"{stats['isoelectric_point']:.1f}"
            else:
                iso_point = 'NA'

            seq_id = hypo['sequence'] if 'sequence' in hypo else hypo['contig']  # <1.10.0 compatibility
            pfam_column = ', '.join(sorted(pfam_labels))
            xref_column = ', '.join(sorted(hypo.get('db_xrefs', [])))
            fh.write(f"{seq_id}\t{hypo['start']}\t{hypo['stop']}\t{hypo['strand']}\t{hypo.get('locus', '')}\t{mol_weight}\t{iso_point}\t{pfam_column}\t{xref_column}\n")

write_protein_features(features, header_columns, tsv_path, custom_db, has_duplicate_locus, fast)

Export protein features in TSV format.

Source code in src/baktfold/io/tsv.py
def write_protein_features(features: Sequence[dict], header_columns: Sequence[str], tsv_path: Path, custom_db: bool, has_duplicate_locus: bool, fast: bool):
    """Export protein features in TSV format.

    Args:
      features (Sequence[dict]): Protein features, one output row each.
      header_columns (Sequence[str]): Column names written as the header row.
      tsv_path (Path): Destination file (opened in text write mode).
      custom_db (bool): Passed through to ``map_aa_columns``.
      has_duplicate_locus (bool): Passed through to ``map_aa_columns``.
      fast (bool): Passed through to ``map_aa_columns``.

    Returns:
      None.
    """
    logger.info(f'write protein feature tsv: path={tsv_path}')

    with tsv_path.open('wt') as fh:
        fh.write(f'#Annotated with Baktfold (v{cfg.version}): https://github.com/gbouras13/baktfold\n')
        # (no database header line is written for now)
        fh.write('\t'.join(header_columns) + '\n')
        for feat in features:
            row = map_aa_columns(feat, custom_db, has_duplicate_locus, fast)
            fh.write('\t'.join(row) + '\n')

parse_protein_input(input_path, faa_path)

Handles regular and gzipped FASTA input (or a Bakta JSON file) and returns the parsed amino acid records together with the Bakta version information.

Source code in src/baktfold/io/fasta_in.py
def parse_protein_input(input_path, faa_path):
    """
    Parse protein input (FASTA or Bakta JSON) and write the sequences to annotate.

    The input format is decided by ``is_fasta``. Every parsed record is given
    mocked feature attributes (type, locus, sequence, start, stop, strand,
    frame) so that existing feature-based functions can be reused. For FASTA
    input all records are written to ``faa_path``; for JSON input only the
    hypothetical proteins are written.

    Args:
      input_path: Path to the input file. Must be non-empty.
      faa_path (Path): Destination of the amino acid FASTA file.

    Returns:
      tuple: ``(aas, bakta_version)`` - the parsed records and the version
      information taken from the JSON input (an empty dict for FASTA input).

    Raises:
      Exception: Whatever validation or parsing raised is re-raised after the
        error has been logged, instead of continuing with undefined state.
    """
    try:
        if input_path == '':
            raise ValueError('File path argument must be non-empty')
        input_path = Path(input_path).resolve()
        fasta_flag = is_fasta(input_path)
    except Exception:
        logger.error(f'ERROR: annotation file {input_path} not valid!')
        # nothing below can work without a validated path and a known format
        raise

    if fasta_flag:
        logger.info('FASTA input format detected.')
    else:
        logger.info('Bakta JSON input format detected. Only hypothetical proteins from the Bakta JSON input file will be annotated.')

    hypotheticals = []
    try:
        if fasta_flag:
            logger.info('Attempting to parse input protein sequences as .faa format ...')
            aas = fasta.import_sequences(input_path, False, False)
            bakta_version = {}
        else:
            logger.info('Attempting to parse input protein sequences as Bakta JSON format ...')
            aas, hypotheticals, bakta_version = parse_json_input(input_path, False, False, protein_json_flag=True)
        logger.info(f'Imported sequences={len(aas)}')
    except Exception:
        logger.error('ERROR: wrong file format or unallowed characters in amino acid sequences!')
        raise

    # rename and mock feature attributes to reuse existing functions;
    # consecutive records are placed 100 positions apart
    mock_start = 1
    for aa in aas:
        aa['type'] = bc.FEATURE_CDS
        aa['locus'] = aa['id']
        aa['sequence'] = '-'
        aa['start'] = mock_start
        aa['stop'] = mock_start + aa['length'] - 1
        aa['strand'] = bc.STRAND_UNKNOWN
        aa['frame'] = 1
        mock_start += 100

    # FASTA input: every record; JSON input: hypothetical proteins only
    records_to_write = aas if fasta_flag else hypotheticals
    with faa_path.open('wt') as fh:
        for aa in records_to_write:
            fh.write(f">{aa['locus']}\n{aa['aa']}\n")

    logger.info('Parsing complete')

    return aas, bakta_version

encode_annotations(annotations)

Encodes annotations into a string.

Parameters:

Name Type Description Default
annotations dict

A dictionary containing the annotations.

required

Returns:

Name Type Description
str str

The encoded annotations.

Examples:

>>> encode_annotations({
    'ID': 'EHICP_3230_sigpep',
    'Name': 'signal peptide',
    'product': 'signal peptide',
    'score': 0.5,
    'Parent': 'EHICP_3230'
})
'ID=EHICP_3230_sigpep;Name=signal peptide;product=signal peptide;score=0.5;Parent=EHICP_3230'
Source code in src/baktfold/io/gff.py
def encode_annotations(annotations: Dict[str, Union[str, Sequence[str]]]) -> str:
    """
    Serialise an annotation mapping into ``key=value`` pairs joined by ``;``.

    List values are encoded element-wise and joined with commas; keys whose
    list is empty are left out entirely. Any other value is encoded as a
    single attribute.

    Args:
      annotations (dict): A dictionary containing the annotations.

    Returns:
      str: The encoded annotations.

    Examples:
      >>> encode_annotations({
      ...     'ID': 'EHICP_3230_sigpep',
      ...     'Name': 'signal peptide',
      ...     'product': 'signal peptide',
      ...     'score': 0.5,
      ...     'Parent': 'EHICP_3230'
      ... })
      'ID=EHICP_3230_sigpep;Name=signal peptide;product=signal peptide;score=0.5;Parent=EHICP_3230'
    """
    pairs = []
    for key, val in annotations.items():
        if type(val) is not list:
            pairs.append(f'{key}={encode_attribute(val)}')
            continue
        if len(val) < 1:
            # empty lists produce no attribute at all
            continue
        encoded_items = [encode_attribute(item) for item in val]
        pairs.append(f"{key}={','.join(encoded_items)}")
    return ';'.join(pairs)

encode_attribute(product)

Replace special characters forbidden in column 9 of the GFF3 format: https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md

Source code in src/baktfold/io/gff.py
def encode_attribute(product: str) -> str:
    """Replace special characters forbidden in column 9 of the GFF3 format: https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md

    The value is converted with ``str()`` first, then ``%``, ``;``, ``=``,
    ``&`` and ``,`` are percent-encoded.
    """
    # single pass over the text: each reserved character maps to its escape
    escapes = str.maketrans({
        '%': '%25',
        ';': '%3B',
        '=': '%3D',
        '&': '%26',
        ',': '%2C',
    })
    return str(product).translate(escapes)

write_euk_cds_feature(fh, seq_id, feat)

Write a eukaryotic CDS feature to GFF3 with multiple CDS parts.

Parameters

fh : file-handle

seq_id : str

feat : dict-like feature with keys:

"start", "stop", "strand", "locus", "starts", "stops"

Source code in src/baktfold/io/gff.py
def write_euk_cds_feature(fh, seq_id, feat):
    """
    Write a eukaryotic CDS feature to GFF3 with multiple CDS parts.

    One CDS line is written per part, in transcription order (the parts are
    reversed for the minus strand). The phase column follows the GFF3
    definition: the number of bases to skip from the 5' end of the part to
    reach the first base of the next codon.

    Parameters
    ----------
    fh : file-handle
        Open handle the GFF3 lines are written to.
    seq_id : str
        Sequence/contig ID written to column 1.
    feat : dict-like feature with keys:
            "start", "stop", "strand", "locus", "starts", "stops"
        "starts"/"stops" are used when both are non-empty lists of equal
        length; otherwise a single part ("start", "stop") is written.
        "strand" defaults to "+" and "locus" to "unknown" when absent.
    """

    strand = feat.get("strand", "+")
    locus = feat.get("locus", "unknown")

    transcript_id = f"{locus}-T1"
    cds_id = f"{transcript_id}.cds"

    starts = feat.get("starts")
    stops = feat.get("stops")

    # -------------------------------
    # 1. Determine CDS sub-coordinates
    # -------------------------------
    has_parts = (
        isinstance(starts, list)
        and isinstance(stops, list)
        and len(starts) == len(stops)
        and len(starts) > 0
    )
    if has_parts:
        cds_coords = list(zip(starts, stops))
    else:
        cds_coords = [(feat["start"], feat["stop"])]

    # -------------------------------
    # 2. Reverse order for negative strand
    # -------------------------------
    if strand == "-":
        cds_coords.reverse()

    # -------------------------------
    # 3. Emit CDS lines with correct phase
    # -------------------------------
    coding_bases_so_far = 0

    for i, (cds_start, cds_stop) in enumerate(cds_coords, start=1):

        # Bases left over from the previous parts form an incomplete codon;
        # the phase is how many bases of this part are needed to complete it.
        phase = (3 - coding_bases_so_far % 3) % 3
        coding_bases_so_far += cds_stop - cds_start + 1

        attr = f"ID={cds_id}-{i};Parent={transcript_id}"

        fh.write(
            f"{seq_id}\tbaktfold\tCDS\t{cds_start}\t{cds_stop}"
            f"\t.\t{strand}\t{phase}\t{attr}\n"
        )

write_euk_repeat_region_feature(fh, seq_id, feat)

Writes a repeat region feature to a file.

Parameters:

Name Type Description Default
fh file

The file handle to write to.

required
seq_id str

The sequence ID.

required
feat dict

A dictionary containing the feature information.

required

Returns:

Type Description

None

Examples:

>>> write_euk_repeat_region_feature(fh, 'DS572673.1', {
    "type": "repeat_region",
    "sequence": "DS571531.1",
    "start": 1470,
    "stop": 1716,
    "strand": "?",
    "family": "LINE2",
    "rpt_type": null,
    "repeat_unit": null,
    "product": null,
    "nt": "AATAAAATCATATCAGAAATAAAAAGAATGAAAATAAACAAATTAAAGAAAATAATTATAAAATTAATAAACGATATTTAAATGAAAGAAAATAGAGAATATGTAATAAGTACAAATGGTTCATTCATTAATAAGAAATTAACAATAATAAAATAGAGAATATTGATTATAAAAAGAAATATATTTCTCAAAACAGTAGAGATACAAAAAGAATAGATATGAAATAAATATTAATTCTAAAATACTC",
    "id": "EHICP_3230",
    "db_xrefs": [
        "SO:0000657"
    ]
})
Source code in src/baktfold/io/gff.py
def write_euk_repeat_region_feature(fh, seq_id, feat):
    """
    Write one ``repeat_region`` line in GFF3 format.

    The feature ID is built from ``feat['sequence']`` and the coordinates
    (``<sequence>:<start>..<stop>``). ``rpt_family`` is only added when
    ``feat['family']`` is present and not None.

    Args:
      fh (file): The file handle to write to.
      seq_id (str): The sequence ID (column 1 of the written line).
      feat (dict): A dictionary containing the feature information; the keys
        ``'start'``, ``'stop'``, ``'strand'`` and ``'sequence'`` are required.

    Returns:
      None

    Examples:
      >>> write_euk_repeat_region_feature(fh, 'DS572673.1', {
      ...     "type": "repeat_region",
      ...     "sequence": "DS571531.1",
      ...     "start": 1470,
      ...     "stop": 1716,
      ...     "strand": "?",
      ...     "family": "LINE2",
      ...     "id": "EHICP_3230",
      ...     "db_xrefs": ["SO:0000657"]
      ... })
    """
    start = int(feat['start'])
    stop = int(feat['stop'])
    strand = feat['strand']
    source_sequence = feat['sequence']

    attr_pairs = [
        ("ID", f"{source_sequence}:{start}..{stop}"),
        ("gbkey", "repeat_region"),
    ]

    family = feat.get('family')
    if family is not None:
        attr_pairs.append(("rpt_family", family))

    attr_str = ";".join(f"{key}={value}" for key, value in attr_pairs)
    columns = (seq_id, "baktfold", "repeat_region", start, stop, ".", strand, ".", attr_str)
    fh.write("\t".join(f"{column}" for column in columns) + "\n")

write_euk_trna_feature(fh, seq_id, feat)

Write a tRNA feature to GFF3 with a top-level line and single exon.

Parameters

fh : file-like

Open file handle to write GFF lines.

seq_id : str

Sequence/contig ID.

feat : dict

Feature dictionary of type 'tRNA' with the keys "start", "stop", "strand", "locus" and, optionally, "product".

Notes

  • Generates one tRNA line and one exon line.
  • Includes optional 'product' qualifier.
Source code in src/baktfold/io/gff.py
def write_euk_trna_feature(fh, seq_id, feat):
    """
    Write a tRNA feature to GFF3 with a top-level line and single exon.

    Parameters
    ----------
    fh : file-like
        Open file handle to write GFF lines.
    seq_id : str
        Sequence/contig ID.
    feat : dict
        Feature dictionary. ``start``, ``stop``, ``strand`` and ``locus`` are
        required; ``product`` (a string or a list of strings) is optional.

    Notes
    -----
    - Generates one tRNA line and one exon line.
    - The tRNA line carries ``ID=<locus>-T1`` and ``Parent=<locus>``; the
      exon line names that ID as its parent.
    - Includes optional 'product' qualifier.
    """
    start = int(feat['start'])
    stop = int(feat['stop'])
    strand = feat['strand']
    locus = feat['locus']

    trna_id = f"{locus}-T1"

    # Top-level tRNA attributes. The ID has to be emitted because the exon
    # line written below references it through its Parent attribute.
    attrs = {
        "ID": trna_id,
        "Parent": locus,
    }

    product = feat.get("product", [])
    if product:
        if isinstance(product, list):
            # Multiple products are collapsed into one comma-separated value.
            attrs["product"] = ",".join(str(v) for v in product)
        else:
            attrs["product"] = str(product)

    attr_str = ";".join(f"{k}={v}" for k, v in attrs.items())

    # Write top-level tRNA line
    fh.write(f"{seq_id}\tbaktfold\ttRNA\t{start}\t{stop}\t.\t{strand}\t.\t{attr_str}\n")

    # Write exon line (tRNA single-exon)
    exon_id = f"{trna_id}.exon1"
    exon_attrs = f"ID={exon_id};Parent={trna_id}"
    fh.write(f"{seq_id}\tbaktfold\texon\t{start}\t{stop}\t.\t{strand}\t.\t{exon_attrs}\n")

write_euk_utr_feature(fh, seq_id, feat, locus_counter, three=False)

Write a 'utr' feature.

Source code in src/baktfold/io/gff.py
def write_euk_utr_feature(fh, seq_id, feat, locus_counter, three=False):
    """
    Write a single UTR feature line in GFF3 format.

    Args:
      fh (file): Writable text handle that receives the GFF3 line.
      seq_id (str): Value written to the first (seqid) column.
      feat (dict): Feature record. ``start``, ``stop``, ``strand`` and
        ``locus`` are required; ``note`` is optional.
      locus_counter (dict): Mutable mapping of locus -> number of UTRs already
        written for it. It is updated in place and is used to make the IDs of
        the second and later UTRs of a locus unique.
      three (bool): Write a ``three_prime_UTR`` when True, otherwise a
        ``five_prime_UTR``.

    Returns:
      None
    """
    start = int(feat['start'])
    stop = int(feat['stop'])
    strand = feat['strand']

    locus = feat['locus']

    # Count occurrences for this locus
    count = locus_counter.get(locus, 0) + 1
    locus_counter[locus] = count

    # Construct ID with suffix -2, -3, etc.
    # For first entry we keep ID=locus (no -1)
    # NOTE(review): the first UTR therefore shares its ID with its Parent
    # value; confirm downstream tools accept that.
    utr_id = locus if count == 1 else f"{locus}-{count}"

    attrs = {
        "ID": f"{utr_id}",
        "Parent": f"{locus}",
    }

    # Attribute layout follows NCBI-style UTR records, e.g.
    # CAMXCT020000566.1	EMBL	three_prime_UTR	84568	84617	.	-	.	ID=id-C1SCF055_LOCUS8420;Parent=gene-C1SCF055_LOCUS8420;Note=ID:SCF055_s1507_g28601.utr3p1%3B~source:feature;gbkey=3'UTR;locus_tag=C1SCF055_LOCUS8420
    # CAMXCT020000566.1	EMBL	five_prime_UTR	136251	136259	.	-	.	ID=id-C1SCF055_LOCUS8420-2;Parent=gene-C1SCF055_LOCUS8420;Note=ID:SCF055_s1507_g28601.utr5p1%3B~source:feature;gbkey=5'UTR;locus_tag=C1SCF055_LOCUS8420

    # The check and the value must read the same key; 'note' is preferred and
    # the capitalised spelling is accepted as a fallback.
    note = feat.get('note')
    if note is None:
        note = feat.get('Note')
    if note:
        if isinstance(note, list):
            attrs["Note"] = ",".join(str(v) for v in note)
        else:
            attrs["Note"] = str(note)

    attrs["gbkey"] = "3'UTR" if three else "5'UTR"

    # locus_tag depends on the locus itself, not on whether a note exists.
    if locus is not None:
        attrs["locus_tag"] = locus

    attr_str = ";".join(f"{k}={v}" for k, v in attrs.items())

    gene_tag = 'three_prime_UTR' if three else 'five_prime_UTR'

    fh.write(f"{seq_id}\tbaktfold\t{gene_tag}\t{start}\t{stop}\t.\t{strand}\t.\t{attr_str}\n")

write_features(data, features_by_sequence, gff3_path, prokka=False, euk=False, other_genbank=False, cds_tool='Prodigal:2.6', trna_program='tRNAscan-SE:2.0.12', tmrna_program='Aragorn', rrna_program='Infernal', ncrna_program='Infernal')

Export features in GFF3 format.

Source code in src/baktfold/io/gff.py
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
def write_features(data: dict, features_by_sequence: Dict[str, dict], gff3_path: Path, prokka: bool = False, euk: bool = False, other_genbank: bool = False, cds_tool: str = "Prodigal:2.6", trna_program: str = "tRNAscan-SE:2.0.12", tmrna_program: str = "Aragorn", rrna_program: str = "Infernal", ncrna_program: str = "Infernal"):
    """
    Export features in GFF3 format.

    The file consists of a header (GFF version, ontology URL, optional
    organism, software/database/DOI/URL comments), then for every sequence a
    ``##sequence-region`` pragma, a landmark ``region`` line and one or more
    lines per feature, and finally - unless ``cfg.compliant`` is set - a
    ``##FASTA`` section with all sequences.

    Args:
        data (dict): Annotation record; ``data['genome']`` and
            ``data['sequences']`` are read.
        features_by_sequence (Dict[str, dict]): Features keyed by sequence ID,
            iterated in the stored order.
        gff3_path (Path): Output file path (opened for text writing).
        prokka (bool): Input originates from Prokka; switches the tool names
            written to the source column (Aragorn, barrnap, MinCED, Prodigal, Prokka).
        euk (bool): Eukaryotic input; repeat, UTR, tRNA and CDS features are
            delegated to the ``write_euk_*`` helpers.
        other_genbank (bool): Input originates from another GenBank file; the
            ``*_program`` / ``cds_tool`` arguments are written to the source column.
        cds_tool (str): Source name for CDS features when ``other_genbank``.
        trna_program (str): Source name for tRNA features when ``other_genbank``.
        tmrna_program (str): Source name for tmRNA features when ``other_genbank``.
        rrna_program (str): Source name for rRNA features when ``other_genbank``.
        ncrna_program (str): Source name for ncRNA features when ``other_genbank``.

    Returns:
        None
    """
    logger.info(f'write features: path={gff3_path}')

    with gff3_path.open('wt') as fh:
        fh.write('##gff-version 3\n')  # GFF version
        fh.write('##feature-ontology https://github.com/The-Sequence-Ontology/SO-Ontologies/blob/v3.1/so.obo\n')  # SO feature version

        if(data['genome'].get('taxon', None)):  # write organism info
            fh.write(f"# organism {data['genome']['taxon']}\n")

        fh.write('# Annotated with Baktfold\n')
        fh.write(f'# Software: v{cfg.version}\n')
        fh.write(f"# Database: v{cfg.version}\n") # fix later
        #fh.write(f"# Database: v{cfg.db_info['major']}.{cfg.db_info['minor']}, {cfg.db_info['type']}\n")
        fh.write(f'# DOI: {bc.BAKTFOLD_DOI}\n')
        fh.write(f'# URL: {bc.BAKTFOLD_URL}\n')

        for seq in data['sequences']:  # write features
            if euk:
                locus_counter = {} # for UTRs, reset for every sequence

            fh.write(f"##sequence-region {seq['id']} 1 {seq['length']}\n")  # sequence region

            # write landmark region
            annotations = {
                'ID': seq['id'],
                'Name': seq['id']
            }
            if(seq['topology'] == bc.TOPOLOGY_CIRCULAR):
                annotations['Is_circular'] = 'true'
            annotations = encode_annotations(annotations)
            fh.write(f"{seq['id']}\tBaktfold\tregion\t1\t{str(seq['length'])}\t.\t+\t.\t{annotations}\n")

            for feat in features_by_sequence[seq['id']]:
                seq_id = feat['sequence'] if 'sequence' in feat else feat['contig']  # <1.10.0 compatibility
                start = feat['start']
                stop = feat['stop']
                if('edge' in feat):
                    # edge-flagged features get the sequence length added to their stop
                    stop += seq['length']

                # euks
                if euk:
                    if(feat['type'] == bc.FEATURE_REPEAT):
                        write_euk_repeat_region_feature(fh, seq_id, feat)

                    if(feat['type'] == bc.FEATURE_5UTR or feat['type'] == bc.FEATURE_3UTR):
                        if feat['type'] == bc.FEATURE_3UTR:
                            write_euk_utr_feature(fh, seq_id, feat, locus_counter, three=True)
                        elif feat['type'] == bc.FEATURE_5UTR:
                            write_euk_utr_feature(fh, seq_id, feat, locus_counter, three=False)

                if(feat['type'] == bc.FEATURE_T_RNA):

                    if euk:
                        write_euk_trna_feature(fh, seq_id, feat)
                    else:
                        trna_tool = "tRNAscan-SE"
                        if prokka:
                            trna_tool = "Aragorn"
                        if other_genbank:
                            trna_tool = trna_program

                        annotations = {
                            'ID': feat['locus'],
                            'Name': feat['product'],
                            'locus_tag': feat['locus'],
                            'product': feat['product'],
                            'Dbxref': feat.get('db_xrefs', [])
                        }
                        if(feat.get('gene', None)):  # add gene annotation if available
                            annotations['gene'] = feat['gene']
                        if(bc.PSEUDOGENE in feat):
                            annotations[bc.INSDC_FEATURE_PSEUDOGENE] = bc.INSDC_FEATURE_PSEUDOGENE_TYPE_UNKNOWN
                        elif('truncated' in feat):
                            annotations[bc.INSDC_FEATURE_PSEUDO] = True
                        if(feat.get('anti_codon', False)):
                            annotations['anti_codon'] = feat['anti_codon']
                        if(feat.get('amino_acid', False)):
                            annotations['amino_acid'] = feat['amino_acid']
                        if(cfg.compliant):
                            gene_id = f"{feat['locus']}_gene"
                            annotations['Parent'] = gene_id
                            annotations['inference'] = 'profile:tRNAscan:2.0'
                            annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs'])  # remove INSDC invalid DbXrefs
                            gene_annotations = {
                                'ID': gene_id,
                                'locus_tag': feat['locus']
                            }
                            if(feat.get('gene', None)):
                                gene_annotations['gene'] = feat['gene']
                            if(bc.PSEUDOGENE in feat):
                                gene_annotations[bc.INSDC_FEATURE_PSEUDOGENE] = bc.INSDC_FEATURE_PSEUDOGENE_TYPE_UNKNOWN
                            gene_annotations = encode_annotations(gene_annotations)
                            fh.write(f"{seq_id}\t{trna_tool}\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                        annotations = encode_annotations(annotations)
                        fh.write(f"{seq_id}\t{trna_tool}\t{so.SO_TRNA.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                elif(feat['type'] == bc.FEATURE_TM_RNA):
                    # both prokka and bakta use Aragorn
                    tmrna_tool = "Aragorn"
                    if other_genbank:
                        tmrna_tool = tmrna_program

                    annotations = {
                        'ID': feat['locus'],
                        'Name': feat['product'],
                        'locus_tag': feat['locus'],
                        'gene': feat.get('gene', []),
                        'product': feat['product'],
                        'Dbxref': feat.get('db_xrefs', [])
                    }
                    if('tag' in feat):
                        annotations['tag_peptide'] = feat['tag']['aa']
                    if('truncated' in feat):
                        annotations[bc.INSDC_FEATURE_PSEUDO] = True
                    if(cfg.compliant):
                        gene_id = f"{feat['locus']}_gene"
                        annotations['Parent'] = gene_id
                        annotations['inference'] = 'profile:aragorn:1.2'
                        annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs'])  # remove INSDC invalid DbXrefs
                        if('tag' in feat):
                            annotations['tag_peptide'] = f"{feat['tag']['start']}..{feat['tag']['stop']}" if feat['strand'] == bc.STRAND_FORWARD else f"complement({feat['tag']['start']}..{feat['tag']['stop']})"
                        gene_annotations = {
                            'ID': gene_id,
                            'locus_tag': feat['locus'],
                            'gene': feat['gene']
                        }
                        if('truncated' in feat):
                            gene_annotations[bc.INSDC_FEATURE_PSEUDO] = True
                        gene_annotations = encode_annotations(gene_annotations)
                        # use the selected tool so that tmrna_program is honoured
                        fh.write(f"{seq_id}\t{tmrna_tool}\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                    annotations = encode_annotations(annotations)
                    fh.write(f"{seq_id}\t{tmrna_tool}\t{so.SO_TMRNA.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                elif(feat['type'] == bc.FEATURE_R_RNA):
                    rrna_tool = "Infernal"
                    if prokka:
                        rrna_tool = "barrnap"
                    if other_genbank:
                        rrna_tool = rrna_program

                    annotations = {
                        'ID': feat['locus'],
                        'Name': feat['product'],
                        'locus_tag': feat['locus'],
                        'gene': feat.get('gene', []),
                        'product': feat['product'],
                        'Dbxref': feat.get('db_xrefs', [])
                    }
                    if('truncated' in feat):
                        annotations[bc.INSDC_FEATURE_PSEUDO] = True
                    if(cfg.compliant):
                        gene_id = f"{feat['locus']}_gene"
                        annotations['Parent'] = gene_id
                        annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs'])  # remove INSDC invalid DbXrefs
                        for rfam_id in [dbxref.split(':')[1] for dbxref in feat['db_xrefs'] if dbxref.split(':')[0] == bc.DB_XREF_RFAM]:
                            annotations['inference'] = f'profile:Rfam:{rfam_id}'
                        gene_annotations = {
                            'ID': gene_id,
                            'locus_tag': feat['locus'],
                            'gene': feat['gene']
                        }
                        if('truncated' in feat):
                            gene_annotations[bc.INSDC_FEATURE_PSEUDO] = True
                        gene_annotations = encode_annotations(gene_annotations)
                        fh.write(f"{seq_id}\t{rrna_tool}\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                    annotations = encode_annotations(annotations)
                    if other_genbank or prokka:
                        # no e-value is written for these inputs: emit '.' as score so the record keeps nine columns
                        fh.write(f"{seq_id}\t{rrna_tool}\t{so.SO_RRNA.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                    else:
                        fh.write(f"{seq_id}\t{rrna_tool}\t{so.SO_RRNA.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n")
                elif(feat['type'] == bc.FEATURE_NC_RNA):
                    # both prokka and bakta use infernal for ncrna
                    ncrna_tool = "Infernal"
                    if other_genbank:
                        ncrna_tool = ncrna_program

                    annotations = {
                        'ID': feat['locus'],
                        'Name': feat['product'],
                        'locus_tag': feat['locus'],
                        'gene': feat.get('gene', []),
                        'product': feat['product'],
                        'Dbxref': feat.get('db_xrefs', [])
                    }
                    if('truncated' in feat):
                        annotations[bc.INSDC_FEATURE_PSEUDO] = True
                    if(cfg.compliant):
                        gene_id = f"{feat['locus']}_gene"
                        annotations['Parent'] = gene_id
                        annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs'])  # remove INSDC invalid DbXrefs
                        annotations[bc.INSDC_FEATURE_NC_RNA_CLASS] = insdc.select_ncrna_class(feat)
                        for rfam_id in [dbxref.split(':')[1] for dbxref in feat['db_xrefs'] if dbxref.split(':')[0] == bc.DB_XREF_RFAM]:
                            annotations['inference'] = f'profile:Rfam:{rfam_id}'
                        gene_annotations = {
                            'ID': gene_id,
                            'locus_tag': feat['locus'],
                            'gene': feat['gene']
                        }
                        if(ba.RE_GENE_SYMBOL.fullmatch(feat['gene'])):  # discard non-standard ncRNA gene symbols
                            gene_annotations['gene'] = feat['gene']
                        else:
                            annotations.pop('gene', None)
                        if('truncated' in feat):
                            gene_annotations[bc.INSDC_FEATURE_PSEUDO] = True
                        gene_annotations = encode_annotations(gene_annotations)
                        fh.write(f"{seq_id}\t{ncrna_tool}\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                    annotations = encode_annotations(annotations)
                    if other_genbank or prokka:
                        # no e-value is written for these inputs: emit '.' as score so the record keeps nine columns
                        fh.write(f"{seq_id}\t{ncrna_tool}\t{so.SO_NCRNA_GENE.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                    else:
                        fh.write(f"{seq_id}\t{ncrna_tool}\t{so.SO_NCRNA_GENE.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n")
                elif(feat['type'] == bc.FEATURE_NC_RNA_REGION):
                    annotations = {
                        'ID': feat['id'],
                        'Name': feat['product'],
                        'product': feat['product'],
                        'Dbxref': feat.get('db_xrefs', [])
                    }
                    if('truncated' in feat):
                        annotations[bc.INSDC_FEATURE_PSEUDO] = True
                    if(cfg.compliant):
                        for rfam_id in [dbxref.split(':')[1] for dbxref in feat['db_xrefs'] if dbxref.split(':')[0] == bc.DB_XREF_RFAM]:
                            annotations['inference'] = f'profile:Rfam:{rfam_id}'
                        annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs'])  # remove INSDC invalid DbXrefs
                        annotations[bc.INSDC_FEATURE_REGULATORY_CLASS] = insdc.select_regulatory_class(feat)
                    annotations = encode_annotations(annotations)
                    fh.write(f"{seq_id}\tInfernal\t{so.SO_REGULATORY_REGION.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n")
                elif(feat['type'] == bc.FEATURE_CRISPR):
                    crispr_tool = "PILER-CR"
                    if prokka:
                        crispr_tool = "MinCED"
                    annotations = {
                        'ID': feat['id'],
                        'Name': feat['product'],
                        'product': feat['product']
                    }
                    feat_type = so.SO_CRISPR.name
                    if(cfg.compliant):
                        feat_type = bc.INSDC_FEATURE_REPEAT_REGION
                        annotations['inference'] = 'COORDINATES:alignment:pilercr:1.02'
                        annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs'])  # remove INSDC invalid DbXrefs
                        annotations[bc.INSDC_FEATURE_REPEAT_FAMILY] = 'CRISPR'
                        annotations[bc.INSDC_FEATURE_REPEAT_TYPE] = 'direct'
                        annotations[bc.INSDC_FEATURE_REPEAT_UNIT_SEQ] = feat['repeat_consensus']
                    annotations = encode_annotations(annotations)
                    fh.write(f"{seq_id}\t{crispr_tool}\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                    if(not cfg.compliant):
                        i = 0
                        # spacers and repeats wont exist if Prokka input
                        spacers = feat.get('spacers', [])
                        repeats = feat.get('repeats', [])  # same key as the per-repeat lookups below
                        if len(spacers) > 0 and len(repeats) > 0:
                            # repeats and spacers alternate: repeat_1, spacer_1, repeat_2, ...
                            while i < len(spacers):
                                repeat = repeats[i]
                                annotations = {
                                    'ID': f"{feat['id']}_repeat_{i+1}",
                                    'Parent': feat['id']
                                }
                                annotations = encode_annotations(annotations)
                                # will always be PILER here as prokka won't have any
                                fh.write(f"{seq_id}\tPILER-CR\t{bc.FEATURE_CRISPR_REPEAT}\t{repeat['start']}\t{repeat['stop']}\t.\t{repeat['strand']}\t.\t{annotations}\n")
                                spacer = spacers[i]
                                annotations = {
                                    'ID': f"{feat['id']}_spacer_{i+1}",
                                    'Parent': feat['id'],
                                    'sequence': spacer['sequence']
                                }
                                annotations = encode_annotations(annotations)
                                fh.write(f"{seq_id}\tPILER-CR\t{bc.FEATURE_CRISPR_SPACER}\t{spacer['start']}\t{spacer['stop']}\t.\t{spacer['strand']}\t.\t{annotations}\n")
                                i += 1
                            # trailing repeat after the last spacer
                            if(len(repeats) - 1 == i):
                                repeat = repeats[i]
                                annotations = { 'ID': f"{feat['id']}_repeat_{i+1}" }
                                annotations = encode_annotations(annotations)
                                fh.write(f"{seq_id}\tPILER-CR\t{bc.FEATURE_CRISPR_REPEAT}\t{repeat['start']}\t{repeat['stop']}\t.\t{repeat['strand']}\t.\t{annotations}\n")
                elif feat['type'] == bc.FEATURE_CDS:
                    if euk:
                        write_euk_cds_feature(fh, seq_id, feat)
                    else:
                        annotations = {
                            'ID': feat.get('locus'),
                            'Name': feat.get('product'),
                            'locus_tag': feat.get('locus'),
                            'product': feat.get('product'),
                            'Dbxref': feat.get('db_xrefs', [])  # default to empty list if db_xrefs doesn't exist
                        }
                        if(bc.PSEUDOGENE in feat):
                            annotations[bc.INSDC_FEATURE_PSEUDOGENE] = bc.INSDC_FEATURE_PSEUDOGENE_TYPE_UNPROCESSED if feat[bc.PSEUDOGENE]['paralog'] else bc.INSDC_FEATURE_PSEUDOGENE_TYPE_UNITARY
                        elif('truncated' in feat):
                            annotations[bc.INSDC_FEATURE_PSEUDO] = True
                        if(feat.get('gene', None)):  # add gene annotation if available
                            annotations['gene'] = feat['gene']
                        source = '?' if feat.get('source', None) == bc.CDS_SOURCE_USER else 'Pyrodigal'
                        if prokka:
                            source = 'Prodigal'

                        if other_genbank:
                            source = cds_tool

                        if(cfg.compliant):
                            gene_id = f"{feat['locus']}_gene"
                            annotations['Parent'] = gene_id
                            annotations['inference'] = 'EXISTENCE:non-experimental evidence, no additional details recorded' if feat.get('source', None) == bc.CDS_SOURCE_USER else 'ab initio prediction:Pyrodigal:3.5'
                            annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs'])  # remove INSDC invalid DbXrefs
                            annotations['Note'], ec_number = insdc.extract_ec_from_notes_insdc(annotations, 'Note')
                            if(ec_number is not None):
                                annotations['ec_number'] = ec_number
                            gene_annotations = {
                                'ID': gene_id,
                                'locus_tag': feat['locus']
                            }
                            if(feat.get('gene', None)):
                                gene_annotations['gene'] = feat['gene']
                            if(bc.PSEUDOGENE in feat):
                                gene_annotations[bc.INSDC_FEATURE_PSEUDOGENE] = bc.INSDC_FEATURE_PSEUDOGENE_TYPE_UNPROCESSED if feat[bc.PSEUDOGENE]['paralog'] else bc.INSDC_FEATURE_PSEUDOGENE_TYPE_UNITARY
                            gene_annotations = encode_annotations(gene_annotations)
                            fh.write(f"{seq_id}\t{source}\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                        if('exception' in feat):
                            ex = feat['exception']
                            pos = f"{ex['start']}..{ex['stop']}"
                            if(feat['strand'] == bc.STRAND_REVERSE):
                                pos = f"complement({pos})"
                            annotations['transl_except']=f"(pos:{pos},aa:{ex['aa']})"
                            notes = annotations.get('Note', [])
                            notes.append(f"codon on position {ex['codon_position']} is a {ex['type']} codon")
                            if('Note' not in annotations):
                                annotations['Note'] = notes
                        annotations = encode_annotations(annotations)
                        fh.write(f"{seq_id}\t{source}\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n")
                        if(bc.FEATURE_SIGNAL_PEPTIDE in feat):
                            write_signal_peptide(fh, feat)
                elif(feat['type'] == bc.FEATURE_SORF):
                    annotations = {
                        'ID': feat['locus'],
                        'Name': feat['product'],
                        'locus_tag': feat['locus'],
                        'product': feat['product'],
                        'Dbxref': feat.get('db_xrefs', [])
                    }
                    if(feat.get('gene', None)):  # add gene annotation if available
                        annotations['gene'] = feat['gene']
                    if(cfg.compliant):
                        gene_id = f"{feat['locus']}_gene"
                        annotations['Parent'] = gene_id
                        annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs'])  # remove INSDC invalid DbXrefs
                        annotations['Note'], ec_number = insdc.extract_ec_from_notes_insdc(annotations, 'Note')
                        if(ec_number is not None):
                            annotations['ec_number'] = ec_number
                        gene_annotations = {
                            'ID': gene_id,
                            'locus_tag': feat['locus'],
                            'inference': 'ab initio prediction:Bakta'
                        }
                        if(feat.get('gene', None)):
                            gene_annotations['gene'] = feat['gene']
                        gene_annotations = encode_annotations(gene_annotations)
                        fh.write(f"{seq_id}\tBakta\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                    annotations = encode_annotations(annotations)
                    fh.write(f"{seq_id}\tBakta\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n")
                    if(bc.FEATURE_SIGNAL_PEPTIDE in feat):
                        write_signal_peptide(fh, feat)
                elif(feat['type'] == bc.FEATURE_GAP):
                    gap_tool="Bakta"
                    if prokka:
                        gap_tool="Prokka"
                    annotations = {
                        'ID': feat['id'],
                        'Name': f"gap ({feat['length']} bp)",
                        'product': f"gap ({feat['length']} bp)"
                    }
                    annotations = encode_annotations(annotations)
                    fh.write(f"{seq_id}\t{gap_tool}\t{so.SO_GAP.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                elif(feat['type'] == bc.FEATURE_ORIC):
                    annotations = {
                        'ID': feat['id'],
                        'Name': feat['product']
                    }
                    if(cfg.compliant):
                        annotations['Note'] = feat['product']
                    else:
                        annotations['product'] = feat['product']
                        annotations['inference'] = 'similar to DNA sequence'
                    annotations = encode_annotations(annotations)
                    feat_type = bc.INSDC_FEATURE_ORIGIN_REPLICATION if cfg.compliant else so.SO_ORIC.name
                    fh.write(f"{seq_id}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                elif(feat['type'] == bc.FEATURE_ORIV):
                    annotations = {
                        'ID': feat['id'],
                        'Name': feat['product']
                    }
                    if(cfg.compliant):
                        annotations['Note'] = feat['product']
                    else:
                        annotations['product'] = feat['product']
                        annotations['inference'] = 'similar to DNA sequence'
                    annotations = encode_annotations(annotations)
                    # NOTE(review): oriV is written with the oriC ontology name in non-compliant mode - confirm this is intended
                    feat_type = bc.INSDC_FEATURE_ORIGIN_REPLICATION if cfg.compliant else so.SO_ORIC.name
                    fh.write(f"{seq_id}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                elif(feat['type'] == bc.FEATURE_ORIT):
                    annotations = {
                        'ID': feat['id'],
                        'Name': feat['product']
                    }
                    if(cfg.compliant):
                        annotations['Note'] = feat['product']
                    else:
                        annotations['product'] = feat['product']
                        annotations['inference'] = 'similar to DNA sequence'
                    annotations = encode_annotations(annotations)
                    feat_type = bc.INSDC_FEATURE_ORIGIN_TRANSFER if cfg.compliant else so.SO_ORIT.name
                    fh.write(f"{seq_id}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                elif(feat['type'] == bc.FEATURE_GENE):
                    write_gene_feature(fh, seq_id, feat)
                elif(feat['type'] == bc.FEATURE_MRNA):
                    write_mrna_feature(fh, seq_id, feat)

        if(not cfg.compliant):
            fh.write('##FASTA\n')
            for seq in data['sequences']:  # write sequences
                fh.write(f">{seq['id']}\n")
                seq_nt = seq['nt'] if 'nt' in seq else seq['sequence']  # <1.10.0 compatibility
                fh.write(fasta.wrap_sequence(seq_nt))
    return

write_gene_feature(fh, seq_id, feat)

Write a 'gene' feature including fuzzy boundaries.

Source code in src/baktfold/io/gff.py
def write_gene_feature(fh, seq_id, feat):
    """
    Emit one GFF3 ``gene`` line for the given feature.

    Args:
      fh (file): Writable text handle that receives the GFF3 line.
      seq_id (str): Value written to the first (seqid) column.
      feat (dict): Feature record. ``start``, ``stop`` and ``strand`` are
        required; ``locus`` and ``gene`` are optional.

    Returns:
      None
    """
    begin = int(feat['start'])
    end = int(feat['stop'])
    orientation = feat['strand']

    # Without a (non-empty) locus tag, synthesise an ID from the coordinates.
    gene_id = feat.get('locus')
    if not gene_id:
        gene_id = f"{seq_id}_{begin}_{end}_{orientation}"

    pairs = [f"ID={gene_id}"]
    gene_name = feat.get('gene')
    if gene_name is not None:
        pairs.append(f"Name={gene_name}")

    columns = (seq_id, "baktfold", "gene", begin, end, ".", orientation, ".", ";".join(pairs))
    fh.write("\t".join(str(col) for col in columns) + "\n")

write_mrna_feature(fh, seq_id, feat)

Write mRNA + implied exons based on join() structure.

Source code in src/baktfold/io/gff.py
def _join_attr_values(value):
    """Render a GFF attribute value: lists are comma-joined, anything else is stringified."""
    if isinstance(value, list):
        return ",".join(str(v) for v in value)
    return str(value)


def write_mrna_feature(fh, seq_id, feat):
    """
    Write an mRNA line plus its exon lines in GFF3 format.

    The mRNA gets the ID ``<locus>-T1`` and ``<locus>`` as Parent. Optional
    'product', 'db_xrefs' (written as ``Dbxref``) and 'note' entries of the
    feature are added as attributes when they are non-empty.

    Exons are taken from the parallel 'starts' / 'stops' lists when both are
    non-empty lists of equal length; otherwise a single exon spanning the
    whole feature is written. On the minus strand the exon list is reversed
    before numbering.

    Args:
      fh (file): Writable text handle receiving the GFF3 lines.
      seq_id (str): Identifier written to the first GFF3 column.
      feat (dict): Feature providing 'start', 'stop' and 'strand', and
        optionally 'locus', 'product', 'db_xrefs', 'note', 'starts',
        'stops' and 'sequence'.

    Returns:
      None
    """
    start = int(feat['start'])
    stop = int(feat['stop'])
    strand = feat['strand']

    # fall back if there is no locus tag
    locus = feat.get('locus') or f"{seq_id}_{start}_{stop}_{strand}"
    mrna_id = f"{locus}-T1"

    # Top-level mRNA line
    attrs = {
        "ID": mrna_id,
        "Parent": f"{locus}",
    }
    optional_attrs = (
        ("product", feat.get("product")),
        ("Dbxref", feat.get("db_xrefs")),
        # The note attribute carries the note itself (it previously
        # duplicated the db_xrefs value by mistake).
        ("note", feat.get("note")),
    )
    for key, value in optional_attrs:
        if value:
            attrs[key] = _join_attr_values(value)

    attr_str = ";".join(f"{k}={v}" for k, v in attrs.items())
    fh.write(f"{seq_id}\tbaktfold\tmRNA\t{start}\t{stop}\t.\t{strand}\t.\t{attr_str}\n")

    # Exon lines use the feature's own 'sequence' when present, but fall back
    # to the caller's seq_id instead of emitting the literal string "None".
    exon_seq_id = feat.get("sequence") or seq_id

    starts = feat.get("starts")
    stops = feat.get("stops")
    if (
        isinstance(starts, list)
        and isinstance(stops, list)
        and len(starts) == len(stops)
        and len(starts) > 0
    ):
        exon_parts = list(zip(starts, stops))
        # Exons must be numbered in biological order (5' to 3')
        if strand == "-":
            exon_parts.reverse()
    else:
        # Single exon (no usable starts/stops provided)
        exon_parts = [(start, stop)]

    for idx, (ex_start, ex_stop) in enumerate(exon_parts, start=1):
        exon_attrs = f"ID={mrna_id}.exon{idx};Parent={mrna_id}"
        fh.write(
            f"{exon_seq_id}\tbaktfold\texon\t{ex_start}\t{ex_stop}\t.\t{strand}\t.\t{exon_attrs}\n"
        )

write_signal_peptide(fh, feat)

Writes a signal peptide feature to a file.

Parameters:

Name Type Description Default
fh file

The file handle to write to.

required
feat dict

A dictionary containing the feature information.

required

Returns:

Type Description

None

Examples:

>>> write_signal_peptide(fh, {
    'locus': 'EHICP_3230',
    'sequence': 'DS571531.1',
    'strand': '+',
    'signal_peptide': {
        'start': 1,
        'stop': 20,
        'score': 0.5
    }
})
Source code in src/baktfold/io/gff.py
def write_signal_peptide(fh, feat: dict):  # <1.10.0 compatibility
    """
    Emit one GFF3 line for the signal peptide attached to a feature.

    Args:
      fh (file): Writable text handle receiving the GFF3 line.
      feat (dict): Parent feature. It must provide 'locus', 'strand', either
        'sequence' or 'contig', and a nested dict under the
        ``bc.FEATURE_SIGNAL_PEPTIDE`` key holding 'start', 'stop' and 'score'.

    Returns:
      None

    Examples:
      >>> write_signal_peptide(fh, {
          'locus': 'EHICP_3230',
          'sequence': 'DS571531.1',
          'strand': '+',
          'signal_peptide': {
              'start': 1,
              'stop': 20,
              'score': 0.5
          }
      })
    """
    peptide = feat[bc.FEATURE_SIGNAL_PEPTIDE]
    parent_locus = feat['locus']

    # The peptide is a child of the feature's locus; the attribute dict is
    # serialised by encode_annotations.
    attributes = encode_annotations({
        'ID': f"{parent_locus}_sigpep",
        'Name': 'signal peptide',
        'product': 'signal peptide',
        'score': peptide['score'],
        'Parent': parent_locus
    })

    if 'sequence' in feat:
        seq_id = feat['sequence']
    else:
        seq_id = feat['contig']  # <1.10.0 compatibility

    fh.write(f"{seq_id}\tDeepSig\t{so.SO_SIGNAL_PEPTIDE.name}\t{peptide['start']}\t{peptide['stop']}\t{peptide['score']:.2f}\t{feat['strand']}\t.\t{attributes}\n")

export_sequences(sequences, fasta_path, description=False, wrap=False)

Write sequences to Fasta file.

Source code in src/baktfold/io/fasta.py
def export_sequences(sequences: Sequence[dict], fasta_path: Path, description: bool=False, wrap: bool=False):
    """
    Write sequences to Fasta file.

    Args:
      sequences (Sequence[dict]): Records providing 'id', the nucleotide
        string under 'nt' (or 'sequence' for <1.10.0 data) and, when
        ``description`` is True, 'description'.
      fasta_path (Path): Output file; opened in text-write mode.
      description (bool): Append the record description to the header line.
      wrap (bool): Pass the sequence through ``wrap_sequence`` instead of
        writing it on a single line.

    Returns:
      None
    """
    logger.info(f'write genome sequences: path={fasta_path}, description={description}, wrap={wrap}')

    with fasta_path.open('wt') as fh:
        for seq in sequences:
            if(description):
                fh.write(f">{seq['id']} {seq['description']}\n")
            else:
                fh.write(f">{seq['id']}\n")
            # Resolve the sequence once so both output modes share the legacy
            # fallback (the unwrapped branch previously read seq['nt'] only).
            seq_nt = seq['nt'] if 'nt' in seq else seq['sequence']  # <1.10.0 compatibility
            if(wrap):
                fh.write(wrap_sequence(seq_nt))
            else:
                fh.write(seq_nt)
                fh.write('\n')

import_sequences(sequences_path, is_genomic=True, is_dna=True)

Import raw sequences from Fasta file.

Source code in src/baktfold/io/fasta.py
def import_sequences(sequences_path: Path, is_genomic: bool=True, is_dna: bool=True) -> Sequence[dict]:
    """
    Import raw sequences from Fasta file.

    Each record becomes a dict with 'id' (with '|' replaced by '~PIPE~'),
    'description', 'length' and either 'nt' (``is_dna``) or 'aa'. Alignment
    gaps ('-') are removed and sequences are upper-cased. For protein input a
    single trailing '*' is stripped. With ``is_genomic`` the record is also
    tagged as an incomplete, linear contig.

    Args:
      sequences_path (Path): Fasta file to read (opened via ``xopen``).
      is_genomic (bool): Add 'complete', 'type' and 'topology' entries.
      is_dna (bool): Validate as DNA and store under 'nt'; otherwise validate
        as amino acids and store under 'aa'.

    Returns:
      Sequence[dict]: The imported records in file order.

    Raises:
      ValueError: If a sequence does not match the DNA / amino-acid pattern.
    """
    sequences = []
    with xopen(str(sequences_path), threads=0) as fh:
        for record in SeqIO.parse(fh, 'fasta'):

            rid = record.id

            if "~PIPE~" in rid:
                # Only reported, not raised: the record is still imported with
                # its id unchanged.
                logger.error(
                    "Your proteins FASTA header has ~PIPE~ in the header. "
                    "Please remove all instances of ~PIPE~ before running Baktfold as this creates downstream issues with Foldseek"
                )
            else:
                rid = rid.replace("|", "~PIPE~")

            sequence = {
                'id': rid,
                'description': record.description.split(' ', maxsplit=1)[1] if ' ' in record.description else ''
            }

            raw_sequence = str(record.seq).upper()
            if('-' in raw_sequence):
                dash_count = raw_sequence.count('-')
                raw_sequence = raw_sequence.replace('-', '')
                logger.info('import: Discarded alignment gaps (dashes): id=%s, occurences=%i', record.id, dash_count)
            if(is_dna):
                if(FASTA_DNA_SEQUENCE_PATTERN.fullmatch(raw_sequence) is None):
                    logger.error('import: Fasta sequence contains invalid DNA characters! id=%s', record.id)
                    raise ValueError(f'Fasta sequence contains invalid DNA characters! id={record.id}')
                sequence['nt'] = raw_sequence
            else:
                # endswith() is safe on an empty record, where indexing [-1]
                # would raise IndexError before validation could run.
                if(raw_sequence.endswith('*')):  # remove trailing stop asterik
                    raw_sequence = raw_sequence[:-1]
                    logger.warning('import: Removed trailing asterik! id=%s, seq=%s', record.id, raw_sequence)
                if(FASTA_AA_SEQUENCE_PATTERN.fullmatch(raw_sequence) is None):
                    logger.error('import: Fasta sequence contains invalid AA characters! id=%s, seq=%s', record.id, raw_sequence)
                    raise ValueError(f'Fasta sequence contains invalid AA characters! id={record.id}')
                sequence['aa'] = raw_sequence
            sequence['length'] = len(raw_sequence)
            if(is_genomic):
                sequence['complete'] = False
                sequence['type'] = bc.REPLICON_CONTIG
                sequence['topology'] = bc.TOPOLOGY_LINEAR
            logger.info(
                f"imported: id={record.id}, length={sequence['length']}, description={sequence['description']}, genomic={is_genomic}, dna={is_dna}"
            )
            sequences.append(sequence)
    return sequences

wrap_sequence(sequence)

Wraps a sequence into lines of 60 characters.

Parameters:

Name Type Description Default
sequence str

The sequence to wrap.

required

Returns:

Name Type Description
str

The wrapped sequence.

Examples:

>>> wrap_sequence('ARNDCQEGHILKMFPOSUTWYVBZXJ')
'ARNDCQEGHILKMFPOSUTWYVBZXJ\n'
Notes

This function is used to format sequences in FASTA files.

Source code in src/baktfold/io/fasta.py
def wrap_sequence(sequence: str):
    """
    Break a sequence into newline-terminated lines of fixed width.

    The width is the module constant ``FASTA_LINE_WRAPPING``. The result
    always ends with a newline, so an empty sequence yields a lone newline.

    Args:
      sequence (str): The sequence to wrap.

    Returns:
      str: The wrapped sequence.

    Examples:
      >>> wrap_sequence('ARNDCQEGHILKMFPOSUTWYVBZXJ')
      'ARNDCQEGHILKMFPOSUTWYVBZXJ\\n'

    Notes:
      This function is used to format sequences in FASTA files.
    """
    chunks = [
        sequence[offset:offset + FASTA_LINE_WRAPPING]
        for offset in range(0, len(sequence), FASTA_LINE_WRAPPING)
    ]
    return '\n'.join(chunks) + '\n'

write_faa(features, faa_path)

Write translated CDS sequences to Fasta file.

Source code in src/baktfold/io/fasta.py
def write_faa(features: Sequence[dict], faa_path: Path):
    """
    Write translated CDS sequences to Fasta file.

    Only features of type ``bc.FEATURE_CDS`` or ``bc.FEATURE_SORF`` are
    written; each gets a ``>locus product`` header followed by its 'aa' string.

    Args:
      features (Sequence[dict]): Features providing 'type', and for the
        selected types 'locus', 'product' and 'aa'.
      faa_path (Path): Output file; opened in text-write mode.

    Returns:
      None
    """
    logger.info(f'write translated CDS/sORF: path={faa_path}')
    with faa_path.open('wt') as fh:
        for feat in features:
            if(feat['type'] not in (bc.FEATURE_CDS, bc.FEATURE_SORF)):
                continue
            header = f">{feat['locus']} {feat['product']}"
            fh.write(f"{header}\n{feat['aa']}\n")

write_ffn(features, ffn_path)

Write annotated nucleotide sequences to Fasta file.

Source code in src/baktfold/io/fasta.py
def write_ffn(features: Sequence[dict], ffn_path: Path):
    """
    Write annotated nucleotide sequences to Fasta file.

    Features whose type is one of the RNA, CRISPR, CDS/sORF or origin types
    listed below are written with their 'nt' string. The header uses 'locus'
    when present, else 'id', followed by the product unless it is missing or
    an empty string.

    Args:
      features (Sequence[dict]): Features providing 'type', and for the
        selected types 'nt' plus 'locus' or 'id'.
      ffn_path (Path): Output file; opened in text-write mode.

    Returns:
      None
    """
    logger.info(f'write feature nucleotide sequences: path={ffn_path}')
    with ffn_path.open('wt') as fh:
        for feat in features:
            exported_types = (
                bc.FEATURE_T_RNA,
                bc.FEATURE_TM_RNA,
                bc.FEATURE_R_RNA,
                bc.FEATURE_NC_RNA,
                bc.FEATURE_NC_RNA_REGION,
                bc.FEATURE_CRISPR,
                bc.FEATURE_CDS,
                bc.FEATURE_SORF,
                bc.FEATURE_ORIC,
                bc.FEATURE_ORIV,
                bc.FEATURE_ORIT,
            )
            if(feat['type'] not in exported_types):
                continue
            if 'locus' in feat:
                identifier = feat['locus']
            else:
                identifier = feat['id']
            header = f">{identifier}"
            if(feat.get('product', '') != ''):
                header = f">{identifier} {feat['product']}"
            fh.write(f"{header}\n{feat['nt']}\n")

Some code adapted from @mheinzinger

https://github.com/mheinzinger/ProstT5/blob/main/scripts/generate_foldseek_db.py

create_foldseek_prostt5_gpu_db(fasta_aa, foldseek_db_path, db_dir, logdir)

Create a Foldseek DB with ProstT5 3Di predictions from an amino-acid FASTA file using Foldseek-GPU

Parameters:

Name Type Description Default
fasta_aa Path

Path to the amino-acid FASTA file.

required
foldseek_db_path Path

Path to the directory where Foldseek database will be stored.

required
db_dir Path

Path to the baktfold DB

required
logdir Path

Path to the directory where logs will be stored.

required

Returns:

Type Description
None

None

Source code in src/baktfold/features/create_foldseek_db.py
def create_foldseek_prostt5_gpu_db(
    fasta_aa: Path, foldseek_db_path: Path, db_dir: Path, logdir: Path
) -> None:
    """
    Create a Foldseek DB from an amino-acid FASTA via ``foldseek createdb --prostt5-model``.

    The ProstT5 model is looked up in the ``prostt5_weights`` subdirectory of
    ``db_dir``.

    Args:
        fasta_aa (Path): Path to the amino-acid FASTA file.
        foldseek_db_path (Path): Output path handed to ``foldseek createdb``.
        db_dir (Path): Path to the baktfold DB
        logdir (Path): Path to the directory where logs will be stored.
    Returns:
        None
    """
    # NOTE(review): no explicit --gpu flag is passed here despite the
    # function name; confirm this is intended.
    weights_dir = Path(db_dir) / "prostt5_weights"
    createdb_params = f"createdb {fasta_aa} {foldseek_db_path}  --prostt5-model {weights_dir}  "

    createdb_tool = ExternalTool(
        tool="foldseek",
        input="",
        output="",
        params=createdb_params,
        logdir=logdir,
    )
    ExternalTool.run_tool(createdb_tool)

foldseek_tsv2db(in_tsv, out_db_name, db_type, logdir)

Convert a Foldseek TSV file to a Foldseek database.

Parameters:

Name Type Description Default
in_tsv Path

Path to the input TSV file.

required
out_db_name Path

Path for the output Foldseek database.

required
db_type int

Type of the output database.

required
logdir Path

Path to the directory where logs will be stored.

required

Returns:

Type Description
None

None

Source code in src/baktfold/features/create_foldseek_db.py
def foldseek_tsv2db(
    in_tsv: Path, out_db_name: Path, db_type: int, logdir: Path
) -> None:
    """
    Run ``foldseek tsv2db`` to turn a TSV file into a Foldseek database.

    Args:
        in_tsv (Path): Path to the input TSV file.
        out_db_name (Path): Path for the output Foldseek database.
        db_type (int): Value passed to ``--output-dbtype``.
        logdir (Path): Path to the directory where logs will be stored.

    Returns:
        None
    """
    tsv2db_params = f"tsv2db {in_tsv} {out_db_name}  --output-dbtype {str(db_type)} "

    # Named differently from the function so the local no longer shadows it.
    tsv2db_tool = ExternalTool(
        tool="foldseek",
        input="",
        output="",
        params=tsv2db_params,
        logdir=logdir,
    )
    ExternalTool.run_tool(tsv2db_tool)

generate_foldseek_db_from_aa_3di(fasta_aa, fasta_3di, foldseek_db_path, logdir, prefix)

Generate Foldseek database from amino-acid and 3Di sequences.

Parameters:

Name Type Description Default
fasta_aa Path

Path to the amino-acid FASTA file.

required
fasta_3di Path

Path to the 3Di FASTA file.

required
foldseek_db_path Path

Path to the directory where Foldseek database will be stored.

required
logdir Path

Path to the directory where logs will be stored.

required
prefix str

Prefix for the Foldseek database.

required

Returns:

Type Description
None

None

Source code in src/baktfold/features/create_foldseek_db.py
def generate_foldseek_db_from_aa_3di(
    fasta_aa: Path, fasta_3di: Path, foldseek_db_path: Path, logdir: Path, prefix: str
) -> None:
    """
    Generate Foldseek database from amino-acid and 3Di sequences.

    Entries are matched by FASTA record id. 3Di entries without an amino-acid
    counterpart are ignored, and amino-acid entries without a 3Di string are
    dropped; both cases are logged as warnings. The remaining entries are
    numbered from 1 and written to three temporary TSV files (amino acids,
    3Di, headers), which are converted with ``foldseek_tsv2db`` into
    ``<prefix>``, ``<prefix>_ss`` and ``<prefix>_h`` and then removed.

    Args:
        fasta_aa (Path): Path to the amino-acid FASTA file.
        fasta_3di (Path): Path to the 3Di FASTA file.
        foldseek_db_path (Path): Path to the directory where Foldseek database will be stored.
        logdir (Path): Path to the directory where logs will be stored.
        prefix (str): Prefix for the Foldseek database.

    Returns:
        None
    """
    # read in amino-acid sequences
    sequences_aa = {}
    for record in SeqIO.parse(fasta_aa, "fasta"):
        sequences_aa[record.id] = str(record.seq)

    # read in 3Di strings
    sequences_3di = {}
    for record in SeqIO.parse(fasta_3di, "fasta"):
        if record.id not in sequences_aa:
            logger.warning(
                "Warning: ignoring 3Di entry {}, since it is not in the amino-acid FASTA file".format(
                    record.id
                )
            )
        else:
            sequences_3di[record.id] = str(record.seq)  #no upper if masked

    # report every amino-acid entry that has no 3Di string ...
    for seq_id in sequences_aa:
        if seq_id not in sequences_3di:
            logger.warning(
                "Warning: entry {} in amino-acid FASTA file has no corresponding 3Di string".format(
                    seq_id
                )
            )
            logger.warning("Removing: entry {} from the Foldseek database ".format(seq_id))

    # ... then drop them all in one pass (previously the dict was rebuilt
    # inside the loop once per missing entry)
    sequences_aa = {
        seq_id: sequence
        for seq_id, sequence in sequences_aa.items()
        if seq_id in sequences_3di
    }

    # generate TSV rows; collected in lists to avoid quadratic string concatenation
    rows_aa = []
    rows_3di = []
    rows_header = []
    for index, seq_id in enumerate(sequences_aa, start=1):
        rows_aa.append(f"{index}\t{sequences_aa[seq_id]}\n")
        rows_3di.append(f"{index}\t{sequences_3di[seq_id]}\n")
        rows_header.append(f"{index}\t{seq_id}\n")

    # write temp TSV files
    temp_aa_tsv: Path = Path(foldseek_db_path) / "aa.tsv"
    temp_3di_tsv: Path = Path(foldseek_db_path) / "3di.tsv"
    temp_header_tsv: Path = Path(foldseek_db_path) / "header.tsv"
    with open(temp_aa_tsv, "w") as f:
        f.writelines(rows_aa)
    with open(temp_3di_tsv, "w") as f:
        f.writelines(rows_3di)
    with open(temp_header_tsv, "w") as f:
        f.writelines(rows_header)

    # create foldseek db names
    short_db_name = f"{prefix}"
    aa_db_name: Path = Path(foldseek_db_path) / short_db_name
    tsv_db_name: Path = Path(foldseek_db_path) / f"{short_db_name}_ss"
    header_db_name: Path = Path(foldseek_db_path) / f"{short_db_name}_h"

    # create Foldseek database with foldseek tsv2db
    foldseek_tsv2db(temp_aa_tsv, aa_db_name, 0, logdir)
    foldseek_tsv2db(temp_3di_tsv, tsv_db_name, 0, logdir)
    foldseek_tsv2db(temp_header_tsv, header_db_name, 12, logdir)

    # clean up
    remove_file(temp_aa_tsv)
    remove_file(temp_3di_tsv)
    remove_file(temp_header_tsv)

generate_foldseek_db_from_structures(fasta_aa, foldseek_db_path, structure_dir, logdir, prefix, proteins_flag)

Generate Foldseek database from PDB files.

Parameters:

Name Type Description Default
fasta_aa Path

Path to the amino-acid FASTA file.

required
foldseek_db_path Path

Path to the directory where Foldseek database will be stored.

required
structure_dir Path

Path to the directory containing .pdb or .cif structure files.

required
logdir Path

Path to the directory where logs will be stored.

required
prefix str

Prefix for the Foldseek database.

required
proteins_flag bool

Flag - True if proteins-compare is run

required

Returns:

Type Description
None

None

Source code in src/baktfold/features/create_foldseek_db.py
def generate_foldseek_db_from_structures(
    fasta_aa: Path,
    foldseek_db_path: Path,
    structure_dir: Path,
    logdir: Path,
    prefix: str,
    proteins_flag: bool,
) -> None:
    """
    Generate Foldseek database from PDB files.

    Before building, every record id in ``fasta_aa`` is checked against the
    ``<id>.pdb`` / ``<id>.cif`` files in ``structure_dir``; missing or
    duplicated structures are logged as warnings, and an error is logged when
    no id has a structure at all. The database itself is built by running
    ``foldseek createdb`` on the whole ``structure_dir``.

    Args:
        fasta_aa (Path): Path to the amino-acid FASTA file.
        foldseek_db_path (Path): Path to the directory where Foldseek database will be stored.
        structure_dir (Path): Path to the directory containing .pdb or .cif structure files.
        logdir (Path): Path to the directory where logs will be stored.
        prefix (str): Prefix for the Foldseek database.
        proteins_flag (bool): Flag - True if proteins-compare is run.
            Not read by this function; kept for interface compatibility.

    Returns:
        None
    """

    # read in amino-acid sequences
    sequences_aa = {}
    for record in SeqIO.parse(fasta_aa, "fasta"):
        sequences_aa[record.id] = str(record.seq)

    # index the structure files once so each id is an O(1) lookup
    structure_files = {
        file
        for file in os.listdir(structure_dir)
        if file.endswith((".pdb", ".cif"))
    }

    # count the CDS ids that have at least one structure file
    num_structures = 0
    for cds_id in sequences_aa:
        num_matches = sum(
            f"{cds_id}{suffix}" in structure_files for suffix in (".pdb", ".cif")
        )

        if num_matches == 0:
            logger.warning(f"No structure found for {cds_id}")
            logger.warning(f"{cds_id} will be ignored in annotation")
            continue

        # should never happen but in case (both a .pdb and a .cif exist)
        if num_matches > 1:
            logger.warning(f"More than 1 structures found for {cds_id}")
            logger.warning("Taking the first one")
        num_structures += 1

    if num_structures == 0:
        # Only logged; the createdb call below still runs.
        logger.error(
            f"No structures with matching CDS ids were found at all. Check the {structure_dir} directory"
        )

    # generate the db
    short_db_name = f"{prefix}"
    structure_db_name: Path = Path(foldseek_db_path) / short_db_name
    query_structure_dir = structure_dir

    foldseek_createdb_from_structures = ExternalTool(
        tool="foldseek",
        input="",
        output="",
        params=f"createdb {query_structure_dir} {structure_db_name} ",
        logdir=logdir,
    )

    ExternalTool.run_tool(foldseek_createdb_from_structures)

create_result_tsv(query_db, target_db, result_db, result_tsv, logdir, foldseek_gpu, structures, threads)

Create a TSV file containing the results of a Foldseek search.

Parameters:

Name Type Description Default
query_db Path

Path to the query database.

required
target_db Path

Path to the target database.

required
result_db Path

Path to the result database generated by the search.

required
result_tsv Path

Path to save the resulting TSV file.

required
logdir Path

Path to the directory where logs will be stored.

required
foldseek_gpu bool

Run Foldseek-GPU with accelerate ungapped prefilter

required
structures bool

Whether structures were input (not ProstT5)

required
threads int

Number of threads to use.

required

Returns:

Type Description
None

None

Source code in src/baktfold/features/run_foldseek.py
def create_result_tsv(
    query_db: Path, target_db: Path, result_db: Path, result_tsv: Path, logdir: Path, foldseek_gpu: bool, structures: bool, threads: int
) -> None:
    """
    Export a Foldseek result database to TSV with ``foldseek convertalis``.

    Args:
        query_db (Path): Path to the query database.
        target_db (Path): Path to the target database; ``_gpu`` is appended
            when ``foldseek_gpu`` is True.
        result_db (Path): Path to the result database generated by the search.
        result_tsv (Path): Path to save the resulting TSV file.
        logdir (Path): Path to the directory where logs will be stored.
        foldseek_gpu (bool): Whether the ``_gpu`` variant of the target database is used.
        structures (bool): Whether structures were input (not ProstT5); adds
            the ``alntmscore`` and ``lddt`` output columns.
        threads (int): Number of threads to use.

    Returns:
        None
    """
    format_string = (
        "--format-output query,target,bits,fident,evalue,qstart,qend,qlen,tstart,tend,tlen,alntmscore,lddt"
        if structures
        else "--format-output query,target,bits,fident,evalue,qstart,qend,qlen,tstart,tend,tlen"
    )
    effective_target = f"{target_db}_gpu" if foldseek_gpu else target_db

    convertalis_tool = ExternalTool(
        tool="foldseek",
        input="",
        output="",
        params=f"convertalis {query_db} {effective_target} {result_db} {result_tsv} {format_string} --threads {threads}",
        logdir=logdir,
    )
    ExternalTool.run_tool(convertalis_tool)

Run a Foldseek search using given parameters.

Parameters:

Name Type Description Default
query_db Path

Path to the query database.

required
target_db Path

Path to the target database.

required
result_db Path

Path to store the result database.

required
temp_db Path

Path to store temporary files.

required
threads int

Number of threads to use for the search.

required
logdir Path

Path to the directory where logs will be stored.

required
evalue float

E-value threshold for the search.

required
sensitivity float

Sensitivity threshold for the search.

required
max_seqs int

Maximum results per query sequence allowed to pass the prefilter for foldseek.

required
ultra_sensitive bool

Whether to skip foldseek prefilter for maximum sensitivity

required
extra_foldseek_params str

Extra foldseek search params

required
foldseek_gpu bool

Run Foldseek-GPU with accelerate ungapped prefilter

required
structures bool

Run Foldseek with structures, not ProstT5 3Dis

required
gpus Optional[str]

Comma-separated CUDA indices (e.g. "0,2") to restrict foldseek's GPU prefilter to a subset of devices. When foldseek_gpu is True and this resolves to ≥1 CUDA device, the foldseek subprocess gets CUDA_VISIBLE_DEVICES set accordingly. None = use all visible CUDA GPUs (foldseek default). Ignored when foldseek_gpu is False.

None

Returns:

Type Description
None

None

Source code in src/baktfold/features/run_foldseek.py
def run_foldseek_search(
    query_db: Path,
    target_db: Path,
    result_db: Path,
    temp_db: Path,
    threads: int,
    logdir: Path,
    evalue: float,
    sensitivity: float,
    max_seqs: int,
    ultra_sensitive: bool,
    extra_foldseek_params: str,
    foldseek_gpu: bool,
    structures: bool,
    gpus: Optional[str] = None,
) -> None:
    """
    Assemble and run a ``foldseek search`` command.

    Three base commands exist. GPU mode searches ``<target_db>_gpu`` with
    ``--gpu 1 --prefilter-mode 1`` and does not use ``sensitivity`` or
    ``ultra_sensitive``. Otherwise ``ultra_sensitive`` selects
    ``--exhaustive-search``, and the default passes ``--max-seqs``.
    ``extra_foldseek_params`` and, for structures, ``-a 1`` are appended.

    Args:
        query_db (Path): Path to the query database.
        target_db (Path): Path to the target database.
        result_db (Path): Path to store the result database.
        temp_db (Path): Path to store temporary files.
        threads (int): Number of threads to use for the search.
        logdir (Path): Path to the directory where logs will be stored.
        evalue (float): E-value threshold for the search.
        sensitivity (float): Sensitivity threshold for the search.
        max_seqs (int): Maximum results per query sequence allowed to pass the prefilter for foldseek.
        ultra_sensitive (bool): Whether to skip foldseek prefilter for maximum sensitivity
        extra_foldseek_params (str): Extra foldseek search params
        foldseek_gpu (bool): Run Foldseek-GPU with accelerate ungapped prefilter
        structures (bool): Run Foldseek with structures, not ProstT5 3Dis
        gpus (Optional[str]): Comma-separated CUDA indices (e.g. "0,2") to
            restrict foldseek's GPU prefilter to a subset of devices. When
            ``foldseek_gpu`` is True and this resolves to ≥1 CUDA device,
            the foldseek subprocess gets ``CUDA_VISIBLE_DEVICES`` set
            accordingly. None = use all visible CUDA GPUs (foldseek default).
            Ignored when ``foldseek_gpu`` is False.

    Returns:
        None
    """

    # support foldseek gpu only for the regular DB search for now
    if foldseek_gpu:
        cmd = f"search {query_db} {target_db}_gpu {result_db} {temp_db} --threads {str(threads)} -e {evalue}  --gpu 1 --prefilter-mode 1 --max-seqs {max_seqs}"
    elif ultra_sensitive:
        cmd = f"search {query_db} {target_db} {result_db} {temp_db} --threads {str(threads)} -e {evalue} -s {sensitivity} --exhaustive-search"
    else:
        cmd = f"search {query_db} {target_db} {result_db} {temp_db} --threads {str(threads)} -e {evalue} -s {sensitivity} --max-seqs {max_seqs}"

    trailing_options = []
    if extra_foldseek_params:
        trailing_options.append(f" {extra_foldseek_params}")
    # need -a 1 to compute the alignment so tmscore and lddt can be output (if using --structures)
    if structures:
        trailing_options.append(" -a 1")
    cmd += "".join(trailing_options)

    # Optional env for multi-GPU foldseek: only built in GPU mode with an
    # explicit device list, and only when that list resolves to a value.
    env = None
    if foldseek_gpu and gpus is not None:
        visible_devices = cuda_visible_devices_value(parse_gpus(cpu=False, gpus=gpus))
        if visible_devices is not None:
            env = {"CUDA_VISIBLE_DEVICES": visible_devices}

    search_tool = ExternalTool(
        tool="foldseek",
        input="",
        output="",
        params=cmd,
        logdir=logdir,
        env=env,
    )
    ExternalTool.run_tool(search_tool)

summarise_hits(result_db, result_db_greedy_best_hits, logdir, threads)

Get all non-overlapping top hits covering a query (designed for CATH)

Parameters:

Name Type Description Default
result_db Path

Path to the result database generated by the search.

required
result_db_greedy_best_hits Path

Path to save the greedy best hits results db.

required
logdir Path

Path to the directory where logs will be stored.

required
threads int

Number of threads to use.

required

Returns:

Type Description
None

None

Source code in src/baktfold/features/run_foldseek.py
def summarise_hits(result_db: Path, result_db_greedy_best_hits: Path, logdir: Path, threads: int) -> None:
    """
    Run ``foldseek summarizeresult`` to get all non-overlapping top hits covering a query (designed for CATH)

    Args:
        result_db (Path): Path to the result database generated by the search.
        result_db_greedy_best_hits (Path): Path to save the greedy best hits results db.
        logdir (Path): Path to the directory where logs will be stored.
        threads (int): Number of threads to use.

    Returns:
        None
    """
    summarize_params = f"summarizeresult  {result_db} {result_db_greedy_best_hits} --threads {threads} -a 1"

    summarize_tool = ExternalTool(
        tool="foldseek",
        input="",
        output="",
        params=summarize_params,
        logdir=logdir,
    )
    ExternalTool.run_tool(summarize_tool)

3Di prediction for baktfold — wraps pholdlib's shared inference engine.

Baktfold-specific: flat cds_dict (no contig nesting), Bakta hypotheticals format with in-place annotation updates, has_duplicate_locus support.

Code adapted from @mheinzinger https://github.com/mheinzinger/ProstT5/blob/main/scripts/predict_3Di_encoderOnly.py

get_embeddings(hypotheticals, cds_dict, out_path, prefix, model_dir, model_name, checkpoint_path, output_3di, output_h5_per_residue, output_h5_per_protein, half_precision, max_residues=100000, max_seq_len=30000, max_batch=10000, cpu=False, output_probs=True, save_per_residue_embeddings=False, save_per_protein_embeddings=False, threads=1, mask_threshold=0, has_duplicate_locus=False, gpus=None)

Run ProstT5 + CNN 3Di prediction for all sequences in cds_dict.

Parameters:

Name Type Description Default
hypotheticals List[Dict]

List of Bakta feature dicts (mutated in-place with "3di").

required
cds_dict Dict[str, str]

Flat {seq_id: amino_acid_str} dict.

required
out_path Path

Directory for output files.

required
prefix str

Filename prefix for CSV / JSONL outputs.

required
model_dir Path

Directory where ProstT5 is cached.

required
model_name str

HuggingFace model identifier.

required
checkpoint_path Path

Path to the CNN .pt checkpoint.

required
output_3di Path

Output FASTA path for 3Di sequences.

required
output_h5_per_residue Path

HDF5 path for per-residue embeddings.

required
output_h5_per_protein Path

HDF5 path for per-protein embeddings.

required
half_precision bool

If True, cast model + predictor to fp16 after loading.

required
max_residues int

Max total residues per inference batch.

100000
max_seq_len int

Sequences longer than this flush a batch immediately.

30000
max_batch int

Max sequences per batch.

10000
cpu bool

Force CPU inference.

False
output_probs bool

Whether to write per-residue probability JSONL.

True
save_per_residue_embeddings bool

Save per-residue HDF5.

False
save_per_protein_embeddings bool

Save per-protein HDF5.

False
threads int

Number of CPU threads for torch.

1
mask_threshold float

Residues with max softmax prob < threshold/100 → 'X'.

0
has_duplicate_locus bool

If True use feat["id"] rather than feat["locus"].

False
gpus Optional[str]

Comma-separated CUDA indices (e.g. "0,2"). None = auto-detect all visible CUDA GPUs. Overridden by cpu=True.

None

Returns:

Name Type Description
predictions Dict

Flat {seq_id: (pred, mean_prob, all_prob)} dict, in original cds_dict key order.

Source code in src/baktfold/features/predict_3Di.py
def get_embeddings(
    hypotheticals: List[Dict],
    cds_dict: Dict[str, str],
    out_path: Path,
    prefix: str,
    model_dir: Path,
    model_name: str,
    checkpoint_path: Path,
    output_3di: Path,
    output_h5_per_residue: Path,
    output_h5_per_protein: Path,
    half_precision: bool,
    max_residues: int = 100000,
    max_seq_len: int = 30000,
    max_batch: int = 10000,
    cpu: bool = False,
    output_probs: bool = True,
    save_per_residue_embeddings: bool = False,
    save_per_protein_embeddings: bool = False,
    threads: int = 1,
    mask_threshold: float = 0,
    has_duplicate_locus: bool = False,
    gpus: Optional[str] = None,
) -> Dict:
    """Predict 3Di sequences for every entry of *cds_dict* and write all outputs.

    The function resolves the inference device(s), filters out unusable
    sequences, delegates the actual ProstT5 + CNN inference to
    ``run_prostt5_inference_multi_gpu`` and finally writes the FASTA,
    optional HDF5 embeddings, probability files and ``fails.tsv``.

    Args:
        hypotheticals: List of Bakta feature dicts (mutated in-place with "3di").
        cds_dict: Flat ``{seq_id: amino_acid_str}`` dict.
        out_path: Directory for output files.
        prefix: Filename prefix for CSV / JSONL outputs.
        model_dir: Directory where ProstT5 is cached.
        model_name: HuggingFace model identifier.
        checkpoint_path: Path to the CNN ``.pt`` checkpoint.
        output_3di: Output FASTA path for 3Di sequences.
        output_h5_per_residue: HDF5 path for per-residue embeddings.
        output_h5_per_protein: HDF5 path for per-protein embeddings.
        half_precision: Request fp16 inference. Silently switched off when
            the only resolved device is the CPU.
        max_residues: Max total residues per inference batch.
        max_seq_len: Sequences longer than this flush a batch immediately.
        max_batch: Max sequences per batch.
        cpu: Force CPU inference.
        output_probs: Whether to write per-residue probability JSONL.
        save_per_residue_embeddings: Save per-residue HDF5.
        save_per_protein_embeddings: Save per-protein HDF5.
        threads: Number of CPU threads for torch.
        mask_threshold: Residues with max softmax prob < threshold/100 → 'X'.
        has_duplicate_locus: If True use feat["id"] rather than feat["locus"].
        gpus: Comma-separated CUDA indices (e.g. "0,2"). None = auto-detect
              all visible CUDA GPUs. Overridden by ``cpu=True``.

    Returns:
        predictions: Flat ``{seq_id: (pred, mean_prob, all_prob)}`` dict,
                     in original cds_dict key order. IDs that failed are
                     absent from the dict.
    """
    # Device resolution; half precision is only kept for non-CPU runs.
    devices = parse_gpus(cpu, gpus)
    logger.info(f"Beginning ProstT5 predictions on device(s): {devices}")
    if half_precision and devices == ["cpu"]:
        logger.info("CPU device — forcing full-precision (half-precision disabled).")
        half_precision = False
    logger.info(
        "Using models in half-precision"
        if half_precision
        else "Using models in full-precision"
    )

    # Split the input into usable (id, sequence, length) records and failures.
    original_keys = list(cds_dict.keys())
    seq_records: List[Tuple] = []
    fail_ids: List[str] = []
    for seq_id, aa_seq in cds_dict.items():
        if not (isinstance(aa_seq, str) and aa_seq):
            logger.warning(
                f"Protein header {seq_id} is corrupt or empty — will be saved in fails.tsv"
            )
            fail_ids.append(seq_id)
            continue
        seq_records.append((seq_id, aa_seq, len(aa_seq)))

    # Longest first, so sequences of similar length share a batch (less padding).
    seq_records.sort(key=lambda record: record[2], reverse=True)

    # Shared inference engine (handles one or several devices).
    (
        predictions,
        per_residue_embeddings,
        per_protein_embeddings,
        inference_fail_ids,
    ) = run_prostt5_inference_multi_gpu(
        seq_records,
        devices=devices,
        model_dir=model_dir,
        model_name=model_name,
        checkpoint_path=checkpoint_path,
        half_precision=half_precision,
        threads=threads,
        check_fn=check_prostT5_download,
        zenodo_fn=download_zenodo_prostT5,
        max_residues=max_residues,
        max_seq_len=max_seq_len,
        max_batch=max_batch,
        output_probs=output_probs,
        save_per_residue_embeddings=save_per_residue_embeddings,
        save_per_protein_embeddings=save_per_protein_embeddings,
        desc="Predicting 3Di",
    )
    fail_ids.extend(inference_fail_ids)

    # Undo the length sort: re-key the results in the caller's original order.
    ordered_predictions = {}
    for key in original_keys:
        if key in predictions:
            ordered_predictions[key] = predictions[key]
    predictions = ordered_predictions

    # Outputs.
    out_dir = Path(out_path)
    if fail_ids:
        write_fail_ids(fail_ids, out_dir / "fails.tsv")

    write_predictions(
        hypotheticals, predictions, output_3di, mask_threshold, has_duplicate_locus
    )

    if save_per_residue_embeddings:
        write_embeddings(per_residue_embeddings, output_h5_per_residue)
    if save_per_protein_embeddings:
        write_embeddings(per_protein_embeddings, output_h5_per_protein)

    mean_probs_path = out_dir / f"{prefix}_prostT5_3di_mean_probabilities.csv"
    if output_probs:
        all_probs_path = out_dir / f"{prefix}_prostT5_3di_all_probabilities.json"
    else:
        all_probs_path = None
    write_probs(predictions, mean_probs_path, all_probs_path, original_keys)

    return predictions

write_embeddings(embeddings, out_path)

Write per-residue or per-protein embeddings to HDF5 (flat key structure).

Source code in src/baktfold/features/predict_3Di.py
def write_embeddings(embeddings: Dict[str, Any], out_path: Path) -> None:
    """Store each embedding as its own top-level dataset of one HDF5 file.

    Args:
        embeddings: Mapping of sequence ID to embedding. The ID becomes the
            dataset name and the value is handed to h5py unchanged.
        out_path: Destination HDF5 file, opened in ``"w"`` mode.
    """
    h5_target = str(out_path)
    with h5py.File(h5_target, "w") as h5_file:
        for dataset_name, values in embeddings.items():
            h5_file.create_dataset(dataset_name, data=values)

write_predictions(hypotheticals, predictions, out_path, mask_threshold, has_duplicate_locus=False)

Write 3Di predictions to FASTA and update Bakta hypotheticals in-place.

Parameters:

Name Type Description Default
hypotheticals List[Dict]

List of Bakta feature dicts. Each is mutated in-place with a "3di" key set to the predicted 3Di string (or None if prediction failed / was skipped).

required
predictions Dict[str, Tuple]

Flat {seq_id: (pred, mean_prob, all_prob)} dict.

required
out_path Path

Output FASTA path.

required
mask_threshold float

Residues with max softmax prob (0–100) below this threshold are replaced with 'X'.

required
has_duplicate_locus bool

If True, use feat["id"] as seq_id (needed for eukaryotic inputs that may have duplicate locus tags). Otherwise use feat["locus"].

False
Source code in src/baktfold/features/predict_3Di.py
def write_predictions(
    hypotheticals: List[Dict],
    predictions: Dict[str, Tuple],
    out_path: Path,
    mask_threshold: float,
    has_duplicate_locus: bool = False,
) -> None:
    """Write 3Di predictions to FASTA and update Bakta hypotheticals in-place.

    Args:
        hypotheticals: List of Bakta feature dicts. Each is mutated in-place
                       with a ``"3di"`` key set to the predicted 3Di string
                       (or None if there is no usable prediction for it,
                       which includes zero-length predictions).
        predictions: Flat ``{seq_id: (pred, mean_prob, all_prob)}`` dict.
                     When masking is active the ``pred`` arrays are modified
                     in place, so the caller sees the masked states too.
        out_path: Output FASTA path.
        mask_threshold: Residues with max softmax prob (0–100) below this
                        threshold are replaced with 'X'. A value <= 0
                        disables masking.
        has_duplicate_locus: If True, use ``feat["id"]`` as seq_id (needed for
                             eukaryotic inputs that may have duplicate locus tags).
                             Otherwise use ``feat["locus"]``.
    """
    mask_prop = mask_threshold / 100

    # drop zero-length predictions (issue #47)
    predictions = {k: v for k, v in predictions.items() if len(v[0]) > 0}

    # Confidence masking, in place on the pred index arrays. Probabilities
    # cannot be negative, so nothing can fall below a threshold <= 0; skipping
    # the loop then avoids a pure-Python pass over every residue.
    if mask_prop > 0:
        for pred, _mean_prob, all_prob in predictions.values():
            # NOTE(review): all_prob[0] is assumed to hold the per-residue
            # maximum probability, as the original indexing implies — confirm.
            residue_probs = all_prob[0]
            for i in range(len(pred)):
                if residue_probs[i] < mask_prop:
                    pred[i] = 20  # 'X'

    id_key = "id" if has_duplicate_locus else "locus"

    # Plain write mode: the file is never read back here.
    with open(out_path, "w") as out_f:
        for feat in hypotheticals:
            seq_id = feat[id_key]
            pred_tuple = predictions.get(seq_id)
            if pred_tuple is None:
                feat["3di"] = None  # no prediction (OOM / corrupt entry)
                continue
            threedi_seq = "".join(SS_MAPPING[int(y)] for y in pred_tuple[0])
            feat["3di"] = threedi_seq  # mutate Bakta feature dict in-place
            out_f.write(f">{seq_id}\n{threedi_seq}\n")

    logger.info(f"Finished writing 3Di FASTA to {out_path}")

autotune_batching_real_data(model_dir, model_name, cpu, threads, probe_seqs, start_bs=1, max_bs=100, step=5, device=None)

Autotunes the batch size for a given model and set of sequences.

Parameters:

Name Type Description Default
model_dir str

The directory where the model is stored.

required
model_name str

The name of the model.

required
cpu bool

Whether to use the CPU or not.

required
threads int

The number of threads to use.

required
probe_seqs list

A list of sequences to use for probing.

required
start_bs int

The starting batch size to use.

1
max_bs int

The maximum batch size to use.

100
step int

The step size to use when increasing the batch size.

5
device Optional[str]

Torch device string (e.g. "cuda:1") to pin autotune to a specific GPU. None preserves the original auto-detection behaviour. Used by the multi-GPU caller.

None

Returns:

Name Type Description
int

The optimal batch size.

int

The maximum number of residues per batch.

Examples:

>>> autotune_batching_real_data("model_dir", "model_name", True, 4, ["ATCG", "GCTA"], 1, 100, 5)
(10, 100)
Source code in src/baktfold/features/autotune.py
def autotune_batching_real_data(
    model_dir,
    model_name,
    cpu,
    threads,
    probe_seqs,
    start_bs=1,
    max_bs=100,
    step=5, # step size
    device: Optional[str] = None,
):
    """
    Autotunes the batch size for a given model and set of sequences.

    Every candidate batch size (``start_bs``, ``start_bs + step``, ... up to
    ``max_bs``) is timed on a full forward pass over ``probe_seqs``. The
    search stops at the first candidate that raises a ``RuntimeError``
    (which includes CUDA out-of-memory), and the candidate with the lowest
    time per token among the successful ones is returned.

    Args:
      model_dir (str): The directory where the model is stored.
      model_name (str): The name of the model.
      cpu (bool): Whether to use the CPU or not.
      threads (int): The number of threads to use.
      probe_seqs (list): A list of sequences to use for probing. Must contain
        at least one non-empty sequence.
      start_bs (int): The starting batch size to use (>= 1).
      max_bs (int): The maximum batch size to use.
      step (int): The step size to use when increasing the batch size (>= 1).
      device (Optional[str]): Torch device string (e.g. "cuda:1") to pin
        autotune to a specific GPU. None preserves the original
        auto-detection behaviour. Used by the multi-GPU caller.

    Returns:
      tuple: ``(best_bs, best_residues)`` - the fastest batch size and the
      average number of residues per batch observed for it.

    Raises:
      ValueError: If ``start_bs`` or ``step`` is smaller than 1, or if
        ``probe_seqs`` contains no residues.
      RuntimeError: If no candidate batch size completed successfully.
    """

    # Fail fast, before loading the model: a non-positive step would loop
    # forever and empty probe data would end in a ZeroDivisionError.
    if start_bs < 1 or step < 1:
        raise ValueError(
            f"start_bs and step must be >= 1 (got start_bs={start_bs}, step={step})"
        )
    if not any(len(s) for s in probe_seqs):
        raise ValueError("probe_seqs must contain at least one non-empty sequence")

    model, tokenizer, device = get_T5_model(
        model_dir, model_name, cpu, threads, device=device
    )
    model.eval()
    # NOTE(review): half precision is applied regardless of the device,
    # including CPU — confirm this is intended.
    model.half()

    bs = start_bs
    results = []
    last_error = None

    while bs <= max_bs:
        try:
            logger.info(f"Running with batch size {bs}")

            total_tokens = 0
            total_time = 0.0
            batches = 0

            # iterate over real sequences in batches
            for i in tqdm(range(0, len(probe_seqs), bs), desc="Processing"):
                batch_seqs = probe_seqs[i : i + bs]
                total_tokens += sum(len(s) for s in batch_seqs)

                inputs = tokenizer(
                    batch_seqs,
                    padding=True,
                    return_tensors="pt",
                )
                inputs.pop("token_type_ids", None)
                inputs = {k: v.to(device) for k, v in inputs.items()}

                # timing — device_synchronize handles CUDA/MPS/XPU/CPU (PR #129)
                device_synchronize(device)
                t0 = time.perf_counter()
                with torch.no_grad():
                    _ = model(**inputs)
                device_synchronize(device)

                total_time += time.perf_counter() - t0
                batches += 1

        except (torch.cuda.OutOfMemoryError, RuntimeError) as err:
            # RuntimeError covers XPU/MPS OOM; torch.cuda.OutOfMemoryError covers CUDA.
            # Any RuntimeError ends the search, so report it rather than hide it.
            last_error = err
            logger.warning(
                f"Batch size {bs} failed ({type(err).__name__}: {err}); stopping the search"
            )
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            break

        time_per_token = total_time / total_tokens
        token_per_batch = math.floor(total_tokens / batches)

        results.append({
            "bs": bs,
            "tokens_per_batch": token_per_batch,
            "time": total_time,
            "time_per_token": time_per_token,
        })

        logger.info(f"Time elapsed {round(total_time,5)}")
        logger.info(f"Tokens per batch {token_per_batch}")

        bs += step

    if not results:
        # Chain the triggering error (if any) so the real cause stays visible.
        raise RuntimeError("No batch size fits on this GPU") from last_error

    best_entry = min(results, key=lambda x: x["time_per_token"])

    best_bs = best_entry["bs"]
    best_residues = best_entry["tokens_per_batch"]

    logger.info("##########################")
    logger.info(f"Best batch size: {best_bs}")

    return best_bs, best_residues

run_autotune(input_path, model_dir, model_name, cpu, threads, step, min_batch, max_batch, sample_seqs, gpus=None)

Runs the batch size autotuning process.

Parameters:

Name Type Description Default
input_path str

The path to the input file.

required
model_dir str

The directory where the model is stored.

required
model_name str

The name of the model.

required
cpu bool

Whether to use the CPU or not.

required
threads int

The number of threads to use.

required
step int

The step size to use when increasing the batch size.

required
min_batch int

The minimum batch size to use.

required
max_batch int

The maximum batch size to use.

required
sample_seqs int

The number of sequences to sample for probing.

required
gpus Optional[str]

Comma-separated CUDA indices (e.g. "0,2"). When set, autotune runs on the lowest selected index. Default None = existing behaviour (cuda:0 / mps / xpu / cpu auto-detect).

None

Returns:

Name Type Description
int

The optimal batch size.

Examples:

>>> run_autotune("input_path", "model_dir", "model_name", True, 4, 5, 1, 100, 10)
10
Source code in src/baktfold/features/autotune.py
def run_autotune(
    input_path,
    model_dir,
    model_name,
    cpu,
    threads,
    step,
    min_batch,
    max_batch,
    sample_seqs,
    gpus: Optional[str] = None,
):
    """
    Runs the batch size autotuning process.

    Reads the protein FASTA at ``input_path``, samples up to ``sample_seqs``
    sequences from it and benchmarks batch sizes from ``min_batch`` to
    ``max_batch`` with ``autotune_batching_real_data``.

    Args:
      input_path (str): The path to the input file.
      model_dir (str): The directory where the model is stored.
      model_name (str): The name of the model.
      cpu (bool): Whether to use the CPU or not.
      threads (int): The number of threads to use.
      step (int): The step size to use when increasing the batch size.
      min_batch (int): The minimum batch size to use.
      max_batch (int): The maximum batch size to use.
      sample_seqs (int): The number of sequences to sample for probing.
      gpus (Optional[str]): Comma-separated CUDA indices (e.g. "0,2"). When
        set, autotune runs on the first device returned by ``parse_gpus``.
        Default None = existing behaviour (cuda:0 / mps / xpu / cpu
        auto-detect).

    Returns:
      int: The optimal batch size.
    """

    # Resolve devices early so we can pick the autotune GPU (homogeneous-card
    # assumption: same batch size applies to every GPU we'll later use).
    devices = parse_gpus(cpu, gpus)
    autotune_device: Optional[str] = None
    if len(devices) >= 1 and devices != ["cpu"]:
        autotune_device = devices[0]
    if len(devices) > 1:
        logger.info(
            f"Multi-GPU detected ({len(devices)} devices); autotuning on "
            f"{autotune_device} and applying the chosen batch to all devices."
        )

    # {protein_id: amino acid sequence}. Keyed by ID, so a repeated ID keeps
    # only its last sequence.
    cds_dict = {}

    with open_protein_fasta_file(input_path) as handle:  # handles gzip too
        records = list(SeqIO.parse(handle, "fasta"))
        if not records:
            logger.warning(f"No proteins were found in your input file {input_path}.")
            logger.error(
                f"Your input file {input_path} is likely not a amino acid FASTA file. Please check this."
            )
        for record in records:
            cds_dict[record.id] = str(record.seq)

    if not cds_dict:
        logger.error(f"Error: no AA protein sequences found in {input_path} file")

    # Empty sequences cannot be used for probing.
    seqs = [seq for seq in cds_dict.values() if seq]

    logger.info("Beginning batch size tuning")
    # Report the real lower bound: the search starts at min_batch, not always 1.
    logger.info(
        f"Using minimum batch size of {min_batch} and maximum batch size of {max_batch}"
    )

    probe_seqs = sample_probe_sequences(seqs, n=sample_seqs)

    batch_size, max_residues = autotune_batching_real_data(
        model_dir,
        model_name,
        cpu,
        threads,
        probe_seqs,
        start_bs=min_batch,
        max_bs=max_batch,
        step=step, # step size
        device=autotune_device,
    )

    logger.info(f"Optimal batch size is {batch_size} (residues per batch {max_residues})")

    return batch_size

sample_probe_sequences(seqs, n=5000, seed=0)

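Sample up to n probe sequences reproducibly (seeded random generator) and return them sorted by length, longest first.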

Source code in src/baktfold/features/autotune.py
def sample_probe_sequences(seqs, n=5000, seed=0):
    """
    Pick up to ``n`` sequences for probing and order them longest first.

    Args:
      seqs (list): The candidate sequences.
      n (int): Number of sequences to draw. If it is at least ``len(seqs)``,
        every sequence is used and no random draw takes place.
      seed (int): Seed for the private random generator, so the same input
        always yields the same sample.

    Returns:
      list: A new list of the selected sequences sorted by descending length.
        ``seqs`` itself is left untouched.
    """
    rng = random.Random(seed)
    chosen = rng.sample(seqs, n) if n < len(seqs) else list(seqs)
    return sorted(chosen, key=len, reverse=True)

annotate_aa(aas)

Combines IPS and PSC annotations and marks hypotheticals.

Parameters:

Name Type Description Default
aas Sequence[dict]

A sequence of amino acid dictionaries to annotate.

required

Returns:

Type Description

None

Examples:

>>> aas = [{'sequence': 'ATG', 'start': 1, 'stop': 3, 'strand': '+'}]
>>> annotate_aa(aas)
>>> aas
[{'sequence': 'ATG', 'start': 1, 'stop': 3, 'strand': '+'}]
Source code in src/baktfold/bakta/annotation.py
def annotate_aa(aas: Sequence[dict], fast: bool = False):
    """
    Runs ``combine_annotation`` on every feature and reports the hypotheticals.

    Each feature dict is annotated in place. Afterwards the features that
    carry a ``'hypothetical'`` key are counted and the count is printed.

    Args:
      aas (Sequence[dict]): The feature dictionaries to annotate.
      fast (bool): Forwarded to ``combine_annotation``; if True, AFDB hits
        are skipped. Defaults to False.

    Returns:
      None
    """

    print('\tcombine annotations and mark hypotheticals...')

    for aa in aas:
        # combine_annotation requires the 'fast' flag; calling it with the
        # feature alone raised a TypeError.
        combine_annotation(aa, fast)
    logger.debug('analyze hypotheticals')
    hypotheticals = [aa for aa in aas if 'hypothetical' in aa]
    if hypotheticals:
        print(f'\tanalyze hypothetical proteins: {len(hypotheticals)}')
        # NOTE(review): no statistics are computed in this function; the
        # message is kept unchanged for output compatibility.
        print('\tcalculated proteins statistics')

calc_annotation_score(orf)

Calculates the annotation score for a given ORF.

Parameters:

Name Type Description Default
orf dict

The ORF to calculate the annotation score for.

required

Returns:

Name Type Description
int int

The annotation score for the given ORF.

Examples:

>>> calc_annotation_score(orf)
Source code in src/baktfold/bakta/annotation.py
def calc_annotation_score(orf: dict) -> int:
    """
    Counts how many of the ``'gene'`` and ``'product'`` fields of an ORF are set.

    Args:
      orf (dict): The ORF to score.

    Returns:
      int: 0, 1 or 2 - one point for each of ``'gene'`` and ``'product'``
        that is present with a truthy value.

    Examples:
      >>> calc_annotation_score({'gene': 'dnaA', 'product': None})
      1
    """
    scored_keys = ('gene', 'product')
    return sum(1 for key in scored_keys if orf.get(key))

combine_annotation(feature, fast)

Combines annotation information from different sources into a single feature.

Parameters:

Name Type Description Default
feature dict

The feature to combine annotation for.

required
fast bool

If True, AFDB hits are ignored: they contribute neither the product name nor a db_xref.

required

Returns:

Type Description

None

Examples:

>>> combine_annotation(feature, fast=False)
Source code in src/baktfold/bakta/annotation.py
def combine_annotation(feature: dict, fast: bool):
    """
    Combines annotation information from different sources into a single feature.

    The hits stored under ``feature['pstc']`` (a dict, a string, or a
    list of those) are used to choose a product name and to collect database
    cross-references. The feature is modified in place:

    * ``'product'`` is taken from the highest-priority hit, in the order
      custom_db > swissprot > afdb > pdb > cath, when that hit has a
      non-empty description. Otherwise the existing product is kept.
    * ``'db_xrefs'`` receives one prefixed entry per hit that has an ID.
    * The feature is flagged via ``mark_as_baktfold`` whenever ``'pstc'`` is
      non-empty.
    * The feature ends up marked or unmarked as hypothetical depending on
      the revised product.

    Args:
      feature (dict): The feature to combine annotation for.
      fast (bool): If True, AFDB hits are ignored for both the product name
        and the db_xrefs.

    Returns:
      None

    Examples:
      >>> combine_annotation(feature, fast=False)
    """

    pstc = feature.get('pstc', None)
    product = feature.get('product', None)

    # Collect into a set built from a copy: the caller's list is never
    # appended to in place, and repeated hits cannot produce duplicates.
    db_xrefs = set(feature.get('db_xrefs') or [])

    if pstc:
        # Always normalize pstc to a list
        if isinstance(pstc, (dict, str)):
            pstc = [pstc]

        hits = [p for p in pstc if isinstance(p, dict)]

        # Product hierarchy: custom first, then SwissProt, AFDB (unless
        # fast), PDB, CATH. The first source that has a hit decides, even
        # if its description turns out to be empty.
        if fast:
            priority = ('custom_db', 'swissprot', 'pdb', 'cath')
        else:
            priority = ('custom_db', 'swissprot', 'afdb', 'pdb', 'cath')

        pstc_product = None
        for source in priority:
            hit = next((p for p in hits if p.get('source') == source), None)
            if hit:
                # .get(): a hit without a description must not raise
                pstc_product = hit.get('description')
                break

        if pstc_product:
            product = pstc_product

        # Collect all db_xref IDs
        xref_prefixes = {
            'afdb': 'afdb_v6:afdbclusters_',
            'swissprot': 'afdb_v6:swissprot_',
            'pdb': 'pdb:pdb_',
            'cath': 'cath:cath_',
            'custom_db': 'custom:custom_',
        }
        for entry in pstc:
            if isinstance(entry, dict):
                src = (entry.get('source') or '').lower()
                eid = entry.get('id')
                if not eid:
                    continue
                if src == 'afdb' and fast:
                    continue
                prefix = xref_prefixes.get(src)
                # IDs from unknown sources are kept without a prefix
                db_xrefs.add(eid if prefix is None else f"{prefix}{eid}")
            elif isinstance(entry, str):
                # Preserve any existing string cross-references
                db_xrefs.add(entry)

        # mark as baktfold
        mark_as_baktfold(feature)

    if product and "hypothetical protein" not in product.lower():
        product = revise_cds_product(product)
        if product:
            if cfg.compliant:
                product = insdc.revise_product_insdc(product)
            feature['product'] = product

            unmark_as_hypothetical(feature)
        else:
            mark_as_hypothetical(feature)
    else:
        mark_as_hypothetical(feature)

    feature['db_xrefs'] = sorted(db_xrefs)

extract_protein_gene_symbol(product)

Extracts a valid gene symbol from a protein name.

Parameters:

Name Type Description Default
product str

The protein name to extract a gene symbol from.

required

Returns:

Name Type Description
str str

The extracted gene symbol.

Examples:

>>> extract_protein_gene_symbol(product)
Source code in src/baktfold/bakta/annotation.py
def extract_protein_gene_symbol(product: str) -> str:
    """
    Pulls a gene symbol out of a protein product name.

    The name is split on single spaces. A word that fully matches
    ``RE_GENE_SYMBOL`` is taken as is; a word that fully matches
    ``RE_PROTEIN_SYMBOL`` is taken with its first character lowercased.

    Args:
      product (str): The protein name to extract a gene symbol from.

    Returns:
      str: The only symbol if exactly one was found, the second one if
        several were found, or None if there was none.
    """
    candidates = []
    for word in product.split(' '):
        gene_match = RE_GENE_SYMBOL.fullmatch(word)
        if gene_match:
            candidate = gene_match[0]
        else:
            protein_match = RE_PROTEIN_SYMBOL.fullmatch(word)  # protein-style name
            if not protein_match:
                continue
            candidate = protein_match[0]
            candidate = candidate[0].lower() + candidate[1:]
        logger.info('fix gene: extract symbol from protein name. symbol=%s', candidate)
        candidates.append(candidate)

    if not candidates:
        return None
    if len(candidates) == 1:
        return candidates[0]
    # Several hits: the first often names a broader gene family
    # ("xyz family transcriptional regulator ..."), so prefer the second.
    return candidates[1]

mark_as_baktfold(feature)

Adds the baktfold key to the given feature dictionary.

Parameters:

Name Type Description Default
feature dict

The feature dictionary to add the baktfold key to.

required

Returns:

Type Description

None

Examples:

>>> feature = {'sequence': 'ATG', 'start': 1, 'stop': 3, 'strand': '+'}
>>> mark_as_baktfold(feature)
>>> feature
{'sequence': 'ATG', 'start': 1, 'stop': 3, 'strand': '+', 'baktfold': True}
Source code in src/baktfold/bakta/annotation.py
def mark_as_baktfold(feature: dict):
    """
    Flags a feature as annotated by baktfold.

    Sets ``feature['baktfold']`` to True in place; every other key is left
    as it was.

    Args:
      feature (dict): The feature dictionary to flag.

    Returns:
      None

    Examples:
      >>> feature = {'start': 1, 'stop': 3}
      >>> mark_as_baktfold(feature)
      >>> feature
      {'start': 1, 'stop': 3, 'baktfold': True}
    """
    feature['baktfold'] = True

mark_as_hypothetical(feature)

Marks a feature as hypothetical.

Parameters:

Name Type Description Default
feature dict

The feature to mark as hypothetical.

required

Returns:

Type Description

None

Examples:

>>> mark_as_hypothetical(feature)
Source code in src/baktfold/bakta/annotation.py
def mark_as_hypothetical(feature: dict):
    """
    Resets a feature to the hypothetical-protein state.

    In place: sets ``'hypothetical'`` to True, clears ``'gene'`` (None) and
    ``'genes'`` (empty list), and sets ``'product'`` to
    ``bc.HYPOTHETICAL_PROTEIN``.

    Args:
      feature (dict): The feature to mark as hypothetical.

    Returns:
      None
    """
    feature.update({
        'hypothetical': True,
        'gene': None,
        'genes': [],
        'product': bc.HYPOTHETICAL_PROTEIN,
    })

revise_cds_gene_symbols(raw_genes)

Revises a list of gene symbols to ensure they are valid.

Parameters:

Name Type Description Default
raw_genes Sequence[str]

The list of gene symbols to revise.

required

Returns:

Name Type Description
list

The revised list of gene symbols.

Examples:

>>> revise_cds_gene_symbols(raw_genes)
Source code in src/baktfold/bakta/annotation.py
def revise_cds_gene_symbols(raw_genes: Sequence[str]):
    """
    Revises a list of gene symbols to ensure they are valid.

    Each symbol goes through these steps, in order:

    1. It is dropped if it contains suspect characters
       (``RE_GENE_SUSPECT_CHARS``).
    2. The literal ``gene`` is removed.
    3. A trailing orphan hyphen is removed.
    4. Whitespace runs are collapsed and the symbol is stripped.
    5. The first character is lowercased if the symbol matches
       ``RE_GENE_CAPITALIZED``.
    6. Symbols shorter than 3 characters are dropped. Symbols longer than 12
       characters are replaced by whatever ``extract_protein_gene_symbol``
       finds in them, if anything.

    Args:
      raw_genes (Sequence[str]): The list of gene symbols to revise.

    Returns:
      list: The revised gene symbols without duplicates, in order of first
        appearance.
    """
    # A dict serves as an insertion-ordered set, so the result order is the
    # same on every run (a plain set has no stable order for strings).
    revised_genes = {}
    for gene in raw_genes:
        if RE_GENE_SUSPECT_CHARS.search(gene):  # check for suspect characters -> remove gene symbol
            logger.info('fix gene: remove gene symbol containing suspect chars. old=%s', gene)
            continue

        old_gene = gene
        gene = gene.replace('gene', '')
        if gene != old_gene:  # remove gene literal
            logger.info('fix gene: remove gene literal. new=%s, old=%s', gene, old_gene)

        old_gene = gene
        # endswith() instead of gene[-1]: the symbol can be empty here
        # (e.g. input "gene"), and indexing it raised an IndexError.
        if gene.endswith('-'):  # remove orphan hyphen
            gene = gene[:-1]
            logger.info('fix gene: remove orphan hypen. new=%s, old=%s', gene, old_gene)

        old_gene = gene
        gene = RE_MULTIWHITESPACE.sub(' ', gene).strip()  # revise whitespaces
        if gene != old_gene:
            logger.info('fix gene: revise whitespaces. new=%s, old=%s', gene, old_gene)

        old_gene = gene
        if RE_GENE_CAPITALIZED.fullmatch(gene):
            gene = gene[0].lower() + gene[1:]
            logger.info('fix gene: lowercase first char. new=%s, old=%s', gene, old_gene)

        if len(gene) < 3:
            continue
        if len(gene) <= 12:
            revised_genes[gene] = None
        else:
            # too long for a symbol: try to find one inside the text
            extracted = extract_protein_gene_symbol(gene)
            if extracted:
                revised_genes[extracted] = None
    return list(revised_genes)

revise_cds_product(product)

Revise product name for INSDC compliant submissions

Source code in src/baktfold/bakta/annotation.py
def revise_cds_product(product: str):
    """Revise product name for INSDC compliant submissions

    The product text is passed through a fixed sequence of rewrite steps;
    every step that changes the text is logged with the old and new value.

    Args:
      product (str): The raw product description.

    Returns:
      The revised product description. It is set to an empty string when the
      text is a pure tmRNA description, and to None when the text is judged
      suspect by the final check (see the log message of that step).
    """

    # from gb 
    # grep "Uncharacterized protein" AFDBClusters.tsv | wc -l
    #     805448

    if "Uncharacterized protein" in product:
        old_product = product
        product = "hypothetical protein"
        if product != old_product:
            logger.info('fix product: renamed uncharacterized protein as hypothetical. new=%s, old=%s', product, old_product)

    # from bakta

    old_product = product
    product = RE_PROTEIN_WEIGHT.sub(' ', product)  # remove protein weight in (k)Da
    if(product != old_product):
        logger.info('fix product: remove protein weight in (k)Da. new=%s, old=%s', product, old_product)

    old_product = product
    product = re.sub(RE_PROTEIN_PERIOD_SEPARATOR, r'\1-\2', product)  # replace separator periods
    if(product != old_product):
        logger.info('fix product: replace separator periods. new=%s, old=%s', product, old_product)

    old_product = product
    # guard against an empty product: product[0] would raise IndexError
    if(product and product[0] in RE_PROTEIN_SUSPECT_CHARS_BEGINNING):  # remove suspect first character
        product = product[1:]
        logger.info('fix product: replace invalid first character. new=%s, old=%s', product, old_product)

    old_product = product
    product = RE_PROTEIN_SUSPECT_CHARS_DISCARD.sub('', product)  # remove suspect characters
    if(product != old_product):
        logger.info('fix product: replace invalid characters. new=%s, old=%s', product, old_product)

    old_product = product
    product = RE_PROTEIN_SUSPECT_CHARS_REPLACE.sub(' ', product)  # replace suspect characters by single whitespace
    if(product != old_product):
        logger.info('fix product: replace invalid characters. new=%s, old=%s', product, old_product)

    old_product = product
    product = RE_PROTEIN_WRONG_PRIMES.sub('\u0027', product)  # replace wrong prime characters with single quote (U+0027) (') according to https://www.ncbi.nlm.nih.gov/genome/doc/internatprot_nomenguide/
    if(product != old_product):
        logger.info('fix product: replace wrong prime characters. new=%s, old=%s', product, old_product)

    old_product = product
    product = product.replace('FOG:', '')  # remove FOG ids
    if(product != old_product):
        logger.info('fix product: replace FOG ids. new=%s, old=%s', product, old_product)

    old_product = product
    product = RE_PROTEIN_REMNANT.sub('', product)  # remove 'Remnant of's
    if(product != old_product):
        logger.info('fix product: replace remnant ofs. new=%s, old=%s', product, old_product)

    old_product = product
    dufs = []  # replace DUF-containing products
    for m in RE_DOMAIN_OF_UNKNOWN_FUNCTION.finditer(product):
        dufs.append(m.group(1).upper())
    if(len(dufs) >= 1):
        # the whole product is replaced by the list of DUF ids found in it
        product = f"{' '.join(dufs)} domain{'s' if len(dufs) > 1 else ''}-containing protein"
        if(product != old_product):
            logger.info('fix product: revise DUF. new=%s, old=%s', product, old_product)

    old_product = product
    if('conserved' in product.lower()):  # replace conserved UPF proteins
        upfs = []
        for m in RE_UNCHARACTERIZED_PROTEIN_FAMILY.finditer(product):
            upfs.append(m.group(1).upper())
        if(len(upfs) >= 1):
            product = f"{' '.join(upfs)} protein"
            if(product != old_product):
                logger.info('fix product: revise UPF. new=%s, old=%s', product, old_product)

    old_product = product
    product = RE_PROTEIN_HOMOLOG.sub('-like protein', product)  # replace Homologs
    if(product != old_product):
        if(product.count('protein') == 2):
            product = product.replace('protein', '', 1)  # remove former protein term if existing
        logger.info('fix product: replace Homolog. new=%s, old=%s', product, old_product)

    old_product = product
    product = RE_MULTIWHITESPACE.sub(' ', product).strip()  # revise whitespaces
    if(product != old_product):
        logger.info('fix product: revise whitespaces. new=%s, old=%s', product, old_product)

    old_product = product
    product = RE_PROTEIN_PUTATIVE.sub('putative', product)  # replace putative synonyms)
    if(product != old_product):
        logger.info('fix product: replace putative synonyms. new=%s, old=%s', product, old_product)

    old_product = product
    if(RE_PROTEIN_DOMAIN_CONTAINING.search(product)):  # replace domain name underscores in domain names
        product = product.replace('_', '-')
        if(product != old_product):
            logger.info('fix product: replace domain name underscores. new=%s, old=%s', product, old_product)

    old_product = product
    if(RE_PROTEIN_TMRNA.fullmatch(product)):
        product = ''
        logger.info('fix product: discard pure tmRNA product descriptions. new=%s, old=%s', product, old_product)

    old_product = product
    if(
        RE_PROTEIN_CONTIG.search(product) or  # protein containing 'sequence'
        RE_PROTEIN_NODE.search(product) or  # potential contig name (SPAdes)
        RE_PROTEIN_POTENTIAL_CONTIG_NAME.search(product) or  # potential contig name (SPAdes)
        RE_PROTEIN_NO_LETTERS.fullmatch(product)  # no letters -> set to Hypothetical
        ):  # remove suspect products and mark as hypothetical
        product = None
        logger.info('remove product: mark proteins with suspect products as hypothetical. old=%s', old_product)

    return product

unmark_as_hypothetical(feature)

Removes the hypothetical key from the given feature dictionary.

Parameters:

Name Type Description Default
feature dict

The feature dictionary to remove the hypothetical key from.

required

Returns:

Type Description

None

Examples:

>>> feature = {'sequence': 'ATG', 'start': 1, 'stop': 3, 'strand': '+'}
>>> unmark_as_hypothetical(feature)
>>> feature
{'sequence': 'ATG', 'start': 1, 'stop': 3, 'strand': '+'}
Source code in src/baktfold/bakta/annotation.py
def unmark_as_hypothetical(feature: dict):
    """
    Drops the 'hypothetical' entry from a feature dictionary, if present.

    The dictionary is modified in place; all other keys are left untouched
    and a feature without the key is left unchanged.

    Args:
      feature (dict): The feature dictionary to modify.

    Returns:
      None

    Examples:
      >>> feature = {'sequence': 'ATG', 'hypothetical': True}
      >>> unmark_as_hypothetical(feature)
      >>> feature
      {'sequence': 'ATG'}
    """
    if 'hypothetical' in feature:
        del feature['hypothetical']  # remove completely

check_content_size(file_name, file_path)

Checks if a file is empty.

Parameters:

Name Type Description Default
file_name str

The name of the file to check.

required
file_path Path

The path to the file to check.

required

Returns:

Type Description

None.

Examples:

>>> check_content_size('file', Path('path/to/file'))
None
Source code in src/baktfold/bakta/config.py
def check_content_size(file_name: str, file_path: Path):
    """
    Aborts the program if the given file has a size of zero bytes.

    Args:
      file_name (str): Human-readable label of the file, used in messages only.
      file_path (Path): The path to the file to check.

    Returns:
      None. On an empty file the error is logged and sys.exit() is called.

    Examples:
      >>> check_content_size('file', Path('path/to/file'))
    """
    size_in_bytes = file_path.stat().st_size
    if(size_in_bytes != 0):
        return
    log.error('empty %s file! path=%s', file_name, file_path)
    sys.exit(f'ERROR: {file_name} file ({file_path}) is empty!')

check_db_path(args)

Checks the path to the database.

Parameters:

Name Type Description Default
args Namespace

The arguments passed to the program.

required

Returns:

Name Type Description
Path Path

The path to the database.

Examples:

>>> check_db_path(args)
Path('path/to/db')
Source code in src/baktfold/bakta/config.py
def check_db_path(args: Namespace) -> Path:
    """
    Checks the path to the database.

    The database directory is looked up in this order and the first source
    that is present decides the outcome:

      1. the `db` attribute of `args` (if truthy),
      2. the BAKTA_DB environment variable (if set),
      3. a `db` directory next to this module.

    The chosen directory is stored in the module-level `db_path`.

    Args:
      args (Namespace): The arguments passed to the program.

    Returns:
      Path: The path to the database.

    Raises:
      SystemExit: If the selected source does not point to an existing directory.

    Examples:
      >>> check_db_path(args)
      Path('path/to/db')
    """
    global db_path
    if(args.db):
        db_dir = args.db
        log.debug('test parameter db: db_tmp=%s', db_dir)
        try:
            db_tmp_path = Path(db_dir).resolve()
            if(db_tmp_path.is_dir()):
                db_path = db_tmp_path
                log.info('database: type=parameter, path=%s', db_path)
            else:
                log.error('unvalid database path: type=parameter, path=%s', db_tmp_path)
                raise IOError()
        except Exception:  # not a bare except: Ctrl-C must not be reported as a wrong path
            sys.exit(f'ERROR: wrong database path! --db={db_dir}')
    elif('BAKTA_DB' in os.environ):
        db_dir = os.environ['BAKTA_DB']
        log.debug('test env db: db_tmp=%s', db_dir)
        try:
            db_tmp_path = Path(db_dir).resolve()
            if(db_tmp_path.is_dir()):
                db_path = db_tmp_path
                log.info('database: type=environment, path=%s', db_path)
            else:
                log.error('unvalid database path: type=environment, path=%s', db_tmp_path)
                raise IOError()
        except Exception:
            sys.exit(f'ERROR: wrong database path! BAKTA_DB={db_dir}')
    else:
        # last resort: a 'db' directory shipped alongside this module
        base_dir = Path(__file__).parent
        db_tmp_path = base_dir.joinpath('db')
        log.debug('test base_dir db: db_tmp=%s', db_tmp_path)
        if(db_tmp_path.is_dir()):
            db_path = db_tmp_path
            log.info('database: type=base-dir, path=%s', db_path)
        else:
            log.error('unvalid database path: type=base-dir, path=%s', db_tmp_path)
            sys.exit('ERROR: database neither provided nor auto-detected!\nPlease, download the mandatory db and provide it via either the --db parameter, a BAKTA_DB environment variable or copy it into the Bakta base directory.\nFor further information please read the readme.md')
    return db_path

check_output_path(output, force_override)

Check provided output path

Parameters:

Name Type Description Default
output string

The output directory destination path

required
force_override Bool

Whether to override existing output directories

required
Source code in src/baktfold/bakta/config.py
def check_output_path(output: str, force_override: bool) -> Path:
    """Check provided output path

    A non-existent path is created (including parents). An existing path is
    accepted if it is the current working directory; otherwise it is only
    accepted when `force_override` is set and the path is accessible and
    writable. The result is stored in the module-level `output_path`.

    Args:
        output (string): The output directory destination path
        force_override (Bool): Whether to override existing output directories

    Returns:
        Path: The resolved (absolute) output path.

    Raises:
        SystemExit: If the path cannot be created, already exists without
            `force_override`, or is not accessible / writable.
    """
    global output_path
    output_path = Path(output)
    if(not output_path.exists()):
        try:
            output_path.mkdir(parents=True, exist_ok=True)
        except Exception:  # not a bare except: Ctrl-C must not be reported as a path error
            sys.exit(f'ERROR: could not resolve or create output directory ({output})!')
    else:
        # compare resolved paths: a relative path such as '.' never equals the
        # absolute path returned by os.getcwd() unless it is resolved first
        if(output_path.resolve() == Path(os.getcwd()).resolve()):
            pass
        elif(force_override is False):
            sys.exit(f'ERROR: output path ({output_path}) already exists! Either provide a non-existent new path or force overwriting it via \'--force\'')
        elif(not os.access(str(output_path), os.X_OK)):
            sys.exit(f'ERROR: output path ({output_path}) not accessible!')
        elif(not os.access(str(output_path), os.W_OK)):
            sys.exit(f'ERROR: output path ({output_path}) not writable!')
    output_path = output_path.resolve()
    return output_path

check_readability(file_name, file_Path)

Checks if a file is readable.

Parameters:

Name Type Description Default
file_name str

The name of the file to check.

required
file_Path Path

The path to the file to check.

required

Returns:

Type Description

None.

Examples:

>>> check_readability('file', Path('path/to/file'))
None
Source code in src/baktfold/bakta/config.py
def check_readability(file_name: str, file_Path: Path):
    """
    Aborts the program if the given file cannot be read.

    Args:
      file_name (str): Human-readable label of the file, used in messages only.
      file_Path (Path): The path to the file to check.

    Returns:
      None. On an unreadable file the error is logged and sys.exit() is called.

    Examples:
      >>> check_readability('file', Path('path/to/file'))
    """
    is_readable = os.access(str(file_Path), os.R_OK)
    if(is_readable):
        return
    log.error('%s file not readable! path=%s', file_name, file_Path)
    sys.exit(f'ERROR: {file_name} file ({file_Path}) not readable!')

check_threads(args)

Checks the number of threads to use.

Parameters:

Name Type Description Default
args Namespace

The arguments passed to the program.

required

Returns:

Name Type Description
int int

The number of threads to use.

Examples:

>>> check_threads(args)
4
Source code in src/baktfold/bakta/config.py
def check_threads(args: Namespace) -> int:
    """
    Validates the requested thread count against the available CPUs.

    A request of 0 means "use all available"; negative values and values
    above the available maximum terminate the program. The result is also
    stored in the module-level `threads`.

    Args:
      args (Namespace): The arguments passed to the program.

    Returns:
      int: The number of threads to use.

    Examples:
      >>> check_threads(args)
      4
    """
    global threads
    threads = args.threads

    # os.sched_getaffinity is missing on some platforms -> fall back to the CPU count
    try:
        max_threads = len(os.sched_getaffinity(0))
        log.debug(f"max-threads={max_threads}")
    except AttributeError:
        max_threads = mp.cpu_count()
        log.debug(f"scheduler affinity not availabe! max-threads={max_threads}")

    if(threads < 0):
        log.error("wrong argument for 'threads' parameter! threads=%i", threads)
        sys.exit(f"ERROR: wrong argument ({threads}) for 'threads' parameter! Value must be larger than/equal to 0.")
    if(threads > max_threads):
        log.error("wrong argument for 'threads' parameter! More threads requested than available: requested=%i, available=%i", threads, max_threads)
        sys.exit(f"ERROR: wrong argument ({threads}) for 'threads' parameter! More threads requested ({threads}) than available ({max_threads}).")
    if(threads == 0):
        threads = max_threads
        log.debug("request max threads.")

    log.info('threads=%i', threads)
    return threads

check_tmp_path(args)

Checks the path to the temporary directory.

Parameters:

Name Type Description Default
args Namespace

The arguments passed to the program.

required

Returns:

Name Type Description
Path Path

The path to the temporary directory.

Examples:

>>> check_tmp_path(args)
Path('path/to/tmp_dir')
Source code in src/baktfold/bakta/config.py
def check_tmp_path(args: Namespace) -> Path:
    """
    Creates the temporary working directory for this run.

    Without a dedicated `tmp_dir` a fresh directory is created in the system
    default location; otherwise a fresh directory is created inside the given
    one, which must already exist. The result is stored in the module-level
    `tmp_path`.

    Args:
      args (Namespace): The arguments passed to the program.

    Returns:
      Path: The path to the temporary directory.

    Examples:
      >>> check_tmp_path(args)
      Path('path/to/tmp_dir')
    """
    global tmp_path
    if(args.tmp_dir is None):
        tmp_path = Path(tempfile.mkdtemp()).resolve()
    else:
        tmp_path = Path(args.tmp_dir)
        if(tmp_path.exists()):
            log.info('use dedicated temp dir: path=%s', tmp_path)
            tmp_path = Path(tempfile.mkdtemp(dir=str(tmp_path))).resolve()
        else:
            log.debug('dedicated temp dir does not exist! tmp-dir=%s', tmp_path)
            sys.exit(f'ERROR: dedicated temporary directory ({tmp_path}) does not exist!')
    log.info('tmp-path=%s', tmp_path)
    return tmp_path

check_user_proteins(args)

Checks the path to the user proteins file.

Parameters:

Name Type Description Default
args Namespace

The arguments passed to the program.

required

Returns:

Name Type Description
Path

The path to the user proteins file.

Examples:

>>> check_user_proteins(args)
Path('path/to/user_proteins')
Source code in src/baktfold/bakta/config.py
def check_user_proteins(args: Namespace):
    """
    Validates the optional user proteins file.

    The value is also stored in the module-level `user_proteins`.

    Args:
      args (Namespace): The arguments passed to the program.

    Returns:
      Path: The resolved path to the user proteins file, or None if no file
      was provided. Any failure during validation terminates the program.

    Examples:
      >>> check_user_proteins(args)
      Path('path/to/user_proteins')
    """
    global user_proteins
    user_proteins = args.proteins
    if(user_proteins is None):
        return None

    try:
        if(user_proteins == ''):
            raise ValueError('File path argument must be non-empty')
        resolved_path = Path(args.proteins).resolve()
        check_readability('user proteins', resolved_path)
        check_content_size('user proteins', resolved_path)
        user_proteins = resolved_path
        log.info('user-proteins=%s', user_proteins)
        return user_proteins
    except:
        # the bare except also intercepts the SystemExit raised by the check_*
        # helpers, so this generic message is what ends up being reported
        log.error('provided user proteins file not valid! path=%s', user_proteins)
        sys.exit(f'ERROR: user proteins file ({user_proteins}) not valid!')

setup(args)

Test environment and build a runtime configuration.

Source code in src/baktfold/bakta/config.py
def setup(args):
    """Test environment and build a runtime configuration.

    Copies the parsed command line arguments into this module's global
    configuration variables, validating them on the way. Every accepted value
    is logged; an invalid value is logged as an error and terminates the
    program via sys.exit().

    Args:
      args: The parsed command line arguments (attribute access is used for
        every option read below).

    Returns:
      None
    """
    # runtime configurations
    global env, threads, verbose, debug
    env['BLAST_USAGE_REPORT'] = 'false'  # prevent BLAST from contacting NCBI

    threads = check_threads(args)
    verbose = args.verbose
    log.info('verbose=%s', verbose)
    debug = args.debug
    log.info('debug=%s', debug)
    if(debug):
        verbose = True  # debug implies verbose

    # input / output path configurations
    global db_path, db_info, tmp_path, genome_path, min_sequence_length, prefix, output_path, force
    db_path = check_db_path(args)
    tmp_path = check_tmp_path(args)

    try:
        if(args.genome == ''):
            raise ValueError('File path argument must be non-empty')
        genome_path = Path(args.genome).resolve()
        check_readability('genome', genome_path)
        check_content_size('genome', genome_path)
    except:
        # NOTE: the bare except (here and in the file checks below) also intercepts
        # the SystemExit raised by the check_* helpers, so the generic message wins
        log.error('provided genome file not valid! path=%s', args.genome)
        sys.exit(f'ERROR: genome file ({args.genome}) not valid!')
    log.info('genome-path=%s', genome_path)

    # input / output configurations
    min_sequence_length = args.min_contig_length
    if(min_sequence_length <= 0):
        log.error("wrong argument for 'min-contig-length' parameter! min_contig_length=%s", min_sequence_length)
        sys.exit(f"ERROR: wrong argument ({min_sequence_length}) for 'min-contig-length' parameter! Value must be larger than 0")
    log.info('min_contig_length=%s', min_sequence_length)
    log.info('prefix=%s', prefix)  # set in main.py before global logger config
    log.info('output-path=%s', output_path)
    force = args.force
    log.info('force=%s', force)

    # organism configurations
    global genus, species, strain, plasmid, taxon
    genus = args.genus
    if(genus is not None):
        genus = genus.strip()
        if(genus == ''):
            log.error("Empty 'genus' parameter! genus=%s", genus)
            sys.exit("ERROR: empty 'genus' parameter!")
        else:
            genus = genus.capitalize()
    log.info('genus=%s', genus)
    species = args.species
    if(species is not None):
        species = species.strip()
        if(species == ''):
            log.error("Empty 'species' parameter! species=%s", species)
            sys.exit("ERROR: empty 'species' parameter!")
        else:
            species = species.lower()
    log.info('species=%s', species)
    strain = args.strain
    if(strain is not None):
        strain = strain.strip()
        if(strain == ''):
            log.error("Empty 'strain' parameter! strain=%s", strain)
            sys.exit("ERROR: empty 'strain' parameter!")
    log.info('strain=%s', strain)
    plasmid = args.plasmid
    if(plasmid is not None):
        plasmid = plasmid.strip()
        if(plasmid == ''):
            log.error("Empty 'plasmid' parameter! plasmid=%s", plasmid)
            sys.exit("ERROR: empty 'plasmid' parameter!")
        elif('plasmid' in plasmid.lower()):
            log.error("Wrong 'plasmid' parameter! plasmid=%s", plasmid)
            sys.exit("ERROR: wrong 'plasmid' parameter! The plasmid name mustn't contain the word 'plasmid'.")
        elif(PLASMID_NAME_PATTERN.fullmatch(plasmid) is None and PLASMID_UNNAMED_PATTERN.fullmatch(plasmid) is None):
            log.error("Wrong 'plasmid' name! plasmid=%s", plasmid)
            sys.exit("ERROR: wrong 'plasmid' name! Plasmid names must either be named as 'unnamed', 'unnamed1', ... or start with a lower 'p', contain only digits, dots, underscores and letters, and are limited to 20 characters in total.")
    log.info('plasmid=%s', plasmid)
    # taxon = the provided parts of "<genus> <species> <strain>", or None if none was given
    taxon = ' '.join([t for t in [genus, species, strain] if t is not None])
    if(taxon == ''):
        taxon = None

    # annotation configurations
    global complete, prodigal_tf, translation_table, keep_sequence_headers, locus, locus_tag, locus_tag_increment, gram, replicons, compliant, user_proteins, user_hmms, meta, regions
    complete = args.complete
    log.info('complete=%s', complete)
    prodigal_tf = args.prodigal_tf
    if(prodigal_tf is not None):
        try:
            if(prodigal_tf == ''):
                raise ValueError('File path argument must be non-empty')
            prodigal_tf_path = Path(args.prodigal_tf).resolve()
            check_readability('prodigal training', prodigal_tf_path)
            check_content_size('prodigal training', prodigal_tf_path)
            prodigal_tf = prodigal_tf_path
        except:
            log.error('provided prodigal training file not valid! path=%s', prodigal_tf)
            sys.exit(f'ERROR: Prodigal training file ({prodigal_tf}) not valid!')
    log.info('prodigal_tf=%s', prodigal_tf)
    translation_table = args.translation_table
    log.info('translation_table=%s', translation_table)
    gram = args.gram
    log.info('gram=%s', gram)
    compliant = args.compliant
    log.info('compliant=%s', compliant)
    if(compliant):
        min_sequence_length = 200  # compliant mode overrides the user-provided minimum
        log.info('compliant mode! min_contig_length=%s', min_sequence_length)
    meta = args.meta
    log.info('meta=%s', meta)
    locus = args.locus
    if(locus is not None):
        if(locus == ''):
            log.error("Empty 'locus' parameter! locus=%s", locus)
            sys.exit("ERROR: empty 'locus' parameter!")
        if(' ' in locus):
            log.error("Whitespace character in 'locus' parameter! locus=%s", locus)
            sys.exit(f"ERROR: whitespace character ({locus}) in 'locus' parameter!")
        if(bc.RE_INSDC_ID_PREFIX.fullmatch(locus) is None):
            log.error("Invalid 'locus' parameter! locus=%s", locus)
            sys.exit(f"ERROR: invalid 'locus' parameter ({locus})!\nLocus prefixes must contain between 1 and 20 alphanumeric or '-_' characters.")
    log.info('locus=%s', locus)
    locus_tag = args.locus_tag
    if(locus_tag is not None):
        if(locus_tag == ''):
            log.error("Empty 'locus-tag' parameter! locus-tag=%s", locus_tag)
            sys.exit("ERROR: empty 'locus-tag' parameter!")
        if(' ' in locus_tag):
            log.error("Whitespace character in 'locus-tag' parameter! locus-tag=%s", locus_tag)
            sys.exit(f"ERROR: whitespace character ({locus_tag}) in 'locus-tag' parameter!")
        if(compliant):
            # compliant mode applies the stricter INSDC locus tag pattern
            if(bc.RE_INSDC_LOCUSTAG_PREFIX.fullmatch(locus_tag) is None):
                log.error("INSDC-incompliant 'locus-tag' parameter! locus-tag=%s", locus_tag)
                sys.exit(f"ERROR: INSDC-incompliant 'locus-tag' parameter ({locus_tag})!\nINSDC Locus tag prefixes must contain between 3 and 12 alphanumeric uppercase characters and start with a letter.")
        else:
            if(bc.RE_LOCUSTAG_PREFIX.fullmatch(locus_tag) is None):
                log.error("Invalid 'locus-tag' parameter! locus-tag=%s", locus_tag)
                sys.exit(f"ERROR: invalid 'locus-tag' parameter ({locus_tag})!\nLocus tag prefixes must contain between 1 and 24 alphanumeric characters or '_.-' signs.")
    log.info('locus-tag=%s', locus_tag)
    locus_tag_increment = args.locus_tag_increment
    log.info('locus-tag-increment=%s', locus_tag_increment)
    keep_sequence_headers = args.keep_contig_headers
    log.info('keep_contig_headers=%s', keep_sequence_headers)
    replicons = args.replicons
    if(replicons is not None):
        try:
            if(replicons == ''):
                raise ValueError('File path argument must be non-empty')
            replicon_table_path = Path(args.replicons).resolve()
            check_readability('replicon table', replicon_table_path)
            check_content_size('replicon table', replicon_table_path)
            replicons = replicon_table_path
        except:
            log.error('provided replicon file not valid! path=%s', replicons)
            sys.exit(f'ERROR: replicon table file ({replicons}) not valid!')
    log.info('replicon-table=%s', replicons)
    user_proteins = check_user_proteins(args)
    user_hmms = args.hmms
    if(user_hmms is not None):
        try:
            if(user_hmms == ''):
                raise ValueError('File path argument must be non-empty')
            user_hmms_path = Path(user_hmms).resolve()
            check_readability('HMM', user_hmms_path)
            check_content_size('HMM', user_hmms_path)
            user_hmms = user_hmms_path
        except:
            log.error('provided HMM file not valid! path=%s', user_hmms)
            sys.exit(f'ERROR: HMM file ({user_hmms}) not valid!')

    regions = args.regions
    if(regions is not None):
        try:
            if(regions == ''):
                raise ValueError('File path argument must be non-empty')
            regions_path = Path(args.regions).resolve()
            check_readability('regions', regions_path)
            check_content_size('regions', regions_path)
            regions = regions_path
        except:
            log.error('provided regions file not valid! path=%s', regions)
            sys.exit(f'ERROR: regions file ({regions}) not valid!')
    log.info('regions=%s', regions)


    # workflow configurations
    global skip_trna, skip_tmrna, skip_rrna, skip_ncrna, skip_ncrna_region, skip_crispr, skip_cds, skip_pseudo, skip_sorf, skip_gap, skip_ori, skip_filter, skip_plot
    skip_trna = args.skip_trna
    log.info('skip-tRNA=%s', skip_trna)
    skip_tmrna = args.skip_tmrna
    log.info('skip-tmRNA=%s', skip_tmrna)
    skip_rrna = args.skip_rrna
    log.info('skip-rRNA=%s', skip_rrna)
    skip_ncrna = args.skip_ncrna
    log.info('skip-ncRNA=%s', skip_ncrna)
    skip_ncrna_region = args.skip_ncrna_region
    log.info('skip-ncRNA-region=%s', skip_ncrna_region)
    skip_crispr = args.skip_crispr
    log.info('skip-CRISPR=%s', skip_crispr)
    skip_cds = args.skip_cds
    log.info('skip-CDS=%s', skip_cds)
    skip_pseudo = args.skip_pseudo
    log.info('skip-pseudo=%s', skip_pseudo)
    skip_sorf = args.skip_sorf
    log.info('skip-sORF=%s', skip_sorf)
    skip_gap = args.skip_gap
    log.info('skip-gap=%s', skip_gap)
    skip_ori = args.skip_ori
    log.info('skip-ori=%s', skip_ori)
    skip_filter = args.skip_filter
    log.info('skip-filter=%s', skip_filter)
    skip_plot = args.skip_plot
    log.info('skip-plot=%s', skip_plot)

fetch_db_pscc_result(conn, uniref50_id)

Fetches the PSCC result for a given uniref50_id from a sqlite3 database.

Parameters:

Name Type Description Default
conn sqlite3.Connection

The connection to the sqlite3 database.

required
uniref50_id str

The uniref50_id to fetch the PSCC result for.

required

Returns:

Name Type Description
tuple

The PSCC result for the given uniref50_id.

Source code in src/baktfold/bakta/pstc.py
def fetch_db_pscc_result(conn: sqlite3.Connection, uniref50_id: str):
    """
    Reads the `pscc` table row that belongs to a UniRef50 id.

    Args:
      conn (sqlite3.Connection): The connection to the sqlite3 database.
      uniref50_id (str): The uniref50_id to fetch the PSCC result for.

    Returns:
      tuple: The first matching row with all columns of the `pscc` table,
      or None if no row matches.
    """
    query = 'select * from pscc where uniref50_id=?'
    cursor = conn.cursor()
    # Cursor.execute() returns the cursor itself, so the fetch can be chained
    record = cursor.execute(query, (uniref50_id,)).fetchone()
    cursor.close()
    return record

fetch_sql_description(conn, source, accession)

Fetches the product description for a given source and accession from a sqlite3 database.

Parameters:

Name Type Description Default
conn sqlite3.Connection

The connection to the sqlite3 database.

required
source str

The source of the accession.

required
accession str

The accession to fetch the description for.

required

Returns:

Name Type Description
str

The product description for the given source and accession.

Source code in src/baktfold/bakta/pstc.py
def fetch_sql_description(conn, source, accession):
    """
    Reads the product description of an accession from the table of its source.

    Args:
      conn (sqlite3.Connection): The connection to the sqlite3 database.
      source (str): The source of the accession ('swissprot', 'afdb', 'pdb' or 'cath').
      accession (str): The accession to fetch the description for.

    Returns:
      str: The `product` value of the first row whose `id` equals the
      accession, or None if the source is unknown or no row matches.
    """
    # fixed source -> table mapping; it also acts as the whitelist of table
    # names that may be interpolated into the query below
    tables_by_source = {
        'swissprot': 'swissprot',
        'afdb': 'afdbclusters',
        'pdb': 'pdb',
        'cath': 'cath',
    }
    try:
        table = tables_by_source[source]
    except KeyError:
        return None

    # cath (which can have multiple top hits for multidomain proteins) is
    # queried exactly like every other table: a single row is fetched
    hit = conn.execute(f"SELECT product FROM {table} WHERE id = ?", (accession,)).fetchone()
    if not hit:
        return None
    return hit[0]

fetch_sql_description_threadsafe(db_path, source, accession)

Opens a new database connection for every call so that lookups of two CATH accessions (for multi-domain proteins) do not collide.

Source code in src/baktfold/bakta/pstc.py
def fetch_sql_description_threadsafe(db_path, source, accession):
    """
    Looks up a product description using a connection of its own.

    A new connection is opened for every call and closed again afterwards, so
    that two CATH accessions (for multi domain proteins) don't collide.

    Args:
      db_path: The database location handed to sqlite3.connect().
      source (str): The source of the accession.
      accession (str): The accession to fetch the description for.

    Returns:
      Whatever fetch_sql_description() returns for the given source and accession.
    """
    import sqlite3
    connection = sqlite3.connect(db_path, uri=True, check_same_thread=False)
    try:
        return fetch_sql_description(connection, source, accession)
    finally:
        # always release the connection, even if the lookup raised
        connection.close()

lookup_custom(features, baktfold_db, custom_annotations)

Look up PSTC information from the custom database

Source code in src/baktfold/bakta/pstc.py
def lookup_custom(features: Sequence[dict], baktfold_db: Path, custom_annotations: Path):
    """Look up PSTC descriptions for hits against the custom database.

    Every PSTC entry whose source is 'custom_db' receives a 'description':
    the annotation found for its accession in `custom_annotations`, or the
    accession itself if there is none. Entries from other sources are left
    untouched. The features are modified in place.

    Args:
      features (Sequence[dict]): The features to annotate; a feature's 'pstc'
        value may be a single entry dict or a list of entry dicts.
      baktfold_db (Path): Not used by this function.
      custom_annotations (Path): Tab-separated file with the accession in the
        first and the description in the second column. May be None/empty, in
        which case every custom hit is described by its accession.

    Returns:
      Sequence[dict]: The same `features` object that was passed in.
    """
    # accession -> description; initialised up front so that the lookup below
    # also works when no annotation file was provided
    custom_dict = {}
    if custom_annotations:
        with open(custom_annotations, "r") as f:
            reader = csv.reader(f, delimiter="\t")
            for row in reader:
                if len(row) >= 2:  # skip rows without a description column
                    custom_dict[row[0]] = row[1]

    for feat in features:
        pstc = feat.get('pstc')
        if not pstc:
            continue

        # Normalize to list for consistent handling
        pstc_entries = pstc if isinstance(pstc, list) else [pstc]

        for entry in pstc_entries:
            if entry.get('source') == 'custom_db':
                accession = entry.get('id')
                # mark as accession if no annotation given for custom for now
                entry['description'] = custom_dict.get(accession, accession)

        # Write back normalized list or single entry
        feat['pstc'] = pstc_entries if isinstance(pstc, list) else pstc_entries[0]

    return features

lookup_sql(features, baktfold_db, threads)

Lookup PSTC information

Source code in src/baktfold/bakta/pstc.py
def lookup_sql(features: Sequence[dict], baktfold_db: Path, threads: int):
    """Lookup PSTC information.

    For every PSTC entry of every feature, the description is fetched from
    ``baktfold.db`` inside ``baktfold_db`` via
    ``fetch_sql_description_threadsafe`` and stored under ``'description'``.
    When no description is returned, ``custom_db`` entries keep their own
    accession and all other entries become ``"hypothetical protein"``.

    Args:
        features: Feature dicts; a feature's ``'pstc'`` value may be a single
            entry dict or a list of entry dicts. Entries are updated in place.
        baktfold_db: Directory containing ``baktfold.db``.
        threads: Requested worker count; at least 10 workers are always used.

    Returns:
        The same ``features`` object that was passed in.
    """
    logger.info("Looking up PSTC descriptions")
    db_path = baktfold_db.joinpath('baktfold.db')

    # (entry, future) pairs for all features. Submitting everything before
    # waiting on any result lets the pool overlap the lookups; waiting per
    # feature would keep at most one feature's entries in flight.
    rec_futures = []
    with ThreadPoolExecutor(max_workers=max(10, threads)) as tpe:  # use min 10 threads for IO bound non-CPU lookups
        for feat in features:
            pstc = feat.get('pstc')
            if not pstc:
                continue

            # Normalize to list for consistent handling. The entry dicts are
            # mutated in place, so nothing has to be written back to the feature.
            pstc_entries = pstc if isinstance(pstc, list) else [pstc]

            for entry in pstc_entries:
                # submit database query as a future
                future = tpe.submit(
                    fetch_sql_description_threadsafe,
                    db_path,
                    entry.get('source'),
                    entry.get('id'),
                )
                rec_futures.append((entry, future))

        # Collect results
        for entry, future in rec_futures:
            desc = future.result()
            if desc:
                entry['description'] = desc
            elif entry.get('source') == 'custom_db':
                # keep accession if custom_db but missing; read it from the
                # entry itself rather than from a leftover loop variable
                entry['description'] = entry.get('id')
            else:
                entry['description'] = "hypothetical protein"

    return features

parse(features, foldseek_df, db_name='swissprot', has_duplicate_locus=False)

Update CDS in place with PSTC hits from foldseek_df if they pass filters.

has_duplicate_locus - some euks have multiple CDS per locus tag

Source code in src/baktfold/bakta/pstc.py
def parse(features: Sequence[dict], foldseek_df: pd.DataFrame, db_name: str = 'swissprot', has_duplicate_locus: bool = False) -> Sequence[dict]:
    """Update CDS in place with PSTC hits from foldseek_df if they pass filters.

    A hit is kept when its query coverage, target coverage and identity reach
    ``bc.MIN_PSTC_QCOVERAGE``, ``bc.MIN_PSTC_TCOVERAGE`` and
    ``bc.MIN_PSTC_IDENTITY``. E-value and bitscore are recorded on the hit
    but are not used for filtering here.

    Args:
        features: CDS dicts, matched to hits through ``'locus'`` (or ``'id'``,
            see ``has_duplicate_locus``).
        foldseek_df: Foldseek hits; the columns read are ``query``, ``target``,
            ``qCov``, ``tCov``, ``fident``, ``evalue`` and ``bitscore``.
        db_name: Database the hits come from. It is stored as the hit's
            ``source`` and controls how the accession is cut out of ``target``.
        has_duplicate_locus: some euks have multiple CDS per locus tag; when
            True the CDS ``'id'`` is matched against ``query`` instead of ``'locus'``.

    Returns:
        The same ``features`` object that was passed in.
    """
    if foldseek_df.empty:
        return features

    # each query maps to a list of rows (to handle multiple CATH greedy tophits for multidomain proteins)
    foldseek_hits = defaultdict(list)
    for _, row in foldseek_df.iterrows():
        foldseek_hits[row['query']].append(row)

    id_key = 'id' if has_duplicate_locus else 'locus'
    updated_count = 0

    for cds in features:
        aa_identifier = cds.get(id_key)
        # Membership test first: indexing a defaultdict would insert empty lists.
        if aa_identifier not in foldseek_hits:
            continue  # no hits, skip

        cds_updated = False

        # Iterate over *all* hits for this query
        for row in foldseek_hits[aa_identifier]:
            query_cov = float(row['qCov'])
            subject_cov = float(row['tCov'])
            identity = float(row['fident'])
            evalue = float(row['evalue'])
            bitscore = float(row['bitscore'])
            target_id = row['target']

            # Extract accession depending on database
            if db_name in {"swissprot", "afdb"}:
                accession = target_id.split('-')[1]
            elif db_name == "pdb":
                accession = target_id.split('-')[0]
            else:  # cath and custom
                accession = target_id

            passes_filters = (
                query_cov >= bc.MIN_PSTC_QCOVERAGE
                and subject_cov >= bc.MIN_PSTC_TCOVERAGE
                and identity >= bc.MIN_PSTC_IDENTITY
            )
            if not passes_filters:
                continue

            new_pstc = {
                'source': db_name,
                'id': accession,
                'query_cov': query_cov,
                'subject_cov': subject_cov,
                'identity': identity,
                'score': bitscore,
                'evalue': evalue,
            }

            # Append or initialize 'pstc'
            existing = cds.get('pstc')
            if isinstance(existing, dict):
                cds['pstc'] = [existing, new_pstc]
            elif isinstance(existing, list):
                existing.append(new_pstc)
            else:
                cds['pstc'] = [new_pstc]  # ensure list, since we may have many hits

            cds_updated = True

        # Increment only once per CDS that had at least one valid hit (CATH might have multiple)
        if cds_updated:
            updated_count += 1

    logger.info(f"PSTC for {db_name} updated in place for {updated_count} CDSs")
    return features

Originally taken from Michael Hall's tbpore https://github.com/mbhall88/tbpore/blob/main/tbpore/external_tools.py

Also used by a variety of other tools (Dnaapler, Plassembler, Pharokka)

ExternalTool

Class for running external tools.

Parameters:

Name Type Description Default
tool str

The path to the tool to run.

required
input str

The input file.

required
output str

The output file.

required
params str

The parameters to pass to the tool.

required
logdir Path

The directory to store log files.

required

Attributes:

Name Type Description
command List[str]

The command to run.

out_log str

The path to the stdout log file.

err_log str

The path to the stderr log file.

Examples:

>>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
>>> tool.command
["tool", "params", "output", "input"]
>>> tool.out_log
"logdir/tool_1234567890abcdef1234567890abcdef.out"
>>> tool.err_log
"logdir/tool_1234567890abcdef1234567890abcdef.err"
Source code in src/baktfold/utils/external_tools.py
class ExternalTool:
    """
    Class for running external tools.

    Args:
      tool (str): The path to the tool to run.
      input (str): The input file.
      output (str): The output file.
      params (str): The parameters to pass to the tool.
      logdir (Path): The directory to store log files.
      env (Optional[Dict[str, str]]): Extra env vars merged with
        os.environ for the subprocess. None == inherit unchanged.

    Attributes:
      command (List[str]): The command to run.
      out_log (str): The path to the stdout log file.
      err_log (str): The path to the stderr log file.
      env (Optional[Dict[str, str]]): Extra subprocess env vars.

    Examples:
      >>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
      >>> tool.command
      ['tool', 'params', 'output', 'input']
      >>> tool.out_log
      'logdir/tool_<sha256 hex digest of the command>.out'
      >>> tool.err_log
      'logdir/tool_<sha256 hex digest of the command>.err'
    """
    def __init__(
        self,
        tool: str,
        input: str,
        output: str,
        params: str,
        logdir: Path,
        env: Optional[Dict[str, str]] = None,
    ):
        """
        Initializes an ExternalTool object.

        Builds the command, creates ``logdir`` if needed and derives the two
        log file names from the tool's file name plus the SHA-256 hex digest
        of the command string.

        Args:
          tool (str): The path to the tool to run.
          input (str): The input file.
          output (str): The output file.
          params (str): The parameters to pass to the tool.
          logdir (Path): The directory to store log files.
          env (Optional[Dict[str, str]]): Extra env vars merged with
            os.environ for the subprocess (e.g. CUDA_VISIBLE_DEVICES
            for multi-GPU foldseek). None == inherit unchanged.
        """
        logdir = Path(logdir)
        self.command: List[str] = self._build_command(tool, input, output, params)
        logdir.mkdir(parents=True, exist_ok=True)
        command_hash = hashlib.sha256(self.command_as_str.encode("utf-8")).hexdigest()
        tool_name = Path(tool).name
        logfile_prefix: Path = logdir / f"{tool_name}_{command_hash}"
        self.out_log = f"{logfile_prefix}.out"
        self.err_log = f"{logfile_prefix}.err"
        self.env = env

    @property
    def command_as_str(self) -> str:
        """
        Returns the command as a string.

        Returns:
          str: The command tokens joined with ``shlex.join``.

        Examples:
          >>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
          >>> tool.command_as_str
          'tool params output input'
        """
        return shlex.join(self.command)

    @staticmethod
    def _build_command(tool: str, input: str, output: str, params: str) -> List[str]:
        """
        Builds the command to run.

        The four strings are joined with single spaces in the order
        tool, params, output, input and the result is tokenized with
        ``shlex.split``.

        Args:
          tool (str): The path to the tool to run.
          input (str): The input file.
          output (str): The output file.
          params (str): The parameters to pass to the tool.

        Returns:
          List[str]: The command to run.

        Examples:
          >>> ExternalTool._build_command("tool", "input", "output", "params")
          ['tool', 'params', 'output', 'input']
        """
        # note: shlex.join does not allow us to shlex.split() later
        # this is explicitly a " ".join()
        command = " ".join([tool, params, output, input])
        escaped_command = shlex.split(command)
        return escaped_command

    @staticmethod
    def _merge_env(env: Optional[Dict[str, str]]) -> Optional[Dict[str, str]]:
        """Return os.environ overlaid with ``env``, or None (inherit) when ``env`` is empty/None."""
        return {**os.environ, **env} if env else None

    def run(self) -> None:
        """
        Runs the tool, sending its stdout and stderr to the two log files.

        The command line (and the extra env vars, if any) is written at the
        top of the stderr log.

        Raises:
          subprocess.CalledProcessError: If the tool exits with a non-zero code.
        """
        with open(self.out_log, "w") as stdout_fh, open(self.err_log, "w") as stderr_fh:
            print(f"Command line: {self.command_as_str}", file=stderr_fh)
            if self.env:
                print(f"Extra env: {self.env}", file=stderr_fh)
            logger.info(f"Started running {self.command_as_str} ...")
            self._run_core(
                self.command,
                stdout_fh=stdout_fh,
                stderr_fh=stderr_fh,
                env=self.env,
            )
            logger.info(f"Done running {self.command_as_str}")

    # stream to terminal (aria2c) so the user knows how long it is taking
    def run_stream(self) -> None:
        """
        Runs the tool and streams the output to the terminal.

        stderr is merged into stdout; every line is printed and also written
        to the stdout log. The stderr log only receives the command line
        (and the extra env vars, if any).

        Raises:
          subprocess.CalledProcessError: If the tool exits with a non-zero code.
        """
        with open(self.out_log, "w") as stdout_fh, open(self.err_log, "w") as stderr_fh:
            print(f"Command line: {self.command_as_str}", file=stderr_fh)
            if self.env:
                print(f"Extra env: {self.env}", file=stderr_fh)
            logger.info(f"Started running {self.command_as_str} ...")

            # Honour self.env exactly like run() does; the context manager
            # closes the pipe even if printing/writing a line fails.
            with subprocess.Popen(
                self.command,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                bufsize=1,
                universal_newlines=True,
                env=self._merge_env(self.env),
            ) as process:
                for line in process.stdout:
                    print(line, end="")         # Live output to terminal
                    stdout_fh.write(line)       # Also write to stdout log
                return_code = process.wait()

            logger.info(f"Done running {self.command_as_str}")

            if return_code != 0:
                raise subprocess.CalledProcessError(return_code, self.command)

    @staticmethod
    def _run_core(
        command: List[str],
        stdout_fh,
        stderr_fh,
        env: Optional[Dict[str, str]] = None,
    ) -> None:
        """
        Runs the command with ``subprocess.check_call``.

        Args:
          command (List[str]): The command to run.
          stdout_fh: The file handle to write stdout to.
          stderr_fh: The file handle to write stderr to.
          env (Optional[Dict[str, str]]): Extra env vars merged with
            os.environ for the subprocess. None == inherit.

        Raises:
          subprocess.CalledProcessError: If the command exits with a non-zero code.
        """
        subprocess.check_call(
            command,
            stdout=stdout_fh,
            stderr=stderr_fh,
            env=ExternalTool._merge_env(env),
        )

    @staticmethod
    def _exit_on_failure(
        tool: "ExternalTool",
        error: subprocess.CalledProcessError,
        ctx: Optional[click.Context] = None,
    ) -> None:
        """Log a failed tool call, point at its log files and exit with status 1."""
        logger.error(
            f"Error calling {tool.command_as_str} (return code {error.returncode})"
        )
        logger.error(f"Please check stdout log file: {tool.out_log}")
        logger.error(f"Please check stderr log file: {tool.err_log}")
        logger.error("Temporary files are preserved for debugging")
        logger.error("Exiting...")

        if ctx:
            ctx.exit(1)
        else:
            sys.exit(1)

    @staticmethod
    def run_tools(
        tools_to_run: Tuple["ExternalTool", ...], ctx: Optional[click.Context] = None
    ) -> None:
        """
        Runs a list of tools one after another.

        A failing tool is logged and the program exits with status 1 (through
        ``ctx.exit`` when a click context is given, ``sys.exit`` otherwise).

        Args:
          tools_to_run (Tuple[ExternalTool]): The list of tools to run.
          ctx (Optional[click.Context]): The click context.

        Examples:
          >>> tool1 = ExternalTool("tool1", "input1", "output1", "params1", "logdir")
          >>> tool2 = ExternalTool("tool2", "input2", "output2", "params2", "logdir")
          >>> ExternalTool.run_tools((tool1, tool2))
        """
        for tool in tools_to_run:
            try:
                tool.run()
            except subprocess.CalledProcessError as error:
                ExternalTool._exit_on_failure(tool, error, ctx)

    # Only one tool
    @staticmethod
    def run_tool(tool: "ExternalTool", ctx: Optional[click.Context] = None) -> None:
        """
        Runs the given external tool.

        A failing tool is logged and the program exits with status 1 (through
        ``ctx.exit`` when a click context is given, ``sys.exit`` otherwise).

        Args:
          tool (ExternalTool): The external tool to run.
          ctx (Optional[click.Context]): The click context to use. Defaults to None.

        Returns:
          None.

        Examples:
          >>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
          >>> ExternalTool.run_tool(tool)
        """
        try:
            tool.run()
        except subprocess.CalledProcessError as error:
            ExternalTool._exit_on_failure(tool, error, ctx)

    # Only download - so can print the aria2c output to screen
    @staticmethod
    def run_download(tool: "ExternalTool", ctx: Optional[click.Context] = None) -> None:
        """
        Runs the given external tool and prints the aria2c output to the screen.

        A failing tool is logged and the program exits with status 1 (through
        ``ctx.exit`` when a click context is given, ``sys.exit`` otherwise).

        Args:
          tool (ExternalTool): The external tool to run.
          ctx (Optional[click.Context]): The click context to use. Defaults to None.

        Returns:
          None.

        Examples:
          >>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
          >>> ExternalTool.run_download(tool)
        """
        try:
            tool.run_stream()
        except subprocess.CalledProcessError as error:
            ExternalTool._exit_on_failure(tool, error, ctx)

command_as_str: str property

Returns the command as a string.

Returns:

Name Type Description
str str

The command as a string.

Examples:

>>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
>>> tool.command_as_str
"tool params output input"

__init__(tool, input, output, params, logdir, env=None)

Initializes an ExternalTool object.

Parameters:

Name Type Description Default
tool str

The path to the tool to run.

required
input str

The input file.

required
output str

The output file.

required
params str

The parameters to pass to the tool.

required
logdir Path

The directory to store log files.

required
env Optional[Dict[str, str]]

Extra env vars merged with os.environ for the subprocess (e.g. CUDA_VISIBLE_DEVICES for multi-GPU foldseek). None == inherit unchanged.

None

Attributes:

Name Type Description
command List[str]

The command to run.

out_log str

The path to the stdout log file.

err_log str

The path to the stderr log file.

env Optional[Dict[str, str]]

Extra subprocess env vars.

Examples:

>>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
>>> tool.command
["tool", "params", "output", "input"]
>>> tool.out_log
"logdir/tool_1234567890abcdef1234567890abcdef.out"
>>> tool.err_log
"logdir/tool_1234567890abcdef1234567890abcdef.err"
Source code in src/baktfold/utils/external_tools.py
def __init__(
    self,
    tool: str,
    input: str,
    output: str,
    params: str,
    logdir: Path,
    env: Optional[Dict[str, str]] = None,
):
    """
    Set up the command, the log directory and the log file names.

    Args:
      tool (str): The path to the tool to run.
      input (str): The input file.
      output (str): The output file.
      params (str): The parameters to pass to the tool.
      logdir (Path): The directory to store log files (created if missing).
      env (Optional[Dict[str, str]]): Extra env vars merged with
        os.environ for the subprocess (e.g. CUDA_VISIBLE_DEVICES
        for multi-GPU foldseek). None == inherit unchanged.

    Attributes:
      command (List[str]): The command to run.
      out_log (str): The path to the stdout log file.
      err_log (str): The path to the stderr log file.
      env (Optional[Dict[str, str]]): Extra subprocess env vars.

    Examples:
      >>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
      >>> tool.command
      ['tool', 'params', 'output', 'input']
    """
    log_root = Path(logdir)
    # The command is built before the directory is touched, as before.
    self.command: List[str] = self._build_command(tool, input, output, params)
    log_root.mkdir(parents=True, exist_ok=True)

    # Log files are named <tool file name>_<sha256 of the command string>.
    digest = hashlib.sha256(self.command_as_str.encode("utf-8")).hexdigest()
    log_stem: Path = log_root / f"{Path(tool).name}_{digest}"
    self.out_log = f"{log_stem}.out"
    self.err_log = f"{log_stem}.err"
    self.env = env

run()

Runs the tool.

Examples:

>>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
>>> tool.run()
Source code in src/baktfold/utils/external_tools.py
def run(self) -> None:
    """
    Execute the command, capturing stdout and stderr in the two log files.

    The stderr log starts with the command line and, when extra env vars
    were given, a line listing them.

    Examples:
      >>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
      >>> tool.run()
    """
    with open(self.out_log, "w") as out_handle:
        with open(self.err_log, "w") as err_handle:
            print(f"Command line: {self.command_as_str}", file=err_handle)
            if self.env:
                print(f"Extra env: {self.env}", file=err_handle)

            logger.info(f"Started running {self.command_as_str} ...")
            self._run_core(
                self.command, stdout_fh=out_handle, stderr_fh=err_handle, env=self.env
            )
            logger.info(f"Done running {self.command_as_str}")

run_download(tool, ctx=None) staticmethod

Runs the given external tool and prints the aria2c output to the screen.

Parameters:

Name Type Description Default
tool ExternalTool

The external tool to run.

required
ctx Optional[click.Context]

The click context to use. Defaults to None.

None

Returns:

Type Description
None

None.

Raises:

Type Description
subprocess.CalledProcessError

If there is an error calling the external tool.

Examples:

>>> tool = ExternalTool()
>>> ExternalTool.run_download(tool)
None
Source code in src/baktfold/utils/external_tools.py
@staticmethod
def run_download(tool: "ExternalTool", ctx: Optional[click.Context] = None) -> None:
    """
    Run a download tool via ``run_stream`` so the aria2c output is shown on screen.

    If the tool exits with a non-zero code, the failure and the log file
    locations are logged and the program exits with status 1 (through
    ``ctx.exit`` when a click context is given, ``sys.exit`` otherwise).

    Args:
      tool (ExternalTool): The external tool to run.
      ctx (Optional[click.Context]): The click context to use. Defaults to None.

    Returns:
      None.
    """
    try:
        tool.run_stream()
    except subprocess.CalledProcessError as error:
        failure_report = (
            f"Error calling {tool.command_as_str} (return code {error.returncode})",
            f"Please check stdout log file: {tool.out_log}",
            f"Please check stderr log file: {tool.err_log}",
            "Temporary files are preserved for debugging",
            "Exiting...",
        )
        for message in failure_report:
            logger.error(message)

        # Prefer click's exit when a context is available.
        terminate = ctx.exit if ctx else sys.exit
        terminate(1)

run_stream()

Runs the tool and streams the output to the terminal.

Examples:

>>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
>>> tool.run_stream()
Source code in src/baktfold/utils/external_tools.py
def run_stream(self) -> None:
    """
    Runs the tool and streams the output to the terminal.

    stderr is merged into stdout; every line is printed and also written to
    the stdout log. The stderr log only receives the command line (and the
    extra env vars, if any).

    Raises:
      subprocess.CalledProcessError: If the tool exits with a non-zero code.

    Examples:
      >>> tool = ExternalTool("tool", "input", "output", "params", "logdir")
      >>> tool.run_stream()
    """
    # Extra env vars are applied here as well, so streaming runs see the
    # same environment a plain run() would. None == inherit unchanged.
    child_env = {**os.environ, **self.env} if self.env else None

    with open(self.out_log, "w") as stdout_fh, open(self.err_log, "w") as stderr_fh:
        print(f"Command line: {self.command_as_str}", file=stderr_fh)
        if self.env:
            print(f"Extra env: {self.env}", file=stderr_fh)
        logger.info(f"Started running {self.command_as_str} ...")

        # The context manager closes the pipe even if handling a line fails.
        with subprocess.Popen(
            self.command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            bufsize=1,
            universal_newlines=True,
            env=child_env,
        ) as process:
            for line in process.stdout:
                print(line, end="")         # Live output to terminal
                stdout_fh.write(line)       # Also write to stdout log
            return_code = process.wait()

        logger.info(f"Done running {self.command_as_str}")

        if return_code != 0:
            raise subprocess.CalledProcessError(return_code, self.command)

run_tool(tool, ctx=None) staticmethod

Runs the given external tool.

Parameters:

Name Type Description Default
tool ExternalTool

The external tool to run.

required
ctx Optional[click.Context]

The click context to use. Defaults to None.

None

Returns:

Type Description
None

None.

Raises:

Type Description
subprocess.CalledProcessError

If there is an error calling the external tool.

Examples:

>>> tool = ExternalTool()
>>> ExternalTool.run_tool(tool)
None
Source code in src/baktfold/utils/external_tools.py
@staticmethod
def run_tool(tool: "ExternalTool", ctx: Optional[click.Context] = None) -> None:
    """
    Run a single external tool via ``run``.

    If the tool exits with a non-zero code, the failure and the log file
    locations are logged and the program exits with status 1 (through
    ``ctx.exit`` when a click context is given, ``sys.exit`` otherwise).

    Args:
      tool (ExternalTool): The external tool to run.
      ctx (Optional[click.Context]): The click context to use. Defaults to None.

    Returns:
      None.
    """
    try:
        tool.run()
    except subprocess.CalledProcessError as error:
        failure_report = (
            f"Error calling {tool.command_as_str} (return code {error.returncode})",
            f"Please check stdout log file: {tool.out_log}",
            f"Please check stderr log file: {tool.err_log}",
            "Temporary files are preserved for debugging",
            "Exiting...",
        )
        for message in failure_report:
            logger.error(message)

        # Prefer click's exit when a context is available.
        terminate = ctx.exit if ctx else sys.exit
        terminate(1)

run_tools(tools_to_run, ctx=None) staticmethod

Runs a list of tools.

Parameters:

Name Type Description Default
tools_to_run Tuple[ExternalTool]

The list of tools to run.

required
ctx Optional[click.Context]

The click context.

None

Examples:

>>> tool1 = ExternalTool("tool1", "input1", "output1", "params1", "logdir")
>>> tool2 = ExternalTool("tool2", "input2", "output2", "params2", "logdir")
>>> ExternalTool.run_tools((tool1, tool2))
>>> ExternalTool.run_tools((tool1, tool2), ctx)
Source code in src/baktfold/utils/external_tools.py
@staticmethod
def run_tools(
    tools_to_run: Tuple["ExternalTool", ...], ctx: Optional[click.Context] = None
) -> None:
    """
    Run several tools in the given order.

    If a tool exits with a non-zero code, the failure and the log file
    locations are logged and the program exits with status 1 (through
    ``ctx.exit`` when a click context is given, ``sys.exit`` otherwise).

    Args:
      tools_to_run (Tuple[ExternalTool]): The list of tools to run.
      ctx (Optional[click.Context]): The click context.

    Examples:
      >>> tool1 = ExternalTool("tool1", "input1", "output1", "params1", "logdir")
      >>> tool2 = ExternalTool("tool2", "input2", "output2", "params2", "logdir")
      >>> ExternalTool.run_tools((tool1, tool2))
      >>> ExternalTool.run_tools((tool1, tool2), ctx)
    """
    for current in tools_to_run:
        try:
            current.run()
        except subprocess.CalledProcessError as error:
            failure_report = (
                f"Error calling {current.command_as_str} (return code {error.returncode})",
                f"Please check stdout log file: {current.out_log}",
                f"Please check stderr log file: {current.err_log}",
                "Temporary files are preserved for debugging",
                "Exiting...",
            )
            for message in failure_report:
                logger.error(message)

            # Prefer click's exit when a context is available.
            terminate = ctx.exit if ctx else sys.exit
            terminate(1)

log_fmt = '[<green>{time:YYYY-MM-DD HH:mm:ss}</green>] <level>{level: <8}</level> | <level>{message}</level>' module-attribute

begin and end functions

OrderedCommands

Bases: click.Group

This class will preserve the order of subcommands, which is useful when printing --help

Source code in src/baktfold/utils/util.py
class OrderedCommands(click.Group):
    """Click group that lists its subcommands in insertion order (useful when printing --help)."""

    def list_commands(self, ctx: click.Context):
        """
        List the subcommand names in the order they were added.

        Args:
          ctx (click.Context): The click context (not used here).

        Returns:
          list: The keys of ``self.commands``, in iteration order.
        """
        return [name for name in self.commands]

list_commands(ctx)

Returns a list of subcommands in the order they were added.

Parameters:

Name Type Description Default
ctx click.Context

The click context.

required

Returns:

Name Type Description
list

A list of subcommands in the order they were added.

Source code in src/baktfold/utils/util.py
def list_commands(self, ctx: click.Context):
    """
    List the subcommand names in the order they were added.

    Args:
      ctx (click.Context): The click context (not used here).

    Returns:
      list: The keys of ``self.commands``, in iteration order.
    """
    return [*self.commands]

baktfold_base(rel_path)

Returns the absolute path to the given relative path.

Parameters:

Name Type Description Default
rel_path str

The relative path to the file.

required

Returns:

Name Type Description
str

The absolute path to the file.

Source code in src/baktfold/utils/util.py
def baktfold_base(rel_path):
    """
    Resolve a path relative to the directory that contains this module.

    Args:
      rel_path (str): The relative path to the file.

    Returns:
      str: ``rel_path`` joined onto the directory of this module's real
      (symlink-resolved) file location.
    """
    module_file = os.path.realpath(__file__)
    module_dir = os.path.dirname(module_file)
    return os.path.join(module_dir, rel_path)

begin_baktfold(params, subcommand, no_log=False)

Begin baktfold process.

Parameters:

Name Type Description Default
params Dict[str, Any]

A dictionary of parameters for baktfold.

required
subcommand str

Subcommand indicating the baktfold operation.

required
no_log bool

No log file

False

Returns:

Name Type Description
int int

Start time of the baktfold process.

Source code in src/baktfold/utils/util.py
def begin_baktfold(params: Dict[str, Any], subcommand: str, no_log: bool = False) -> float:
    """
    Begin baktfold process.

    Records the start time, sets up logging, prints the splash screen and
    logs the version, subcommand and all parameters.

    Parameters:
        params (Dict[str, Any]): A dictionary of parameters for baktfold.
            ``params["--output"]`` is read as the directory for the log file
            unless ``no_log`` is set.
        subcommand (str): Subcommand indicating the baktfold operation.
        no_log (bool): No log file

    Returns:
        float: Start time of the baktfold process, as returned by ``time.time()``.
    """
    # get start time
    start_time = time.time()

    cfg.run_start = datetime.now()

    # initial logging stuff
    if not no_log:
        log_file = os.path.join(params["--output"], f"baktfold_{subcommand}_{start_time}.log")
        # adds log file
        logger.add(log_file)
    # Extra sink registered at level ERROR whose only action is sys.exit(1)
    # NOTE(review): presumably this makes every logger.error() call terminate the run - confirm
    logger.add(lambda _: sys.exit(1), level="ERROR")

    print_splash()
    logger.info("baktfold: rapid & standardized annotation of bacterial genomes, MAGs & plasmids using protein structural information")

    logger.info(f"You are using baktfold version {get_version()}")
    logger.info("Repository homepage is https://github.com/gbouras13/baktfold")
    logger.info(f"You are running baktfold {subcommand}")
    logger.info("Listing parameters")
    for key, value in params.items():
        logger.info(f"Parameter: {key} {value}")

    return start_time

clean_up_temporary_files(output, prefix)

Clean up temporary files generated during the baktfold process.

Parameters:

Name Type Description Default
output Path

Path to the output directory.

required
prefix str

Prefix of the amino-acid FASTA file to remove (`{prefix}_aa.fasta`).

required

Returns:

Type Description
None

None

Source code in src/baktfold/utils/util.py
def clean_up_temporary_files(output: Path, prefix: str) -> None:
    """
    Remove the intermediate directories and files of a baktfold run.

    Parameters:
        output (Path): Path to the output directory.
        prefix (str): Prefix of the amino-acid FASTA file (``{prefix}_aa.fasta``).

    Returns:
        None
    """
    out_dir = Path(output)

    # Directories go first, then files, in the same order as before.
    temp_directories = ("result_db", "temp_db", "foldseek_db")
    temp_files = (
        f"{prefix}_aa.fasta",
        "foldseek_results_swissprot.tsv",
        "foldseek_results_afdb_clusters.tsv",
        "foldseek_results_pdb.tsv",
        "foldseek_results_custom.tsv",
        "foldseek_results_cath.tsv",
    )

    for directory_name in temp_directories:
        remove_directory(out_dir / directory_name)

    for file_name in temp_files:
        remove_file(out_dir / file_name)

echo_click(msg, log=None)

Prints a message to stdout and optionally to a log file.

Parameters:

Name Type Description Default
msg str

The message to print.

required
log str

The path to the log file.

None

Returns:

Type Description

None

Source code in src/baktfold/utils/util.py
def echo_click(msg, log=None):
    """
    Echo a message through click and, if a log path is given, append it to that file.

    The message is sent to stderr (``err=True``) and no trailing newline is added (``nl=False``).

    Args:
      msg (str): The message to print.
      log (str): The path to the log file. Nothing is written when it is falsy.

    Returns:
      None
    """
    click.echo(msg, nl=False, err=True)
    if not log:
        return
    # append mode: earlier log content is kept
    with open(log, "a") as log_handle:
        log_handle.write(msg)

end_baktfold(start_time, subcommand)

Finish baktfold process and log elapsed time.

Parameters:

Name Type Description Default
start_time float

Start time of the process.

required
subcommand str

Subcommand name indicating the baktfold operation.

required

Returns:

Type Description
None

None

Source code in src/baktfold/utils/util.py
def end_baktfold(start_time: float, subcommand: str) -> None:
    """
    Log the closing messages of a baktfold run, including how long it took.

    Also stores the current time in ``cfg.run_end``.

    Parameters:
        start_time (float): Start time of the process (compared against ``time.time()``).
        subcommand (str): Subcommand name indicating the baktfold operation.

    Returns:
        None
    """

    # seconds since start_time, rounded to two decimal places
    elapsed_time = round(time.time() - start_time, 2)

    # duration between cfg.run_start and now, reported as mm:ss
    cfg.run_end = datetime.now()
    run_duration = (cfg.run_end - cfg.run_start).total_seconds()
    minutes = int(run_duration / 60)
    seconds = int(run_duration % 60)

    logger.info("If you use these results please cite Baktfold: https://github.com/gbouras13/baktfold")
    logger.info(f"baktfold {subcommand} successfully finished in {minutes:02}:{seconds:02} [mm:ss].")

    logger.info(f"baktfold {subcommand} has finished")
    logger.info(f"Elapsed time: {elapsed_time} seconds")

get_type_rank(f)

Ranks eukaryotic features so that they sort in the order gene -> mRNA -> CDS and gene -> tRNA. The ranks of the 5'UTR and 3'UTR, if present, are adjusted dynamically according to the strand.

Source code in src/baktfold/utils/util.py
def get_type_rank(f):
    """
    Return the sort rank of a eukaryotic feature: gene -> mRNA -> CDS and gene -> tRNA.

    UTR ranks depend on the strand (a missing strand counts as '+'): the 5'UTR ranks 2 on the
    '+' strand and 4 otherwise, the 3'UTR ranks 4 on the '+' strand and 2 otherwise.
    Any feature type without a rank gets 99.
    """
    feature_type = f['type']
    on_plus_strand = f.get('strand', '+') == '+'

    # UTRs sit on either side of the CDS rank (3) depending on the strand
    if feature_type == bc.FEATURE_5UTR:
        return 2 if on_plus_strand else 4
    if feature_type == bc.FEATURE_3UTR:
        return 4 if on_plus_strand else 2

    # strand-independent ranks; everything else falls back to 99
    fixed_ranks = {'gene': 0, 'mRNA': 1, 'cds': 3, 'tRNA': 6}
    return fixed_ranks.get(feature_type, 99)

get_version()

Returns the version number from the VERSION file.

Returns:

Name Type Description
str

The version number.

Source code in src/baktfold/utils/util.py
def get_version():
    """
    Read the version number from the VERSION file.

    Returns:
      str: The first line of the VERSION file, exactly as read (a trailing newline is not stripped).
    """
    version_path = baktfold_base("VERSION")
    with open(version_path, "r") as version_file:
        return version_file.readline()

print_citation()

Prints the contents of the CITATION file to stdout.

Returns:

Type Description

None

Source code in src/baktfold/utils/util.py
def print_citation():
    """
    Echo the contents of the CITATION file, one line at a time, through echo_click.

    Returns:
      None
    """
    citation_path = baktfold_base("CITATION")
    with open(citation_path, "r") as citation_file:
        for citation_line in citation_file:
            echo_click(citation_line)

print_splash()

Prints the splash screen to stdout.

Returns:

Type Description

None

Source code in src/baktfold/utils/util.py
def print_splash():
    """
    Prints the splash screen (ASCII-art "baktfold" banner) via click.echo.

    Returns:
      None
    """
    # Every backslash of the banner is written as "\\" so the literal contains no invalid
    # escape sequences (a SyntaxWarning since Python 3.12); the printed text is unchanged.
    # The leading "\b" is intentionally kept as the backspace escape.
    click.echo(
        """\b

  _           _    _    __      _     _ 
 | |         | |  | |  / _|    | |   | |
 | |__   __ _| | _| |_| |_ ___ | | __| |
 | '_ \\ / _` | |/ / __|  _/ _ \\| |/ _` |
 | |_) | (_| |   <| |_| || (_) | | (_| |
 |_.__/ \\__,_|_|\\_\\__|_| \\___/|_|\\__,_|


"""
    )

remove_directory(dir_path)

Remove a directory and all its contents if it exists.

Parameters:

Name Type Description Default
dir_path Path

Path to the directory to remove.

required

Returns:

Type Description
None

None

Source code in src/baktfold/utils/util.py
def remove_directory(dir_path: Path) -> None:
    """
    Recursively delete a directory tree; do nothing when the path does not exist.

    Parameters:
        dir_path (Path): Path to the directory to remove.

    Returns:
        None
    """
    if not dir_path.exists():
        return
    shutil.rmtree(dir_path)

remove_file(file_path)

Remove a file if it exists.

Parameters:

Name Type Description Default
file_path Path

Path to the file to remove.

required

Returns:

Type Description
None

None

Source code in src/baktfold/utils/util.py
def remove_file(file_path: Path) -> None:
    """
    Delete a file; do nothing when the path does not exist.

    Parameters:
        file_path (Path): Path to the file to remove.

    Returns:
        None
    """
    if not file_path.exists():
        return
    file_path.unlink()

replace_pipe_in_fasta(input_path)

Reads a FASTA file with Biopython, replaces '~PIPE~' with '|' in the headers, and writes the result back to the same file.

Source code in src/baktfold/utils/util.py
def replace_pipe_in_fasta(input_path):
    """
    Rewrite a FASTA file in place, turning every '~PIPE~' in the record headers back into '|'.

    Both the id and the description of each record are updated; parsing and writing
    are done with Biopython's SeqIO.
    """

    def _restore_pipes(rec):
        # swap the placeholder for a literal pipe in both header fields
        rec.id = rec.id.replace("~PIPE~", "|")
        rec.description = rec.description.replace("~PIPE~", "|")
        return rec

    # all records are loaded before writing because the input file itself is overwritten
    restored = [_restore_pipes(rec) for rec in SeqIO.parse(input_path, "fasta")]

    SeqIO.write(restored, input_path, "fasta")

sort_euk_feature_key(f)

Sorts a feature dictionary by start, locus, type rank, and stop.

Parameters:

Name Type Description Default
f dict

The feature dictionary.

required

Returns:

Name Type Description
tuple

A tuple of the sorted values.

Source code in src/baktfold/utils/util.py
def sort_euk_feature_key(f):
    """
    Build the sort key for a feature dictionary: start, locus, type rank, and stop.

    Args:
      f (dict): The feature dictionary.

    Returns:
      tuple: ``(start, 0, locus, type_rank, stop)`` for ranked features that have a locus,
        ``(start, 1, '', 99, stop)`` for everything else. Missing start/stop count as infinity.
    """
    missing_position = float('inf')
    start = f.get('start', missing_position)
    stop = f.get('stop', missing_position)
    locus = f.get('locus')
    rank = get_type_rank(f)

    if not locus or rank == 99:
        # no locus or an unranked feature type: ordered by start only
        return (start, 1, '', 99, stop)

    # within a locus: type rank comes second, stop last (e.g. to order multiple CDS)
    return (start, 0, locus, rank, stop)

touch_file(path)

Update the access and modification times of a file to the current time, creating the file if it does not exist.

Parameters:

Name Type Description Default
path Path

Path to the file.

required

Returns:

Type Description
None

None

Source code in src/baktfold/utils/util.py
def touch_file(path: Path) -> None:
    """
    Set a file's access and modification times to now, creating the file first if needed.

    Parameters:
        path (Path): Path to the file.

    Returns:
        None
    """
    # append mode creates a missing file and never truncates an existing one
    with open(path, mode="a"):
        # times=None means "use the current time"
        os.utime(path, times=None)

check_dependencies()

Checks the dependencies and versions of non Python programs (i.e. Foldseek)

Returns:

Type Description
None

None

Source code in src/baktfold/utils/validation.py
def check_dependencies() -> None:
    """
    Checks the dependencies and versions of non Python programs (i.e. Foldseek)

    Runs ``foldseek version`` and logs the reported version. The recommended build
    (reported as v10.941cd33) is logged at info level; any other version only produces
    warnings and is not treated as fatal. If the ``foldseek`` executable cannot be started,
    an error is logged and the process exits with status 1.

    Parameters:
        None

    Returns:
        None

    """
    import sys  # local import so the explicit exit below does not rely on a module-level import

    #############
    # foldseek
    #############
    try:
        process = sp.Popen(["foldseek", "version"], stdout=sp.PIPE, stderr=sp.STDOUT)
    except OSError:
        # Popen raises OSError (e.g. FileNotFoundError) when the executable cannot be launched.
        # Stop here: continuing would only fail later on the unbound ``process`` variable.
        logger.error("Foldseek not found. Please reinstall baktfold.")
        sys.exit(1)

    # stderr is merged into stdout above, so the second element is always None
    foldseek_out, _ = process.communicate()
    foldseek_version = foldseek_out.decode().strip()

    if "941cd33" in foldseek_version:
        foldseek_major_version = 10
        foldseek_minor_version = "941cd33"
        logger.info(
            f"Foldseek version found is v{foldseek_major_version}.{foldseek_minor_version}"
        )
    else:
        logger.warning(f"Foldseek version found is v{foldseek_version}")
        logger.warning("baktfold is recommended to be run with Foldseek v10.941cd33")
        logger.warning("Using a different Foldseek version is likely to work without issue, but this cannot be guaranteed.")

    logger.info("Foldseek version is ok")

check_genbank_and_prokka(filepath, euk)

Validate that an input file is a readable GenBank file and check whether it was annotated using Prokka. The function transparently supports compressed files (e.g., .gz, .bz2, .xz, .zst) via xopen.

Validation steps

• Attempts to parse the file as GenBank using Biopython.
• Logs an error and returns None if no GenBank records are found.
• Checks the COMMENT field of each record for a Prokka signature ("Annotated using prokka", case-insensitive).
• If no Prokka annotation is detected, a warning is logged but parsing continues, as the file is still valid GenBank.

Parameters

filepath (str)

Path to the GenBank or compressed GenBank file.

euk (flag)

Whether or not the input is eukaryotic (skips the Prokka check).

Returns

list[SeqRecord] or None A list of Biopython SeqRecord objects if parsing succeeds. Returns None if the file is not valid GenBank or cannot be parsed.

Source code in src/baktfold/utils/validation.py
def check_genbank_and_prokka(filepath, euk):
    """
    Parse an input file as GenBank and, for non-eukaryotic input, check whether it looks
    like Prokka output. The file is opened through `xopen`, so compressed input
    (e.g., .gz, .bz2, .xz, .zst) is handled as well.

    Validation steps:
      • Parses the file as GenBank using Biopython.
      • Logs an error if no GenBank records are found or if parsing raises an exception.
      • Unless `euk` is set, looks for the Prokka signature ("annotated using prokka",
        case-insensitive) in the COMMENT annotation of the records.
      • If no record carries the signature, two warnings are logged and the records are
        still returned.

    NOTE(review): an ERROR-level sink calling `sys.exit(1)` is registered on the logger first,
    so the error paths presumably terminate the process before `return None` is reached — confirm.

    Parameters
    ----------
    filepath : str
        Path to the GenBank or compressed GenBank file.
    euk: flag
        whether or not the input is eukaryotic (skips prokka)

    Returns
    -------
    list[SeqRecord] or None
        A list of Biopython SeqRecord objects if parsing succeeds.
        None if the file is not valid GenBank or cannot be parsed.
    """
    import io

    logger.add(lambda _: sys.exit(1), level="ERROR")

    try:
        # xopen yields a binary handle; SeqIO needs text, so wrap it instead of reading it all
        with xopen(filepath, "rb") as handle:
            text_handle = io.TextIOWrapper(handle, encoding="utf-8", errors="replace")
            records = list(SeqIO.parse(text_handle, "genbank"))

        if not records:
            logger.error(f"Input file {filepath} is not GenBank format. Please check your input")
            return None

        if not euk:
            # a single record carrying the Prokka signature is enough
            from_prokka = any(
                "annotated using prokka" in (rec.annotations.get("comment", "") or "").lower()
                for rec in records
            )
            if not from_prokka:
                logger.warning(f"Input file {filepath} does not appear to come from Prokka.")
                logger.warning("Conversion will proceed but no guarantee of success.")

    except Exception:
        logger.error(f"There was an error parsing {filepath}. Please check your input")
        return None

    return records

instantiate_dirs(output_dir, force)

Checks and instantiates the output directory.

Parameters:

Name Type Description Default
output_dir Union[str, Path]

Path to the output directory.

required
force bool

Force flag indicating whether to overwrite existing directory.

required

Returns:

Name Type Description
Path Path

Final output directory path.

Source code in src/baktfold/utils/validation.py
def instantiate_dirs(output_dir: Union[str, Path], force: bool) -> Path:
    """
    Checks and instantiates the output directory.

    With ``force`` set, an existing output directory is removed and recreated. Without it,
    an existing output directory is reported as an error.

    NOTE(review): an ERROR-level sink calling ``sys.exit(1)`` is registered on the logger first,
    so the error is expected to terminate the process — confirm.

    Parameters:
        output_dir (Union[str, Path]): Path to the output directory.
        force (bool): Force flag indicating whether to overwrite existing directory.

    Returns:
        Path: Final output directory path.
    """

    logger.add(lambda _: sys.exit(1), level="ERROR")
    logger.info(f"Checking the output directory {output_dir}")

    out_path = Path(output_dir)

    if force is True:
        if out_path.exists():
            logger.info(f"Removing {output_dir} because --force was specified")
            shutil.rmtree(output_dir)
        else:
            logger.info(
                "--force was specified even though the output directory does not already exist. Continuing"
            )
    elif out_path.exists():
        logger.error(
            "Output directory already exists and force was not specified. Please specify -f or --force to overwrite the output directory"
        )

    # instantiate outdir (only when nothing exists at that path)
    if not out_path.exists():
        out_path.mkdir(parents=True, exist_ok=True)

    # hand the path back, as the signature and the docstring promise
    return out_path

validate_outfile(outfile, force)

Checks and instantiates the output file for baktfold convert-prokka

Parameters:

Name Type Description Default
outfile Union[str, Path]

Path to the output file.

required
force bool

Force flag indicating whether to overwrite existing outfile.

required

Returns:

Name Type Description
Path Path

Final output file path.

Source code in src/baktfold/utils/validation.py
def validate_outfile(outfile: Union[str, Path], force: bool) -> Path:
    """
    Checks the output file for baktfold convert-prokka

    With ``force`` set, an existing output file is deleted. Without it, an existing output
    file is reported as an error. The file itself is never created here.

    NOTE(review): an ERROR-level sink calling ``sys.exit(1)`` is registered on the logger first,
    so the error is expected to terminate the process — confirm.

    Parameters:
        outfile (Union[str, Path]): Path to the output file.
        force (bool): Force flag indicating whether to overwrite existing outfile.

    Returns:
        Path: Final output file path.
    """

    logger.add(lambda _: sys.exit(1), level="ERROR")
    logger.info(f"Checking the output file {outfile}")

    out_path = Path(outfile)

    if force is True:
        # remove a pre-existing output file on force
        if out_path.exists():
            logger.info(f"Removing {outfile} because --force was specified")
            out_path.unlink()
        else:
            logger.info(
                f"--force was specified even though the output file {outfile} does not already exist. Continuing"
            )
    elif out_path.exists():
        logger.error(
            f"Output file {outfile} already exists and force was not specified. Please specify -f or --force to overwrite the output file"
        )

    # hand the path back, as the signature and the docstring promise
    return out_path

get_tophit(result_tsv, structures, cath=False)

Process Foldseek output to extract top hit and weighted bitscores.

Parameters:

Name Type Description Default
result_tsv Path

Path to the Foldseek result TSV file.

required
structures bool

Flag indicating whether structures have been added.

required
cath bool

Flag indicating whether this is for CATH database (all greedy besthits kept not just top)

False

Returns:

Type Description
Tuple[pd.DataFrame, pd.DataFrame]

Tuple[pd.DataFrame, pd.DataFrame]: A tuple containing two DataFrames: 1. DataFrame containing the top functions extracted from the Foldseek output. 2. DataFrame containing weighted bitscores for different functions.

Source code in src/baktfold/results/tophit.py
def get_tophit(
    result_tsv: Path,
    structures: bool,
    cath: bool = False
) -> pd.DataFrame:
    """
    Process Foldseek output to extract the top hit per query.

    Args:
        result_tsv (Path): Path to the Foldseek result TSV file (tab-separated, no header row).
        structures (bool): Flag indicating whether structures have been added. When set, the TSV
            is read with two extra trailing columns, ``alntmscore`` and ``lddt``.
        cath (bool): Flag indicating whether this is for CATH database (all greedy besthits kept not just top)

    Returns:
        pd.DataFrame: The Foldseek hits with '~PIPE~' in the query names replaced by '|'.
            When hits exist, ``qCov`` and ``tCov`` columns are added (after ``qLen`` and ``tLen``
            respectively) and, unless ``cath`` is set, only the first row per query is kept.
            When there are no hits, the empty DataFrame is returned without the coverage columns.
    """

    logger.info("Processing Foldseek output")

    col_list = [
        "query",
        "target",
        "bitscore",
        "fident",
        "evalue",
        "qStart",
        "qEnd",
        "qLen",
        "tStart",
        "tEnd",
        "tLen",
    ]
    if structures:
        col_list += ["alntmscore", "lddt"]

    foldseek_df = pd.read_csv(
        result_tsv, delimiter="\t", index_col=False, names=col_list
    )

    # replace ~PIPE~ with |
    foldseek_df["query"] = foldseek_df["query"].str.replace("~PIPE~", "|", regex=False)

    # in case the foldseek output is empty
    if foldseek_df.empty:
        logger.warning(
            "Foldseek found no hits whatsoever - please check your input if you expect hits"
        )
        return foldseek_df

    # add qcov and tcov: aligned span divided by the sequence length, rounded to 2 decimals
    # NOTE(review): the span is end - start without +1; confirm this is intended if the
    # Foldseek coordinates are inclusive
    foldseek_df["qCov"] = ((foldseek_df["qEnd"] - foldseek_df["qStart"]) / foldseek_df["qLen"]).round(2)
    foldseek_df["tCov"] = ((foldseek_df["tEnd"] - foldseek_df["tStart"]) / foldseek_df["tLen"]).round(2)

    # reorder: everything up to qLen, then qCov and the target block ending in tCov,
    # then whatever followed tLen (alntmscore / lddt when structures were used)
    moved = ["qCov", "tStart", "tEnd", "tLen", "tCov"]
    columns = list(foldseek_df.columns)
    q_len_index = columns.index("qLen")
    t_len_index = columns.index("tLen")
    leading = [col for col in columns[: q_len_index + 1] if col not in moved]
    trailing = [col for col in columns[t_len_index + 1:] if col not in moved]
    foldseek_df = foldseek_df.reindex(columns=leading + moved + trailing)

    if not cath:
        # keep only the first row per query, taken to be the top hit
        # (assumes Foldseek lists each query's hits best bitscore first — TODO confirm)
        foldseek_df = foldseek_df.drop_duplicates(subset="query", keep="first")
    # otherwise, the df will contain all greedy tophits from CATH

    return foldseek_df