Io

`move_product_to_note_if_exists(qualifiers)`

If a 'product' qualifier exists, append it to 'note' and remove 'product'.

Designed for the eukaryotic entries

Parameters

dict

Feature qualifiers dictionary (values are usually lists).

Returns

None Modifies qualifiers in place.

Source code in src/baktfold/io/insdc.py

def move_product_to_note_if_exists(qualifiers):
    """
    If a 'product' qualifier exists, append it to 'note' and remove 'product'.

    Designed for the eukaryotic entries

    Parameters
    ----------
    qualifiers : dict
        Feature qualifiers dictionary (values are usually lists).

    Returns
    -------
    None
        Modifies qualifiers in place.
    """
    product = qualifiers.get("product")
    if not product:
        return

    # Ensure note exists and is a list
    if "note" not in qualifiers:
        qualifiers["note"] = []

    if isinstance(product, list):
        qualifiers["note"].extend(product)
    else:
        qualifiers["note"].append(product)

    qualifiers.pop("product", None)

`revise_dbxref_insdc(dbxrefs)`

Remove INSDC non-compliant DbXrefs.

Source code in src/baktfold/io/insdc.py

def revise_dbxref_insdc(dbxrefs: Sequence[str]) -> Tuple[Sequence[str], Sequence[str]]:
    """Remove INSDC non-compliant DbXrefs."""
    insdc_valid_dbxrefs = [bc.DB_XREF_UNIPROTKB, bc.DB_XREF_GO, bc.DB_XREF_PFAM, bc.DB_XREF_RFAM]
    valid_dbxrefs = []
    invalid_dbxrefs = []
    for dbxref in dbxrefs:
        if(dbxref.split(':')[0] in insdc_valid_dbxrefs):
            valid_dbxrefs.append(dbxref)
        else:
            invalid_dbxrefs.append(dbxref)
    return valid_dbxrefs, invalid_dbxrefs

`revise_product_insdc(product)`

Revise product name for INSDC compliant submissions

Source code in src/baktfold/io/insdc.py

def revise_product_insdc(product: str):
    """Revise product name for INSDC compliant submissions"""

    old_product = product
    if(re.search(r'(uncharacteri[sz]ed)', product, flags=re.IGNORECASE)):  # replace putative synonyms)
        product = re.sub(r'(uncharacteri[sz]ed)', 'putative', product, flags=re.IGNORECASE)
        logger.info('fix product: replace putative synonyms. new=%s, old=%s', product, old_product)

    old_product = product
    if(product.count('(') != product.count(')')):  # remove unbalanced parentheses
        product = product.replace('(', '').replace(')', '')  # ToDo: find and replace only legend parentheses
        logger.info('fix product: remove unbalanced parantheses. new=%s, old=%s', product, old_product)

    old_product = product
    if(product.count('[') != product.count(']')):  # remove unbalanced brackets
        product = product.replace('[', '').replace(']', '')  # ToDo: find and replace only legend bracket
        logger.info('fix product: remove unbalanced brackets. new=%s, old=%s', product, old_product)

    return product

`parse_baktfold_json_for_reconstruction(input_path, euk_override=None, custom_db_override=None, fast_override=None, program_overrides=None)`

Parse a baktfold (or bakta) JSON and reconstitute every argument needed to re-run the output writers (genome -> write_bakta_outputs, proteins -> write_bakta_proteins_outputs). No database or Foldseek run is required.

Resolution order for the runtime flags that are not otherwise recoverable from feature data is: CLI override > JSON baktfold_run provenance block > inference from features > hard default.

Returns:

Name	Type	Description
`dict`		keyed by 'mode' ('genome' \| 'proteins') plus everything the
		matching writer needs.

Source code in src/baktfold/io/json_in.py

def parse_baktfold_json_for_reconstruction(
    input_path,
    euk_override=None,
    custom_db_override=None,
    fast_override=None,
    program_overrides=None,
):
    """Parse a baktfold (or bakta) JSON and reconstitute every argument needed to
    re-run the output writers (genome -> write_bakta_outputs, proteins ->
    write_bakta_proteins_outputs). No database or Foldseek run is required.

    Resolution order for the runtime flags that are not otherwise recoverable
    from feature data is: CLI override > JSON ``baktfold_run`` provenance block >
    inference from features > hard default.

    Returns:
        dict: keyed by 'mode' ('genome' | 'proteins') plus everything the
        matching writer needs.
    """
    program_overrides = program_overrides or {}

    # ---- validate & load -------------------------------------------------
    try:
        if input_path == '':
            raise ValueError('File path argument must be non-empty')
        annotation_path = Path(input_path).resolve()
        cfg.check_readability('annotation', annotation_path)
        cfg.check_content_size('annotation', annotation_path)
    except Exception as e:
        logger.error(f'ERROR: annotation file {input_path} not valid! {e}')
        sys.exit(1)

    logger.info(f'Parsing baktfold JSON for reconstruction: {annotation_path}')
    with xopen(str(annotation_path), threads=0) as fh:
        data = json.load(fh)

    if 'features' not in data:
        logger.error("Input JSON has no 'features' key - not a valid baktfold/bakta JSON.")
        sys.exit(1)

    features = data['features']
    version = data.get('version', {})
    provenance = data.get('baktfold_run', {})

    # ---- restore faithful cfg state (headers, version strings, timing) ---
    cfg.version = version.get('baktfold', cfg.version)
    cfg.db_version = version.get('baktfold_db', cfg.db_version)
    _restore_run_timing(data)

    # ---- detect genome vs proteins mode ----------------------------------
    mode = provenance.get('mode')
    if mode not in ('genome', 'proteins'):
        mode = 'genome' if 'sequences' in data else 'proteins'

    # ---- flags resolvable for both modes ---------------------------------
    custom_db = _first_set(custom_db_override, provenance.get('custom_db'), _infer_custom_db(features))
    if custom_db is None:
        custom_db = False
    # ``fast`` is NOT reliably inferable (absence of AFDB hits is ambiguous):
    # rely on provenance, else default off.
    fast = _first_set(fast_override, provenance.get('fast'))
    if fast is None:
        fast = False

    if mode == 'proteins':
        logger.info('Proteins-mode JSON detected.')
        return {
            'mode': 'proteins',
            'data': data,
            'aas': features,
            'features': features,
            'custom_db': bool(custom_db),
            'fast': bool(fast),
            'bakta_version': version,
        }

    # ---- genome mode -----------------------------------------------------
    if 'sequences' not in data:
        logger.error("Genome-mode JSON is missing the 'sequences' block; cannot reconstruct.")
        sys.exit(1)

    prokka = provenance.get('prokka')
    if prokka is None:
        prokka = 'prokka' in version
    other_genbank = provenance.get('other_genbank')
    if other_genbank is None:
        other_genbank = ('prokka' not in version) and ('bakta' not in version)

    euk = _first_set(euk_override, provenance.get('euk'), _infer_euk(features))
    if euk is None:
        euk = False

    translation_table = provenance.get('translation_table')
    if translation_table is None:
        translation_table = _coerce_translation_table((data.get('genome') or {}).get('translation_table'))

    has_duplicate_locus = provenance.get('has_duplicate_locus')
    if has_duplicate_locus is None:
        has_duplicate_locus = _detect_duplicate_locus(features)

    programs = {}
    for name, default in _DEFAULT_PROGRAMS.items():
        programs[name] = _first_set(program_overrides.get(name), provenance.get(name), default)

    # ---- rebuild features_by_sequence (preserving the JSON feature order,
    #      which is already start-sorted per sequence) and a flattened list
    #      consistent with it. Mirrors run/compare (skip discarded, <1.10.0
    #      'contig' fallback). -------------------------------------------------
    features_by_sequence = {seq['id']: [] for seq in data['sequences']}
    for feature in features:
        if 'discarded' in feature:
            continue
        seq_id = feature['sequence'] if 'sequence' in feature else feature.get('contig')  # <1.10.0 compat
        if seq_id is None:
            logger.warning(f"Feature missing 'sequence', skipping: id={feature.get('id')}")
            continue
        bucket = features_by_sequence.get(seq_id)
        if bucket is None:
            logger.warning(f"Feature references unknown sequence '{seq_id}', skipping: id={feature.get('id')}")
            continue
        bucket.append(feature)

    flattened = []
    for seq in data['sequences']:
        flattened.extend(features_by_sequence[seq['id']])

    if other_genbank:
        log_for_other_genbank_tools(
            programs['cds_program'], programs['trna_program'], programs['rrna_program'],
            programs['tmrna_program'], programs['ncrna_program'],
        )

    logger.info(
        f"Reconstruction settings: euk={euk}, custom_db={bool(custom_db)}, fast={bool(fast)}, "
        f"prokka={prokka}, other_genbank={other_genbank}, translation_table={translation_table}, "
        f"has_duplicate_locus={has_duplicate_locus}"
    )

    return {
        'mode': 'genome',
        'data': data,
        'features': flattened,
        'features_by_sequence': features_by_sequence,
        'has_duplicate_locus': bool(has_duplicate_locus),
        'translation_table': translation_table,
        'prokka': bool(prokka),
        'other_genbank': bool(other_genbank),
        'euk': bool(euk),
        'custom_db': bool(custom_db),
        'fast': bool(fast),
        'cds_program': programs['cds_program'],
        'trna_program': programs['trna_program'],
        'rrna_program': programs['rrna_program'],
        'tmrna_program': programs['tmrna_program'],
        'ncrna_program': programs['ncrna_program'],
        'bakta_version': version,
    }

`parse_json_input(input_path, faa_path, all_proteins, protein_json_flag)`

Parses genome annotations from input JSON file.

Parameters:

Name	Type	Description	Default
`input_path`	`str`	Path to input JSON file.	required
`faa_path`	`str`	Path to output file for hypothetical proteins.	required
`all_proteins`	`bool`	Whether to keep all proteins or only hypothetical ones.	required
`protein_json_flag`	`bool`	Whether input is protein JSON	required

Returns:

Name	Type	Description
`tuple`		A tuple containing the data, features, and whether there are duplicate locus tags.

Examples:

>>> parse_json_input('input.json', 'hypotheticals.faa', False, False)
(data, features, False, False)

Source code in src/baktfold/io/json_in.py

def parse_json_input(input_path, faa_path, all_proteins, protein_json_flag):
    """
    Parses genome annotations from input JSON file.

    Args:
      input_path (str): Path to input JSON file.
      faa_path (str): Path to output file for hypothetical proteins.
      all_proteins (bool): Whether to keep all proteins or only hypothetical ones.
      protein_json_flag (bool): Whether input is protein JSON

    Returns:
      tuple: A tuple containing the data, features, and whether there are duplicate locus tags.

    Examples:
      >>> parse_json_input('input.json', 'hypotheticals.faa', False, False)
      (data, features, False, False)
    """



    ############################################################################
    # Checks and configurations
    # - check parameters and setup global configuration
    # - test database
    # - test binary dependencies
    ############################################################################

    try:
        if input_path == '':
            raise ValueError('File path argument must be non-empty')
        annotation_path = Path(input_path).resolve()
        cfg.check_readability('annotation', annotation_path)
        cfg.check_content_size('annotation', annotation_path)
    except Exception as e:
        logger.error(f'ERROR: annotation file {annotation_path} not valid! {e}')
        sys.exit(1)

    #print(f'baktfold v{cfg.version}')

    logger.info(f'Parsing annotations from input: {annotation_path}')
    with xopen(str(annotation_path), threads=0) as fh:
        data = json.load(fh)


    features = data['features']

    # features_by_sequence = {seq['id']: [] for seq in data['sequences']}
    # for feature in data['features']:
    #     seq_id = feature['sequence'] if 'sequence' in feature else feature['contig']  # <1.10.0 compatibility
    #     sequence_features = features_by_sequence.get(seq_id)
    #     sequence_features.append(feature)

    # keep all proteins
    if all_proteins:
        hypotheticals = [feat for feat in features if feat['type'] == bc.FEATURE_CDS ]
    else:
        hypotheticals = [feat for feat in features if feat['type'] == bc.FEATURE_CDS and 'hypothetical' in feat]


    if protein_json_flag: # this will also be only hypotheticals if protein mode (or else why not just run with the FASTA)
        version = data.get("version", {})
        return features, hypotheticals, version


    # check if dupe locus tags (euks can have multiple CDS same locus tag e.g. Cladocopium goreaui CAMXCT020000001.1)
    seen_loci = set()
    has_duplicate_locus = False

    for feat in hypotheticals:
        locus = feat['locus']
        if locus in seen_loci:
            has_duplicate_locus = True
            logger.warning("Multiple CDS per locus tag were detected in your input JSON.")
            logger.warning("CDS id (which is unique) rather than locus tag will be used for ProstT5+Foldseek searches.")
            break
        seen_loci.add(locus)

    # this is done after getting all the sequences into the dict for baktfold proteins

    if has_duplicate_locus:
        # write hypothetical proteins to file with id (not locus) as guaranteed exists and unique
        with faa_path.open('wt') as fh:
            for feat in hypotheticals:
                fh.write(f">{feat['id']}\n{feat['aa']}\n")

    else:
        # write hypothetical proteins to file - almost always
        with faa_path.open('wt') as fh:
            for feat in hypotheticals:
                fh.write(f">{feat['locus']}\n{feat['aa']}\n")

    # none of this is relevant for proteins
    try:
        genome_block = data.get("genome")

        if genome_block is None:
            logger.error("No 'genome' block found in input JSON. Please check.")
            translation_table = None
        else:
            if "translation_table" not in genome_block:
                logger.error("No translation table found in input JSON. Please check your input.")
            else:
                raw_value = genome_block["translation_table"]

                try:
                    translation_table = int(raw_value)
                    logger.info(
                        f"Translation table {translation_table} detected from input JSON"
                    )

                except (ValueError, TypeError):
                    translation_table = str(raw_value)
                    logger.warning(
                        f"Translation table '{raw_value}' is not an integer. "
                        f"Parsing it as a string."
                    )

    except Exception as e:
        logger.exception(
            f"Unexpected error while parsing translation table: {e}"
        )
        translation_table = None

    # input detection

    version = data.get("version", {})

    prokka = False
    other_genbank = False

    if "prokka" in version:
        prokka = True
        logger.info("Prokka input detected")
    if  "prokka"  not in version and "bakta" not in version:
        other_genbank = True

    logger.info('Parsing complete')

    return data, features, has_duplicate_locus, translation_table, prokka, other_genbank, version

`build_bakta_sequence_entry(rec)`

Convert a SeqRecord into a Bakta-style sequence entry. Missing fields are filled with None.

Source code in src/baktfold/io/prokka_gbk_to_json.py

def build_bakta_sequence_entry(rec):
    """
    Convert a  SeqRecord into a Bakta-style sequence entry.
    Missing fields are filled with None.
    """

    seq = str(rec.seq)

    # -----------------------------------------
    # Extract source feature qualifiers - genbank always has source field
    # -----------------------------------------
    source_feat = next((f for f in rec.features if f.type == "source"), None)

    source_qualifiers = {}

    # Defaults (None) for all fields
    mol_type = None
    organism = None
    strain = None
    db_xref = None
    note = None

    plasmid = None
    chromosome = None
    completeness_hint = None

    if source_feat:
        q = source_feat.qualifiers

        mol_type = q.get("mol_type", [None])[0]
        organism = q.get("organism", [None])[0]
        strain = q.get("strain", [None])[0]
        note = q.get("note", [None])[0]

        if "db_xref" in q:
            val = q["db_xref"]
            db_xref = val[0] if len(val) == 1 else val

        plasmid = q.get("plasmid", [None])[0]
        chromosome = q.get("chromosome", [None])[0]
        completeness_hint = q.get("completeness", [None])[0]

    # -----------------------------------------
    # Infer topology
    # -----------------------------------------
    topology = rec.annotations.get("topology")
    if topology not in {"linear", "circular"}:
        topology = "linear"

    # -----------------------------------------
    # Infer type
    # -----------------------------------------
    if plasmid is not None or "plasmid" in rec.annotations:
        seq_type = "plasmid"
    elif chromosome is not None or "chromosome" in rec.annotations:
        seq_type = "chromosome"
    else:
        seq_type = "contig"

    # -----------------------------------------
    # Infer completeness (conservative)
    # -----------------------------------------
    complete = False

    if topology == "circular":
        complete = True
    elif completeness_hint is not None and completeness_hint.lower() == "complete":
        complete = True
    elif note and "complete genome" in note.lower():
        complete = True

    # -----------------------------------------
    # Infer genetic codefor description
    # -----------------------------------------
    gcode = None

    if "genetic_code" in rec.annotations:
        gcode = rec.annotations["genetic_code"]
    elif "gcode" in rec.annotations:
        gcode = rec.annotations["gcode"]
    elif source_feat and "transl_table" in source_feat.qualifiers:
        gcode = source_feat.qualifiers["transl_table"][0]

    # Conservative fallback to 11 for prokka
    if gcode is None:
        gcode = 11 

    description_parts = [
        f"[gcode={gcode}]",
        f"[topology={topology}]",
    ]

    description = " ".join(description_parts)

    # -----------------------------------------
    # Build entry
    # -----------------------------------------
    entry = {
        "id": rec.id,
        "description": description,
        "nt": seq,
        "length": len(seq),
        "complete": complete,
        "type": seq_type,
        "topology": topology,
        "simple_id": rec.id,
        "orig_id": rec.id,
        "orig_description": None,
    }

    # -----------------------------------------
    # Add source qualifiers if present
    # -----------------------------------------
    if organism is not None:
        entry["organism"] = organism
    if mol_type is not None:
        entry["mol_type"] = mol_type
    if strain is not None:
        entry["strain"] = strain
    if db_xref is not None:
        entry["db_xref"] = db_xref
    if note is not None:
        entry["note"] = note


    # this is from bakta
    # "id": "contig_1",
    # "description": "[gcode=11] [topology=linear]",
    # "nt": "AT"
    # "length": 5165988,
    # "complete": false,
    # "type": "contig",
    # "topology": "linear",
    # "simple_id": "contig_1",
    # "orig_id": "GCF_002368115_000000000001",
    # "orig_description": ""

    # Add source qualifiers only if they exist
    if organism is not None:
        entry["organism"] = organism

    if mol_type is not None:
        entry["mol_type"] = mol_type

    if strain is not None:
        entry["strain"] = strain

    if db_xref is not None:
        entry["db_xref"] = db_xref

    if note is not None:
        entry["note"] = note

    return entry

`calc_genome_stats(records)`

Compute correct genome stats (size, GC, N-ratio, N50, N90) for records from a multi-contig Prokka GenBank file.

Source code in src/baktfold/io/prokka_gbk_to_json.py

def calc_genome_stats(records):
    """
    Compute correct genome stats (size, GC, N-ratio, N50, N90) for records from a multi-contig
    Prokka GenBank file.
    """

    if not records:
        raise ValueError("No GenBank records found.")

    # lengths of all contigs
    contig_lengths = [len(r.seq) for r in records]
    total_length = sum(contig_lengths)

    # concatenate sequences for global GC + N calculation
    full_seq = "".join(str(r.seq) for r in records)

    # GC as fraction (Bakta wants 0–1)
    gc_perc = gc_fraction(full_seq)

    # N-ratio
    n_ratio = full_seq.count("N") / total_length

    # ---------- N50 / N90 ----------
    sorted_lengths = sorted(contig_lengths, reverse=True)

    def nx_metric(sorted_lens, total, threshold):
        """
        Generic N{threshold} function.
        threshold: 0.5 for N50, 0.9 for N90
        """
        cutoff = total * threshold
        running = 0
        for l in sorted_lens:
            running += l
            if running >= cutoff:
                return l
        return sorted_lens[-1]  # fallback (should not happen)

    n50 = nx_metric(sorted_lengths, total_length, 0.5)
    n90 = nx_metric(sorted_lengths, total_length, 0.9)

    return {
        "size": total_length,
        "gc": gc_perc,
        "n_ratio": n_ratio,
        "n50": n50,
        "n90": n90,
        "coding_ratio": None  
    }

`convert_assembly_gap_feature(feature, rec, id)`

Convert a Prokka GenBank assembly_gap feature to a simplified Bakta-style 'gap' feature.

Parameters:

Name	Type	Description	Default
`feature`		Bio.SeqFeature The assembly_gap feature from the Prokka GBK.	required
`rec`		Bio.SeqRecord The full GenBank record containing the sequence.	required

Returns:

Name	Type	Description
`dict`		Simplified Bakta-style gap feature.

Source code in src/baktfold/io/prokka_gbk_to_json.py

def convert_assembly_gap_feature(feature, rec, id):
    """
    Convert a Prokka GenBank assembly_gap feature to a simplified Bakta-style 'gap' feature.

    Parameters:
        feature: Bio.SeqFeature
            The assembly_gap feature from the Prokka GBK.
        rec: Bio.SeqRecord
            The full GenBank record containing the sequence.

    Returns:
        dict: Simplified Bakta-style gap feature.
    """

    # Coordinates (1-based)
    start = int(feature.location.start) + 1
    stop = int(feature.location.end)

    qualifiers = feature.qualifiers

    # Prokka may provide estimated_length but coordinates already give an exact span
    est_len = qualifiers.get("estimated_length", [None])[0]
    if est_len is not None:
        length = int(est_len)
    else:
        length = stop - start + 1  # fallback from coordinates

    # Bakta always uses "." for strand on gaps
    strand = "."

    gap_entry = {
        "type": "gap",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "length": length,
        "id": id
    }

    return gap_entry

`convert_cds_feature(feature, seq_record, translation_table, id)`

Convert a Prokka CDS Biopython SeqFeature to a Bakta CDS JSON entry.

Source code in src/baktfold/io/prokka_gbk_to_json.py

def convert_cds_feature(feature, seq_record, translation_table, id):
    """
    Convert a Prokka CDS Biopython SeqFeature to a Bakta CDS JSON entry.
    """

    # ----------- Location info -----------
    start = int(feature.location.start) + 1     # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)           # already one past in BioPython
    strand = "+" if feature.location.strand == 1 else "-"

    # frame: Bakta uses 1/2/3; Prokka codon_start is ["1","2","3"]
    codon_start = int(feature.qualifiers.get("codon_start", ["1"])[0])
    frame = codon_start

    # ----------- Basic qualifiers -----------
    gene = feature.qualifiers.get("gene", [None])[0]
    product = feature.qualifiers.get("product", [None])[0]

    locus_tag = feature.qualifiers.get("locus_tag", [None])[0]
    locus = locus_tag

    # ----------- Extract nucleotides -----------
    nt_seq = feature.extract(seq_record.seq)
    nt = str(nt_seq)

    # ----------- Extract amino acids -----------
    aa = feature.qualifiers.get("translation", [""])[0]

    # Compute translation if Prokka didn't provide it
    if not aa:
        try:
            aa = str(nt_seq.translate(table=translation_table, cds=True))
        except Exception:
            aa = ""

    # ----------- aa MD5 hexdigest -----------
    aa_hexdigest = hashlib.md5(aa.encode()).hexdigest()

    # ----------- Hypothetical? -----------
    hypothetical = product is None or "hypothetical protein" in product.lower()

    # ----------- Compute protein stats -----------
    seq_stats = None
    if aa:
        try:
            analysed = ProteinAnalysis(aa)
            seq_stats = {
                "molecular_weight": analysed.molecular_weight(),
                "isoelectric_point": analysed.isoelectric_point()
            }
        except Exception:
            seq_stats = None

    # ----------- Make Bakta-format dict -----------
    bakta_cds = {
        "type": "cds",
        "sequence": seq_record.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "frame": frame,
        "gene": gene,
        "product": product,
        "db_xrefs": feature.qualifiers.get("db_xref", [so.SO_CDS.id]),  # there will be no other db_xref 
        "nt": nt,
        "aa": aa,
        "aa_hexdigest": aa_hexdigest,
        "start_type": None,
        "rbs_motif": None,
        "genes": [],
        "seq_stats": seq_stats,
        "id": id,
        "locus": locus,
    }

    if hypothetical:
        bakta_cds["hypothetical"] = True

    return bakta_cds

`convert_misc_rna_feature(feature, rec, id)`

Convert a Prokka GenBank misc_RNA (nc_rna) feature to a Bakta-style feature.

Parameters:

Name	Type	Description	Default
`feature`		Bio.SeqFeature The misc_RNA feature from the Prokka GBK.	required
`rec`		Bio.SeqRecord The record containing the sequence.	required

Returns:

Name	Type	Description
`dict`		Bakta-style misc_RNA feature.

Source code in src/baktfold/io/prokka_gbk_to_json.py

def convert_misc_rna_feature(feature, rec, id):
    """
    Convert a Prokka GenBank misc_RNA (nc_rna) feature to a Bakta-style feature.

    Parameters:
        feature: Bio.SeqFeature
            The misc_RNA feature from the Prokka GBK.
        rec: Bio.SeqRecord
            The record containing the sequence.

    Returns:
        dict: Bakta-style misc_RNA feature.
    """

    seq = str(rec.seq)

    # Coordinates (GBK is 0-based, Bakta is 1-based)
    start = int(feature.location.start) + 1
    stop = int(feature.location.end)
    strand = "+" if feature.location.strand == 1 else "-"

    qualifiers = feature.qualifiers

    # Fields that Prokka may or may not include
    gene = qualifiers.get("gene", [None])[0]
    product = qualifiers.get("product", [None])[0]
    locus_tag = qualifiers.get("locus_tag", [None])[0]

    # If gene missing, use product (Bakta often fills 'gene' for sRNA/tmRNA/etc.)
    if gene is None:
        gene = product

    # Extract nucleotide sequence
    nt_seq = seq[start-1:stop]
    if strand == "-":
        comp = str.maketrans("ACGTacgt", "TGCAtgca")
        nt_seq = nt_seq.translate(comp)[::-1]

    misc_entry = {
        "type": "ncRNA", # bakta uses ncRNA -> will be misc_rna in Prokka
        "class": None,             # bakta's classes are not in Prokka
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "gene": gene,
        "product": product,
        "score": None,
        "evalue": None,
        "db_xrefs": [so.SO_NCRNA_GENE.id],        
        "nt": nt_seq,
        "id": id,
        "locus": locus_tag
    }

    return misc_entry

`convert_repeat_region_feature(feature, rec, id)`

Convert a Prokka GenBank repeat_region (CRISPR) feature to a simplified Bakta-style feature.

Parameters:

Name	Type	Description	Default
`feature`		Bio.SeqFeature The repeat_region feature (crispr) from the Prokka GBK.	required
`rec`		Bio.SeqRecord The full GenBank record containing the sequence.	required

Returns:

Name	Type	Description
`dict`		Simplified Bakta-style CRISPR feature.

Source code in src/baktfold/io/prokka_gbk_to_json.py

def convert_repeat_region_feature(feature, rec, id):
    """
    Convert a Prokka GenBank repeat_region (CRISPR) feature to a simplified Bakta-style feature.

    Parameters:
        feature: Bio.SeqFeature
            The repeat_region feature (crispr) from the Prokka GBK.
        rec: Bio.SeqRecord
            The full GenBank record containing the sequence.

    Returns:
        dict: Simplified Bakta-style CRISPR feature.
    """

    # Coordinates (Bakta uses 1-based)
    start = int(feature.location.start) + 1
    stop = int(feature.location.end)

    qualifiers = feature.qualifiers
    note = qualifiers.get("note", [None])[0]
    rpt_family = qualifiers.get("rpt_family", [None])[0]
    rpt_type = qualifiers.get("rpt_type", [None])[0]
    rpt_unit_seq = qualifiers.get("rpt_unit_seq", [None])[0]

    strand = "?"

    # always just take the positive strand to get the NT seq (crispr repeat region)
    seq =  str(rec.seq)
    nt_seq = seq[start-1:stop]

    # Minimal Bakta-like CRISPR structure
    crispr_entry = {
        "type": "crispr",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand, # matches Bakta and is required
        "family": rpt_family,       # e.g., "CRISPR"
        "rpt_type": rpt_type,       # e.g., "direct"
        "repeat_unit": rpt_unit_seq, # the actual consensus repeat
        "product": note, # won't be the same as Bakta as different lookup method used - but needed for the gff writing
        "nt": nt_seq, # needed for batka .ffn writeout
        "id": id, # bakta_id needed 
        # "locus": None, # no locus tag like Bakta
        "db_xrefs": [so.SO_CRISPR.id]
    }

    return crispr_entry

`convert_rrna_feature(feature, rec, id)`

Convert a Prokka GenBank rRNA feature to Bakta-style JSON.

Parameters:

Name	Type	Description	Default
`feature`		Bio.SeqFeature The rRNA feature from the Prokka GBK.	required
`rec`		str The record from the GBK.	required

Returns:

Name	Type	Description
`dict`		Bakta-style rRNA feature

Source code in src/baktfold/io/prokka_gbk_to_json.py

def convert_rrna_feature(feature, rec, id):
    """
    Convert a Prokka GenBank rRNA feature to Bakta-style JSON.

    Parameters:
        feature: Bio.SeqFeature
            The rRNA feature from the Prokka GBK.
        rec: str
            The record from the GBK.
    Returns:
        dict: Bakta-style rRNA feature
    """
    start = int(feature.location.start) + 1  # GBK is 0-based
    stop = int(feature.location.end)
    strand = "+" if feature.location.strand == 1 else "-"

    qualifiers = feature.qualifiers
    product = qualifiers.get("product", [None])[0]
    locus_tag = qualifiers.get("locus_tag", [None])[0]

    # Infer gene type from product if possible
    gene_map = {
        "16S ribosomal RNA": "rrs",
        "23S ribosomal RNA": "rrl",
        "5S ribosomal RNA": "rrf"
    }
    gene = gene_map.get(product, None)

    so_map = {
        "16S ribosomal RNA": so.SO_RRNA_16S.id,
        "23S ribosomal RNA": so.SO_RRNA_23S.id,
        "5S ribosomal RNA": so.SO_RRNA_5S.id
    }

    specific_so = so_map.get(product, None)

    contig_seq = str(rec.seq)

    nt_seq = contig_seq[start-1:stop]
    if strand == "-":
        comp = str.maketrans("ACGTacgt", "TGCAtgca")
        nt_seq = nt_seq.translate(comp)[::-1]

    rrna_entry = {
        "type": "rRNA",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "gene": gene,
        "product": product,
        "coverage": None,  # Prokka does not provide
        "score": None,     # Prokka does not provide
        "evalue": None,    # Prokka does not provide
        "db_xrefs": [so.SO_RRNA.id, specific_so], 
        "nt": nt_seq,
        "id": id,
        "locus": locus_tag
    }

    return rrna_entry

`convert_tmrna_feature(feature, rec, id)`

Convert a Prokka GenBank tmRNA feature to Bakta-style feature.

Parameters:

Name	Type	Description	Default
`feature`		Bio.SeqFeature The tmRNA feature from the Prokka GBK.	required
`rec`		str The record from the GBK	required

Returns:

Name	Type	Description
`dict`		Bakta-style tmRNA feature

Source code in src/baktfold/io/prokka_gbk_to_json.py

def convert_tmrna_feature(feature, rec, id):
    """
    Convert a Prokka GenBank tmRNA feature to Bakta-style feature.

    Parameters:
        feature: Bio.SeqFeature
            The tmRNA feature from the Prokka GBK.
        rec: str
            The record from the GBK

    Returns:
        dict: Bakta-style tmRNA feature
    """

    seq =  str(rec.seq)

    start = int(feature.location.start) + 1  # GBK is 0-based
    stop = int(feature.location.end)
    strand = "+" if feature.location.strand == 1 else "-"

    qualifiers = feature.qualifiers
    gene = qualifiers.get("gene", [None])[0]
    locus_tag = qualifiers.get("locus_tag", [None])[0]
    product = qualifiers.get("product", [None])[0]

    # Extract the nucleotide sequence of the tmRNA
    nt_seq = seq[start-1:stop]
    if strand == "-":
        comp = str.maketrans("ACGTacgt", "TGCAtgca")
        nt_seq = nt_seq.translate(comp)[::-1]



    tmrna_entry = {
        "type": "tmRNA",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "gene": gene,
        "product": product,
        "db_xrefs": [so.SO_TMRNA.id], 
        # "tag": tag_info,  no tag in tmrna for prokka - no information on it in the output
        "nt": nt_seq,
        "id": id,
        "locus": locus_tag
    }

    return tmrna_entry

`convert_trna_feature(feature, seq_record, id)`

Convert a Prokka tRNA SeqFeature to a Bakta tRNA JSON entry.

Source code in src/baktfold/io/prokka_gbk_to_json.py

def convert_trna_feature(feature, seq_record, id):
    """
    Convert a Prokka tRNA SeqFeature to a Bakta tRNA JSON entry.
    """

    # ------------ Location ------------
    start = int(feature.location.start) + 1
    stop  = int(feature.location.end)
    strand = "+" if feature.location.strand == 1 else "-"

    # ------------ Extract nt sequence ------------
    nt_seq = feature.extract(seq_record.seq)
    nt = str(nt_seq)

    # ------------ Basic qualifiers ------------
    product = feature.qualifiers.get("product", [None])[0]
    locus = feature.qualifiers.get("locus_tag", [None])[0]

    # ------------ amino acid ------------
    # Prokka product examples:
    #   "tRNA-Trp"
    #   "tRNA-Leu"
    amino_acid = None
    if product and product.startswith("tRNA-"):
        amino_acid = product.split("-")[1]


    # ------------ anticodon ------------
    anti_codon = None

    # anticodons are in notes

    notes = feature.qualifiers.get("note", [])

    # Expect a note like: "tRNA-Ser(gga)"
    for note in notes:
        # Remove spaces for safety
        n = note.replace(" ", "")

        # Extract part inside parentheses (anticodon)
        if "(" in n and ")" in n:
            anti_codon = n.split("(")[1].split(")")[0].lower()

        # Extract amino acid:
        # tRNA-Ser(gga) → "Ser"
        if "tRNA-" in n:
            try:
                # tRNA-Ser(gga) → "Ser(gga)" → split('(')[0] → "Ser"
                aa_section = n.split("tRNA-")[1]
                aa_clean = aa_section.split("(")[0]
                amino_acid = aa_clean
            except Exception:
                pass

    # ------------ Anti-codon position detection ------------
    # Prokka doesnt have it - dont include
    # anti_codon_pos = None

    # ------------ score ------------
    # nothing in prokka
    score = None

    # ------------ db_xrefs ------------
    # doesnt exist for prokka
    db_xrefs = feature.qualifiers.get("db_xref", [])
    # add so_term
    so_term = AMINO_ACID_DICT.get(amino_acid.lower(), ('', None))[1]

    if (so_term):
        db_xrefs.append(so_term.id)

    # ------------ final Bakta-form dict ------------
    bakta_trna = {
        "type": "tRNA",
        "sequence": seq_record.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "gene": "trn" + (amino_acid[0].lower() if amino_acid else "?"),
        "product": product,
        "amino_acid": amino_acid,
        "anti_codon": anti_codon,
        "score": score,
        "nt": nt,
        "db_xrefs": db_xrefs,
       #  "anti_codon_pos": anti_codon_pos,  dont include, not in prokka output
        "id": id,
        "locus": locus
    }

    return bakta_trna

`get_bakta_style_id_from_locus_tag(records)`

Gets 10 char bakta-style ID tag based off the 8 char locus tag in first CDS on the first Prokka record + 2 random chars

Assumes all records will have the same locus tag prefix

Will always add 2 chars to make ID unique vs locus tag

Source code in src/baktfold/io/prokka_gbk_to_json.py

def get_bakta_style_id_from_locus_tag(records):
    """
    Gets 10 char bakta-style ID tag based off the 8 char locus tag in first CDS on the first Prokka record + 2 random chars

    Assumes all records will have the same locus tag prefix

    Will always add 2 chars to make ID unique vs locus tag
    """

    if not records:
        raise ValueError("No GenBank records found.")

    for record in records:

        for feat in record.features:
            if feat.type == "CDS":
                locus_tag_list = feat.qualifiers.get("locus_tag") # returns None if doesn't exist

                if locus_tag_list:
                    locus_tag = locus_tag_list[0]

                    if len(locus_tag) > 6:

                        locus_tag_prefix = locus_tag[:-6] # trims off _00001 from CDS

                        rand_two_chars = random_n_letter_id(2)

                        # by default prokka locus tag is 8 chars. So this returns a 10 char string (same as bakta defaults)

                        id_tag = f"{locus_tag_prefix}{rand_two_chars}"

                        return id_tag


                    else:
                        return random_n_letter_id(10)

                # fallback if locus_tag missing or too short
                return random_n_letter_id(10)

    # No CDS feature found at all (shouldn't happen)
    return random_n_letter_id(10)

`get_transl_table(records)`

Gets translation table based off the first CDS on the first record

Source code in src/baktfold/io/prokka_gbk_to_json.py

def get_transl_table(records):
    """
    Gets translation table based off the first CDS on the first record
    """

    if not records:
        raise ValueError("No GenBank records found.")

    record_1 = records[0]

    for feat in record_1.features:
        if feat.type == "CDS":
            # Translation table may be string → convert to int
            transl = feat.qualifiers.get("transl_table", ["11"])[0]
            try:
                return int(transl)
            except ValueError:
                return 11

        # If no CDS found, default to 11
        return 11

`parse_prokka_version(record)`

Extract Prokka version from COMMENT field: Example COMMENT: 'Annotated using prokka 1.14.6 ...'

Source code in src/baktfold/io/prokka_gbk_to_json.py

def parse_prokka_version(record):
    """
    Extract Prokka version from COMMENT field:
    Example COMMENT:
    'Annotated using prokka 1.14.6 ...'
    """

    comments = record.annotations.get("comment", "") or record.annotations.get("comments", "")
    if not comments:
        return "unknown"

    m = re.search(r"[Pp]rokka[\s_]?v?(\d+\.\d+\.\d+)", comments)
    if m:
        return m.group(1)

    # fallback pattern
    m = re.search(r"prokka[^0-9]*(\d+\.\d+\.\d+)", comments)
    if m:
        return m.group(1)

    return "unknown"

`prokka_gbk_to_json(records, output_json)`

Convert Prokka-generated GenBank SeqRecord objects into a Bakta-style JSON annotation file.

This function takes one or more Biopython SeqRecord objects (typically parsed from a Prokka GenBank file) and reconstructs a JSON structure following the Bakta output schema. It extracts genome metadata, statistics, annotated features, nucleotide sequences, and Prokka version information. Features are converted to Bakta-compatible dictionaries and sorted in the same order Bakta expects.

The JSON file contains

• genome block – high-level organism metadata (genus, species, strain, etc.) • stats block – genome statistics derived from all records • features block – all features converted to Bakta-style objects, sorted by feature type and genomic position • sequences block – contig/sequence entries in Bakta format • run block – timestamps and duration placeholder • version block – Prokka version and database metadata

Parameters

list of SeqRecord

A list of Biopython SeqRecord objects already parsed from a Prokka GenBank file. Must contain at least one record. The COMMENT field is expected to contain Prokka metadata.

str

Path to the output JSON file to be written.

Returns

bool True if the JSON was successfully written.

Raises

ValueError If records is empty.

Notes

• Features are processed in a fixed Bakta-like order: ["tRNA", "tmRNA", "rRNA", "misc_RNA", "repeat_region", "CDS", "assembly_gap"] • Feature IDs are generated in Bakta-style using the locus tag prefix. • Per-contig sorting is performed by genomic start coordinate. • Runtime values in the run block are placeholders (duration = "0.00 min"). • The function does not validate that the GenBank file truly originates from Prokka; that should be checked beforehand.

Source code in src/baktfold/io/prokka_gbk_to_json.py

def prokka_gbk_to_json(records, output_json):
    """
    Convert Prokka-generated GenBank SeqRecord objects into a Bakta-style JSON
    annotation file.

    This function takes one or more Biopython SeqRecord objects (typically parsed
    from a Prokka GenBank file) and reconstructs a JSON structure following the
    Bakta output schema. It extracts genome metadata, statistics, annotated features,
    nucleotide sequences, and Prokka version information. Features are converted
    to Bakta-compatible dictionaries and sorted in the same order Bakta expects.

    The JSON file contains:
      • `genome` block – high-level organism metadata (genus, species, strain, etc.)
      • `stats` block – genome statistics derived from all records
      • `features` block – all features converted to Bakta-style objects, sorted
        by feature type and genomic position
      • `sequences` block – contig/sequence entries in Bakta format
      • `run` block – timestamps and duration placeholder
      • `version` block – Prokka version and database metadata

    Parameters
    ----------
    records : list of SeqRecord
        A list of Biopython SeqRecord objects already parsed from a Prokka GenBank
        file. Must contain at least one record. The COMMENT field is expected to
        contain Prokka metadata.

    output_json : str
        Path to the output JSON file to be written.

    Returns
    -------
    bool
        True if the JSON was successfully written.

    Raises
    ------
    ValueError
        If `records` is empty.

    Notes
    -----
    • Features are processed in a fixed Bakta-like order:
        ["tRNA", "tmRNA", "rRNA", "misc_RNA", "repeat_region", "CDS", "assembly_gap"]
    • Feature IDs are generated in Bakta-style using the locus tag prefix.
    • Per-contig sorting is performed by genomic start coordinate.
    • Runtime values in the `run` block are placeholders (duration = "0.00 min").
    • The function does not validate that the GenBank file truly originates from
      Prokka; that should be checked beforehand.
    """

    complete = False

    # records = list(SeqIO.parse(genbank_path, "genbank"))

    if len(records) == 0:
        raise ValueError("No GenBank records found.")
    elif len(records) >= 1:
        prokka_version = parse_prokka_version(records[0])

    translation_table = get_transl_table(records)

    bakta_id_prefix = get_bakta_style_id_from_locus_tag(records)

    # ----------------------------
    # Genome block 
    # ----------------------------

    genome_block = {
        "genus": None,
        "species": None,
        "strain": None,
        "taxon": None,
        "complete": True,
        "gram": "?",
        "translation_table": translation_table
    }

    # ----------------------------
    # Stats - on whole GBK
    # ----------------------------
    stats_block = calc_genome_stats(records)

    # ----------------------------
    # Features block
    # ----------------------------

    # order by id creation block to match how ids are generated bakta
    ORDER = ["tRNA", "tmRNA", "rRNA", "misc_RNA", "repeat_region", "CDS", "assembly_gap"]
    # bakta has oriC detection too - prokka doesn't I think so leaving it out 

    features = []
    i = 1
    for rec in records:
        for ftype in ORDER:
            for feat in rec.features:
                if feat.type != ftype:
                    continue

                id = f"{bakta_id_prefix}_{i}"

                if ftype == "CDS":
                    features.append(convert_cds_feature(feat, rec, translation_table, id))
                elif ftype == "tRNA":
                    features.append(convert_trna_feature(feat, rec, id))
                elif ftype == "tmRNA":
                    features.append(convert_tmrna_feature(feat, rec, id))
                elif ftype == "rRNA":
                    features.append(convert_rrna_feature(feat, rec, id))
                elif ftype == "misc_RNA":
                    features.append(convert_misc_rna_feature(feat, rec, id))
                elif ftype == "repeat_region":
                    features.append(convert_repeat_region_feature(feat, rec, id))
                elif ftype == "assembly_gap":
                    features.append(convert_assembly_gap_feature(feat, rec, id))

                i +=1


    # ----------------------------
    # Sort features within each contig like Bakta
    # ----------------------------

    features_by_contig = defaultdict(list)
    for f in features:
        features_by_contig[f["sequence"]].append(f)

    # Sort each contig's features by start and flatten back
    sorted_features = []
    for contig_id in features_by_contig:
        sorted_features.extend(sorted(features_by_contig[contig_id], key=lambda x: x["start"]))

    # Replace the original features list
    features = sorted_features

    # ----------------------------
    # Sequences block
    # ----------------------------

    sequences = []
    for rec in records:
        sequences.append(build_bakta_sequence_entry(rec))

    # just to put in a time
    start_time = datetime.now()

    bakta_json = {
        "genome": genome_block,
        "stats": stats_block,
        "features": features,
        "sequences": sequences,
        "run": {
            "start": start_time.strftime("%Y-%m-%d %H:%M:%S"),
            "end": start_time.strftime("%Y-%m-%d %H:%M:%S"),       
            "duration": "0.00 min"   
        },
        "version": {
            "prokka": prokka_version,  
            "db": {
                "version": prokka_version,
                "type": "prokka_dbs"
            }
        }
    }


    Path(output_json).parent.mkdir(parents=True, exist_ok=True)
    with open(output_json, "w") as fh:
        json.dump(bakta_json, fh, indent=4)
        complete = True

    return complete

`random_n_letter_id(n=4)`

generates a n letter id prefix

n=2 to append to Prokka locus tag for bakta id to make it different n=10 if the locus tag is somehow missing (should never happen)

Source code in src/baktfold/io/prokka_gbk_to_json.py

def random_n_letter_id(n=4):
    """
    generates a n letter id prefix 

    n=2 to append to  Prokka locus tag  for bakta id to make it different
    n=10 if the locus tag is somehow missing (should never happen) 
    """
    return ''.join(random.choices(string.ascii_uppercase, k=n))

Module for manipulating genbank files some taken from phynteny https://github.com/susiegriggo/Phynteny

`get_genbank(genbank)`

Convert a GenBank file to a dictionary.

This function reads a GenBank file and converts it into a dictionary.

Parameters:

Name	Type	Description	Default
`genbank`	`Path`	Path to the GenBank file.	required

Returns:

Name	Type	Description
`dict`	`dict`	A dictionary representation of the GenBank file.

Raises:

Type	Description
`ValueError`	If the provided file is not a GenBank file.

Source code in src/baktfold/io/handle_genbank.py

def get_genbank(genbank: Path) -> dict:
    """
    Convert a GenBank file to a dictionary.

    This function reads a GenBank file and converts it into a dictionary.

    Args:
        genbank (Path): Path to the GenBank file.

    Returns:
        dict: A dictionary representation of the GenBank file.

    Raises:
        ValueError: If the provided file is not a GenBank file.
    """

    logger.info(f"Checking if input {genbank} is a Genbank format file")
    logger.info(f"If so, also detecting the likely input style out of Pharokka, Bakta and NCBI Refseq style.")
    def parse_records(handle):
        """
    Parses a genbank file and returns a list of SeqRecords.

    Args:
      file_path (str): The path to the genbank file to parse.
      file_format (str): The format of the genbank file. Defaults to 'genbank'.

    Returns:
      list: A list of SeqRecords parsed from the genbank file.

    Examples:
      >>> parse_records('example.gb')
      [SeqRecord(seq=Seq('ATGC'), id='example', name='example', description='example', dbxrefs=[]), ...]
    """
        try:
            records = list(SeqIO.parse(handle, "gb"))
            if not records:
                return {}, None
            gb_dict = {record.id: record for record in records}
            record = records[0]

            comment = record.annotations.get("comment", "")
            cds_feature = next((f for f in record.features if f.type == "CDS"), None)

            if cds_feature is None:
                logger.error(f"{genbank} appears to be a Genbank formatted file but no CDS was found. Please check your input.")
                return gb_dict, None

            # Check if 'Bakta' appears in the Comment - will appear there
            if "Bakta" in comment and "locus_tag" in cds_feature.qualifiers:
                logger.info(f"Detected Bakta style input Genbank. Using locus_tag qualifier from Bakta as the CDS IDs for Phold.")
                method = "Bakta"
            else:
                if "phrog" not in cds_feature.qualifiers and "protein_id" in cds_feature.qualifiers:
                    logger.info(f"Detected NCBI Refseq style input Genbank. Using protein_id qualifier as the CDS IDs for Phold.")
                    method = "NCBI"
                elif "phrog" in cds_feature.qualifiers and "ID" in cds_feature.qualifiers:
                    logger.info(f"Detected Pharokka style input Genbank. Using ID qualifier from Pharokka as the CDS IDs for Phold.")
                    method = "Pharokka"
                else:
                    logger.error(
                        f"Feature {cds_feature} could not be parsed. Therefore, the input style format for {genbank} could not be detected. Please check your input."
                    )
                    # Bind ``method`` so the return below never raises
                    # ``UnboundLocalError`` (which the broad except would
                    # mislabel as "not a genbank file").
                    method = None
            return identify_long_ids(gb_dict), method
        except Exception as e:
            logger.warning(f"{genbank} is not a genbank file: {e}")
            return {}, None

    try:
        if is_gzip_file(genbank.strip()):
            with gzip.open(genbank.strip(), "rt") as handle:
                return parse_records(handle)
        else:
            with open(genbank.strip(), "rt") as handle:
                return parse_records(handle)
    except Exception as e:
        logger.warning(f"{genbank} is not a genbank file: {e}")
        return {}, None

`get_proteins(fasta)`

Convert an Amino Acid FASTA file to a dictionary.

This function reads a AA FASTA file and converts it into a dictionary.

Parameters:

Name	Type	Description	Default
`fasta`	`Path`	Path to the FASTA file.	required

Returns:

Name	Type	Description
`dict`	`dict`	A dictionary representation of the FASTA file.

Raises:

Type	Description
`ValueError`	If the provided file is not a FASTA file.

Source code in src/baktfold/io/handle_genbank.py

def get_proteins(fasta: Path) -> dict:
    """
    Convert an Amino Acid FASTA file to a dictionary.

    This function reads a AA FASTA file and converts it into a dictionary.

    Args:
        fasta (Path): Path to the FASTA file.

    Returns:
        dict: A dictionary representation of the FASTA file.

    Raises:
        ValueError: If the provided file is not a FASTA file.
    """

    if is_gzip_file(fasta.strip()):
        try:
            fasta_dict = {}
            with gzip.open(fasta.strip(), "rt") as handle:
                sequence_id = ""
                sequence = ""
                for line in handle:
                    line = line.strip()
                    if line.startswith(">"):
                        if sequence_id:
                            fasta_dict[sequence_id] = sequence
                        sequence_id = line[1:]
                        sequence = ""
                    else:
                        sequence += line
                if sequence_id:
                    fasta_dict[sequence_id] = sequence
        except ValueError:
            logger.error(f"{fasta.strip()} is not a FASTA file!")
            raise

    else:
        try:
            fasta_dict = {}
            with open(fasta.strip(), "rt", errors="ignore") as handle:
                sequence_id = ""
                sequence = ""
                for line in handle:
                    line = line.strip()
                    if line.startswith(">"):
                        if sequence_id:
                            fasta_dict[sequence_id] = sequence
                        sequence_id = line[1:]
                        sequence = ""
                    else:
                        sequence += line
                if sequence_id:
                    fasta_dict[sequence_id] = sequence
        except ValueError:
            logger.error(f"{fasta.strip()} is not a FASTA file!")
            raise

    return fasta_dict

`identify_long_ids(gb_dict)`

Checks all feature IDs in gb_dict. If longer than 54 chars (line break from Pharokka/biopython reading GBK files), removes the space

Parameters:

Name	Type	Description	Default
`dict`		A dictionary representation of the GenBank file.	required

Returns:

Name	Type	Description
`dict`	`dict`	A dictionary representation of the GenBank file.

Source code in src/baktfold/io/handle_genbank.py

def identify_long_ids(gb_dict: dict) -> dict:
    """

    Checks all feature IDs in gb_dict. If longer than 54 chars (line break from Pharokka/biopython reading GBK files), removes the space

    Args:
        dict: A dictionary representation of the GenBank file.

    Returns:
        dict: A dictionary representation of the GenBank file.
    """

    # remove spaces in ID/locus tag
    for record_id, record in gb_dict.items():
        for cds_feature in record.features:
            try:
                # if pharokka > 54 char IDs/locus tage, phold/biopython will parse with a space
                # no spaces in
                # for really long CDS IDs (over 54 chars), a space will be introduced
                # this is because the ID will go over a second line
                # weird bug noticed it on the Mgnify contigs annotated with Pharokka
                cds_id = cds_feature.qualifiers["ID"][0]
                if len(cds_id) >= 54:
                    logger.warning(
                        f"The CDS ID is {cds_id} is longer than 54 characters. It is recommended that you use short contig headers (which will therefore lead to shorter CDS ids)."
                    )
                    cds_feature.qualifiers["ID"][0] = cds_feature.qualifiers["ID"][
                        0
                    ].replace(" ", "")
            except Exception:
                # will be GenBank/NCBI formatted
                # ID isn't a field and should be properly formatted - famous last words probably
                continue

    return gb_dict

`is_gzip_file(f)`

Method copied from Phispy see https://github.com/linsalrob/PhiSpy/blob/master/PhiSpyModules/helper_functions.py

This is an elegant solution to test whether a file is gzipped by reading the first two characters. I also use a version of this in fastq_pair if you want a C version :) See https://stackoverflow.com/questions/3703276/how-to-tell-if-a-file-is-gzip-compressed for inspiration

Parameters:

Name	Type	Description	Default
`f`	`Path`	The file to test.	required

Returns:

Name	Type	Description
`bool`	`bool`	True if the file is gzip compressed, otherwise False.

Source code in src/baktfold/io/handle_genbank.py

def is_gzip_file(f: Path) -> bool:
    """
    Method copied from Phispy see https://github.com/linsalrob/PhiSpy/blob/master/PhiSpyModules/helper_functions.py

    This is an elegant solution to test whether a file is gzipped by reading the first two characters.
    I also use a version of this in fastq_pair if you want a C version :)
    See https://stackoverflow.com/questions/3703276/how-to-tell-if-a-file-is-gzip-compressed for inspiration
    Args:
        f (Path): The file to test.

    Returns:
        bool: True if the file is gzip compressed, otherwise False.
    """
    with open(f, "rb") as i:
        return binascii.hexlify(i.read(2)) == b"1f8b"

`open_protein_fasta_file(input_file)`

Open a fasta file, whether it is gzipped or plain text.

input_file (str): The path to the fasta file, either gzipped or plain.

Union[IO[str], gzip.GzipFile]: A file handle to the opened fasta file.

Source code in src/baktfold/io/handle_genbank.py

def open_protein_fasta_file(input_file: str) -> Union[IO[str], gzip.GzipFile]:
    """
    Open a fasta file, whether it is gzipped or plain text.

    Parameters:
    input_file (str): The path to the fasta file, either gzipped or plain.

    Returns:
    Union[IO[str], gzip.GzipFile]: A file handle to the opened fasta file.
    """
    input_file = Path(input_file)

    if input_file.suffix == ".gz":
        return gzip.open(input_file, "rt")
    else:
        return open(input_file, "r")

`add_optional_qualifiers(entry, qualifiers, single_valued=None, multi_valued=None)`

Add optional INSDC qualifiers to a feature entry dict in Bakta style.

Parameters

dict

The feature dictionary being built.

dict

The qualifiers dictionary from Bio.SeqFeature.

set or list

Qualifiers expected to be single-valued (take the first if multiple).

set or list

Qualifiers that can have multiple values (keep as list if >1, else single value).

Source code in src/baktfold/io/eukaryotic_to_json.py

def add_optional_qualifiers(entry, qualifiers, single_valued=None, multi_valued=None):
    """
    Add optional INSDC qualifiers to a feature entry dict in Bakta style.

    Parameters
    ----------
    entry : dict
        The feature dictionary being built.
    qualifiers : dict
        The qualifiers dictionary from Bio.SeqFeature.
    single_valued : set or list
        Qualifiers expected to be single-valued (take the first if multiple).
    multi_valued : set or list
        Qualifiers that can have multiple values (keep as list if >1, else single value).
    """

    single_valued = single_valued or set()
    multi_valued = multi_valued or set()

    # Multi-valued qualifiers
    for key in multi_valued:
        vals = qualifiers.get(key)
        if vals:
            entry[key] = vals if len(vals) > 1 else vals[0]

    # Single-valued qualifiers
    for key in single_valued:
        vals = qualifiers.get(key)
        if vals:
            if key == "locus_tag":
                entry["locus"] = vals[0] # this is what bakta needs
            else:
                entry[key] = vals[0]

`build_bakta_sequence_entry(rec)`

Convert a SeqRecord into a Bakta-style sequence entry. Missing fields are filled with None.

Source code in src/baktfold/io/eukaryotic_to_json.py

def build_bakta_sequence_entry(rec):
    """
    Convert a  SeqRecord into a Bakta-style sequence entry.
    Missing fields are filled with None.
    """

    seq = str(rec.seq)

    # -----------------------------------------
    # Extract source feature qualifiers - genbank always has source field
    # -----------------------------------------
    source_feat = next((f for f in rec.features if f.type == "source"), None)

    source_qualifiers = {}

    # Defaults (None) for all fields
    mol_type = None
    organism = None
    strain = None
    db_xref = None
    note = None

    plasmid = None
    chromosome = None
    completeness_hint = None

    if source_feat:
        q = source_feat.qualifiers

        mol_type = q.get("mol_type", [None])[0]
        organism = q.get("organism", [None])[0]
        strain = q.get("strain", [None])[0]
        note = q.get("note", [None])[0]

        if "db_xref" in q:
            val = q["db_xref"]
            db_xref = val[0] if len(val) == 1 else val

        plasmid = q.get("plasmid", [None])[0]
        chromosome = q.get("chromosome", [None])[0]
        completeness_hint = q.get("completeness", [None])[0]

    # -----------------------------------------
    # Infer topology
    # -----------------------------------------
    topology = rec.annotations.get("topology")
    if topology not in {"linear", "circular"}:
        topology = "linear"

    # -----------------------------------------
    # Infer type
    # -----------------------------------------
    if plasmid is not None or "plasmid" in rec.annotations:
        seq_type = "plasmid"
    elif chromosome is not None or "chromosome" in rec.annotations:
        seq_type = "chromosome"
    else:
        seq_type = "contig"

    # -----------------------------------------
    # Infer completeness (conservative)
    # -----------------------------------------
    complete = False

    if topology == "circular":
        complete = True
    elif completeness_hint is not None and completeness_hint.lower() == "complete":
        complete = True
    elif note and "complete genome" in note.lower():
        complete = True

    # -----------------------------------------
    # Infer genetic codefor description
    # -----------------------------------------
    gcode = None

    if "genetic_code" in rec.annotations:
        gcode = rec.annotations["genetic_code"]
    elif "gcode" in rec.annotations:
        gcode = rec.annotations["gcode"]
    elif source_feat and "transl_table" in source_feat.qualifiers:
        gcode = source_feat.qualifiers["transl_table"][0]

    # Conservative fallback to 1 for euks
    if gcode is None:
        gcode = 1 

    description_parts = [
        f"[gcode={gcode}]",
        f"[topology={topology}]",
    ]

    description = " ".join(description_parts)

    # -----------------------------------------
    # Build entry
    # -----------------------------------------
    entry = {
        "id": rec.id,
        "description": description,
        "nt": seq,
        "length": len(seq),
        "complete": complete,
        "type": seq_type,
        "topology": topology,
        "simple_id": rec.id,
        "orig_id": rec.id,
        "orig_description": None,
    }

    # -----------------------------------------
    # Add source qualifiers if present
    # -----------------------------------------
    if organism is not None:
        entry["organism"] = organism
    if mol_type is not None:
        entry["mol_type"] = mol_type
    if strain is not None:
        entry["strain"] = strain
    if db_xref is not None:
        entry["db_xref"] = db_xref
    if note is not None:
        entry["note"] = note


    # this is from bakta
    # "id": "contig_1",
    # "description": "[gcode=11] [topology=linear]",
    # "nt": "AT"
    # "length": 5165988,
    # "complete": false,
    # "type": "contig",
    # "topology": "linear",
    # "simple_id": "contig_1",
    # "orig_id": "GCF_002368115_000000000001",
    # "orig_description": ""

    # Add source qualifiers only if they exist
    if organism is not None:
        entry["organism"] = organism

    if mol_type is not None:
        entry["mol_type"] = mol_type

    if strain is not None:
        entry["strain"] = strain

    if db_xref is not None:
        entry["db_xref"] = db_xref

    if note is not None:
        entry["note"] = note

    return entry

`calc_genome_stats(records)`

Compute correct genome stats (size, GC, N-ratio, N50, N90) for records from a multi-contig GenBank file.

Source code in src/baktfold/io/eukaryotic_to_json.py

def calc_genome_stats(records):
    """
    Compute correct genome stats (size, GC, N-ratio, N50, N90) for records from a multi-contig
     GenBank file.
    """

    if not records:
        raise ValueError("No GenBank records found.")

    # lengths of all contigs
    contig_lengths = [len(r.seq) for r in records]
    total_length = sum(contig_lengths)

    # concatenate sequences for global GC + N calculation
    full_seq = "".join(str(r.seq) for r in records)

    # GC as fraction (Bakta wants 0–1)
    gc_perc = gc_fraction(full_seq)

    # N-ratio
    n_ratio = full_seq.count("N") / total_length

    # ---------- N50 / N90 ----------
    sorted_lengths = sorted(contig_lengths, reverse=True)

    def nx_metric(sorted_lens, total, threshold):
        """
        Generic N{threshold} function.
        threshold: 0.5 for N50, 0.9 for N90
        """
        cutoff = total * threshold
        running = 0
        for l in sorted_lens:
            running += l
            if running >= cutoff:
                return l
        return sorted_lens[-1]  # fallback (should not happen)

    n50 = nx_metric(sorted_lengths, total_length, 0.5)
    n90 = nx_metric(sorted_lengths, total_length, 0.9)

    return {
        "size": total_length,
        "gc": gc_perc,
        "n_ratio": n_ratio,
        "n50": n50,
        "n90": n90,
        "coding_ratio": None  
    }

`convert_assembly_gap_feature(feature, rec, id)`

Convert a GenBank assembly_gap feature to a simplified Bakta-style 'gap' feature.

Parameters:

Name	Type	Description	Default
`feature`		Bio.SeqFeature The assembly_gap feature from the GBK.	required
`rec`		Bio.SeqRecord The full GenBank record containing the sequence.	required

Returns:

Name	Type	Description
`dict`		Simplified Bakta-style gap feature.

Source code in src/baktfold/io/eukaryotic_to_json.py

def convert_assembly_gap_feature(feature, rec, id):
    """
    Convert a GenBank assembly_gap feature to a simplified Bakta-style 'gap' feature.

    Parameters:
        feature: Bio.SeqFeature
            The assembly_gap feature from the GBK.
        rec: Bio.SeqRecord
            The full GenBank record containing the sequence.

    Returns:
        dict: Simplified Bakta-style gap feature.
    """

    # Coordinates (1-based)
    strand = "." # bakta uses "." for strand on gaps
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)         # already inclusive after conversion

    qualifiers = feature.qualifiers

    #  may provide estimated_length but coordinates already give an exact span
    est_len = qualifiers.get("estimated_length", [None])[0]
    if est_len is not None:
        length = int(est_len)
    else:
        length = stop - start + 1  # fallback from coordinates


    gap_entry = {
        "type": "gap",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "length": length,
        "id": id,
    }

    # no need to add estimated length separately - it is covered by length in the json 

    # if est_len:
    #     gap_entry["estimated_length"] = est_len

    return gap_entry

`convert_cds_feature(feature, seq_record, translation_table, id)`

Convert a Prokka CDS Biopython SeqFeature to a Bakta CDS JSON entry.

Source code in src/baktfold/io/eukaryotic_to_json.py

def convert_cds_feature(feature, seq_record, translation_table, id):
    """
    Convert a Prokka CDS Biopython SeqFeature to a Bakta CDS JSON entry.
    """

    # ----------- Location info -----------

    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)         # already inclusive after conversion

    # Handle CompoundLocation (join)
    starts = None
    stops = None

    if feature.location.__class__.__name__ == "CompoundLocation":
        starts = []
        stops = []
        for part in feature.location.parts:
            starts.append(int(part.start) + 1)
            stops.append(int(part.end))


    # frame: Bakta uses 1/2/3; Prokka codon_start is ["1","2","3"]
    codon_start = int(feature.qualifiers.get("codon_start", ["1"])[0])
    frame = codon_start

    qualifiers = feature.qualifiers

    # ----------- Basic qualifiers -----------
    gene = qualifiers.get("gene", [None])[0]
    product = qualifiers.get("product", [None])[0]


    # fall back to start_stop_strand if there is no locus tag
    if 'locus_tag' in qualifiers and qualifiers['locus_tag']:
        locus_tag = qualifiers['locus_tag'][0]
    else:
        logger.warning(f"No locus_tag found for feature {id}")
        locus_tag = f"{GENOME_RANDOM_BACKUP_LOCUSTAG_STR}_{start}_{stop}"
        logger.warning(f"Generating a locus_tag: {locus_tag}")

    note = qualifiers.get("note", [None])[0]
    locus = locus_tag

    # pseudo

    protein_id = qualifiers.get("protein_id", [None])[0]

    # ----------- Extract nucleotides -----------
    nt_seq = feature.extract(seq_record.seq)
    nt = str(nt_seq)

    # ----------- Extract amino acids -----------
    aa = feature.qualifiers.get("translation", [""])[0]

    # Compute translation if Prokka didn't provide it
    if not aa:
        try:
            aa = str(nt_seq.translate(table=translation_table, cds=True))
        except Exception:
            aa = ""

    # ----------- aa MD5 hexdigest -----------
    aa_hexdigest = hashlib.md5(aa.encode()).hexdigest()

    # ----------- Hypothetical? -----------
    hypothetical = product is None or "hypothetical protein" in product.lower()

    # ----------- Compute protein stats -----------
    seq_stats = None
    if aa:
        try:
            analysed = ProteinAnalysis(aa)
            seq_stats = {
                "molecular_weight": analysed.molecular_weight(),
                "isoelectric_point": analysed.isoelectric_point()
            }
        except Exception:
            seq_stats = None

    # Get existing db_xref list or default to [so.SO_CDS.id]
    db_xref = qualifiers.get("db_xref", [so.SO_CDS.id])

    # Append so.SO_CDS.id only if it’s not already present
    if so.SO_CDS.id not in db_xref:
        db_xref.append(so.SO_CDS.id)

    # ----------- Make Bakta-format dict -----------
    bakta_cds = {
        "type": "cds",
        "sequence": seq_record.id,
        "start": start,
        "stop": stop,
        "starts": starts,
        "stops": stops,
        "strand": strand,
        "frame": frame,
        "gene": gene,
        "product": product,
        "db_xrefs": db_xref,  
        "nt": nt,
        "aa": aa,
        "aa_hexdigest": aa_hexdigest,
        "start_type": None,
        "rbs_motif": None,
        "genes": [],
        "note": note,
        "seq_stats": seq_stats,
        "id": id,
        "locus": locus,
        "protein_id": protein_id
    }

# Feature Key           CDS

# Definition            coding sequence; sequence of nucleotides that
#                       corresponds with the sequence of amino acids in a
#                       protein (location includes stop codon); 
#                       feature includes amino acid conceptual translation.

# Optional qualifiers   /allele="text"
#                       /artificial_location="[artificial_location_value]"
#                       /circular_RNA
#                       /codon_start=<1 or 2 or 3>
#                       /db_xref="<database>:<identifier>"
#                       /EC_number="text"
#                       /exception="[exception_value]"
#                       /experiment="[CATEGORY:]text"
#                       /function="text"
#                       /gene="text"
#                       /gene_synonym="text"
#                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
#                       /locus_tag="text" (single token)
#                       /map="text"
#                       /note="text"
#                       /number=unquoted text (single token)
#                       /old_locus_tag="text" (single token)
#                       /operon="text"
#                       /product="text"
#                       /protein_id="<identifier>"
#                       /pseudo
#                       /pseudogene="TYPE"
#                       /ribosomal_slippage
#                       /standard_name="text"
#                       /translation="text"
#                       /transl_except=(pos:<location>,aa:<amino_acid>)
#                       /transl_table =<integer>
#                       /trans_splicing

    multi_valued = {"EC_number", "exception", "experiment", "function",  "gene_synonym",  "inference", }
    single_valued = {"allele", "artificial_location",  "map", "number",  "old_locus_tag", "operon", "phenotype", "pseudogene", "standard_name", "transl_except", "transl_table"}

    add_optional_qualifiers(bakta_cds, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers)
    for flag in ["circular_RNA", "pseudo", "ribosomal_slippage", "trans_splicing"]:
        if flag in qualifiers:
            bakta_cds[flag] = flag in qualifiers

    if hypothetical:
        bakta_cds["hypothetical"] = True

    return bakta_cds

`convert_exon_feature(feature, rec, id)`

Convert a GenBank exon feature to a simplified Bakta-style 'exon' feature.

Parameters

Bio.SeqFeature

The exon feature from the GenBank record.

Bio.SeqRecord

The full GenBank record.

str

Unique feature ID.

Returns

dict Bakta-style exon feature.

Source code in src/baktfold/io/eukaryotic_to_json.py

def convert_exon_feature(feature, rec, id):
    """
    Convert a GenBank exon feature to a simplified Bakta-style 'exon' feature.

    Parameters
    ----------
    feature : Bio.SeqFeature
        The exon feature from the GenBank record.
    rec : Bio.SeqRecord
        The full GenBank record.
    id : str
        Unique feature ID.

    Returns
    -------
    dict
        Bakta-style exon feature.
    """

    # Extract location
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)         # already inclusive after conversion

    qualifiers = feature.qualifiers

    db_xrefs = qualifiers.get("db_xref", [])

# Optional qualifiers   /allele="text"
#                       /db_xref="<database>:<identifier>"
#                       /EC_number="text"
#                       /experiment="[CATEGORY:]text"
#                       /function="text"
#                       /gene="text"
#                       /gene_synonym="text"
#                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
#                       /locus_tag="text" (single token)
#                       /map="text"
#                       /note="text"
#                       /number=unquoted text (single token)
#                       /old_locus_tag="text" (single token)
#                       /product="text"
#                       /pseudo
#                       /pseudogene="TYPE"
#                       /standard_name="text"
#                       /trans_splicing


    # Extract commonly used INSDC qualifiers
    exon_entry = {
            "type": "exon",
            "sequence": rec.id,
            "start": start,
            "stop": stop,
            "strand": strand,
            "id": id,
            "db_xrefs": db_xrefs
        }

    multi_valued = {"EC_number","experiment","function",  "gene_synonym",  "inference","note" }
    single_valued = {"allele", "gene", "locus_tag", "map", "number",   "old_locus_tag", "operon", "pseudogene", "standard_name"   }

    add_optional_qualifiers(exon_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers)
    for flag in ["pseudo", "trans_splicing"]:
        if flag in qualifiers:
            exon_entry[flag] = True

    return exon_entry

`convert_gene_feature(feature, rec, id)`

Convert a Funannotate GenBank gene feature to Bakta-style JSON.

Parameters:

Name	Type	Description	Default
`feature`		Bio.SeqFeature The rRNA feature from the GBK.	required
`rec`		str The record from the GBK.	required

Returns:

Name	Type	Description
`dict`		Bakta-style rRNA feature

Source code in src/baktfold/io/eukaryotic_to_json.py

def convert_gene_feature(feature, rec, id):
    """
    Convert a Funannotate GenBank gene feature to Bakta-style JSON.

    Parameters:
        feature: Bio.SeqFeature
            The rRNA feature from the GBK.
        rec: str
            The record from the GBK.
    Returns:
        dict: Bakta-style rRNA feature
    """

    # Extract location
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)         # already inclusive after conversion


    qualifiers = feature.qualifiers

    # fall back to start_stop_strand if there is no locus tag
    if 'locus_tag' in qualifiers and qualifiers['locus_tag']:
        locus_tag = qualifiers['locus_tag'][0]
    else:
        logger.warning(f"No locus_tag found for feature {id}")
        locus_tag = f"{GENOME_RANDOM_BACKUP_LOCUSTAG_STR}_{start}_{stop}"
        logger.warning(f"Generating a locus_tag: {locus_tag}")



    gene_entry = {
        "type": "gene",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "db_xrefs": [so.SO_GENE.id], 
        "id": id,
        "locus": locus_tag
    }


# Feature Key           gene 


# Definition            region of biological interest identified as a gene 
#                       and for which a name has been assigned;

# Optional qualifiers   /allele="text"
#                       /db_xref="<database>:<identifier>"
#                       /experiment="[CATEGORY:]text"
#                       /function="text"
#                       /gene="text"
#                       /gene_synonym="text"
#                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
#                       /locus_tag="text" (single token)
#                       /map="text"
#                       /note="text"
#                       /old_locus_tag="text" (single token)
#                       /operon="text"
#                       /product="text"
#                       /pseudo
#                       /pseudogene="TYPE"
#                       /phenotype="text"
#                       /standard_name="text"
#                       /trans_splicing


# Comment               the gene feature describes the interval of DNA that 
#                       corresponds to a genetic trait or phenotype; the feature is,
#                       by definition, not strictly bound to it's positions at the 
#                       ends;  it is meant to represent a region where the gene is 
#                       located.


    multi_valued = {"experiment", "function",  "gene_synonym",  "inference", "note" }
    single_valued = {"allele",  "map",  "old_locus_tag", "operon", "phenotype", "standard_name"}

    qualifiers = feature.qualifiers

    add_optional_qualifiers(gene_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers)
    for flag in ["pseudo", "trans_splicing"]:
        if flag in qualifiers:
            gene_entry[flag] = flag in qualifiers

    return gene_entry

`convert_mat_peptide_feature(feature, rec, id)`

Convert a mat_peptide feature to a Bakta-style feature.

mus musculus chrom 1 NC_000067

Parameters:

Name	Type	Description	Default
`feature`		Bio.SeqFeature The mRNA feature from the GBK.	required
`rec`		Bio.SeqRecord The record containing the sequence.	required

Returns:

Name	Type	Description
`dict`		Bakta-style misc_RNA feature.

Source code in src/baktfold/io/eukaryotic_to_json.py

def convert_mat_peptide_feature(feature, rec, id):
    """
    Convert a mat_peptide feature to a Bakta-style feature.

    mus musculus chrom 1 NC_000067

    Parameters:
        feature: Bio.SeqFeature
            The mRNA feature from the GBK.
        rec: Bio.SeqRecord
            The record containing the sequence.

    Returns:
        dict: Bakta-style misc_RNA feature.
    """

    seq = str(rec.seq)


    # Extract location
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)         # already inclusive after conversion

    # Handle CompoundLocation (join)
    starts = None
    stops = None

    if feature.location.__class__.__name__ == "CompoundLocation":
        starts = []
        stops = []
        for part in feature.location.parts:
            starts.append(int(part.start) + 1)
            stops.append(int(part.end))

    so_code =  so.SO_MAT_PEPTIDE.id

    # Get existing db_xref list or default to [so.SO_CDS.id]
    db_xrefs = feature.qualifiers.get("db_xref", [so_code])

    # Append only if it’s not already present
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)

    qualifiers = feature.qualifiers


    # Extract commonly used INSDC qualifiers
    mat_peptide_entry = {
            "type": "mat_peptide",
            "sequence": rec.id,
            "start": start,
            "stop": stop,
            # Join support
            "starts": starts,
            "stops": stops,
            "strand": strand,
            "id": id,
            "db_xrefs": db_xrefs
        }


# Feature Key           mat_peptide


# Definition            mature peptide or protein coding sequence; coding
#                       sequence for the mature or final peptide or protein
#                       product following post-translational modification; the
#                       location does not include the stop codon (unlike the
#                       corresponding CDS);

# Optional qualifiers   /allele="text"
#                       /db_xref="<database>:<identifier>"
#                       /EC_number="text"
#                       /experiment="[CATEGORY:]text"
#                       /function="text"
#                       /gene="text"
#                       /gene_synonym="text"
#                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
#                       /locus_tag="text" (single token)
#                       /map="text"
#                       /note="text"
#                       /old_locus_tag="text" (single token)
#                       /product="text"
#                       /pseudo
#                       /pseudogene="TYPE"
#                       /standard_name="text"

    multi_valued = {"EC_number","experiment", "function",  "gene_synonym",  "inference","note" }
    single_valued = {"allele", "gene", "locus_tag", "map", "number",   "old_locus_tag", "operon", "pseudogene", "standard_name"}

    add_optional_qualifiers(mat_peptide_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers) - no flags
    # for flag in ["pseudo"]:
    #     if flag in qualifiers:
    #         mat_peptide_entry[flag] = True


    #  mat_peptide     complement(join(194724303..194724321,194744661..194744721,
    #                  194746996..194747031,194750435..194750476,
    #                  194757818..194757865,194759962..194760144,
    #                  194764890..194765087,194765856..194765944,
    #                  194767641..194767743,194768400..194768583))
    #                  /gene="Cd46"
    #                  /gene_synonym="Mcp"
    #                  /product="Membrane cofactor protein. /id=PRO_0000238971"
    #                  /note="propagated from UniProtKB/Swiss-Prot (O88174.1)"

    return mat_peptide_entry

`convert_misc_feature(feature, rec, id)`

Convert a misc feature to a Bakta-style feature.

Parameters:

Name	Type	Description	Default
`feature`		Bio.SeqFeature The mRNA feature from the GBK.	required
`rec`		Bio.SeqRecord The record containing the sequence.	required

Returns:

Name	Type	Description
`dict`		Bakta-style misc_feature feature.

Source code in src/baktfold/io/eukaryotic_to_json.py

def convert_misc_feature(feature, rec, id):
    """
    Convert a misc feature to a Bakta-style feature.

    Parameters:
        feature: Bio.SeqFeature
            The mRNA feature from the GBK.
        rec: Bio.SeqRecord
            The record containing the sequence.

    Returns:
        dict: Bakta-style misc_feature feature.
    """

    seq = str(rec.seq)

    # Extract location
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)         # already inclusive after conversion

    # Handle CompoundLocation (join)
    starts = None
    stops = None

    if feature.location.__class__.__name__ == "CompoundLocation":
        starts = []
        stops = []
        for part in feature.location.parts:
            starts.append(int(part.start) + 1)
            stops.append(int(part.end))

    qualifiers = feature.qualifiers

    so_code =  so.SO_MISC_REGION.id

    # Get existing db_xref list or default to [so.SO_CDS.id]
    db_xrefs = feature.qualifiers.get("db_xref", [so_code])

    # Append so.SO_CDS.id only if it’s not already present
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)


    misc_feature_entry = {
            "type": "misc_feature",
            "sequence": rec.id,
            "start": start,
            "stop": stop,
            "strand": strand,
            "id": id,

            # Join support
            "starts": starts,
            "stops": stops,

            # Multi-valued
            "db_xrefs": db_xrefs,


        }

    multi_valued = {"experiment", "function",  "gene_synonym",  "inference", "note", "phenotype"}
    single_valued = {"allele", "gene", "locus_tag", "map",    "old_locus_tag", "operon", "product", "standard_name",  "pseudogene"}

    add_optional_qualifiers(misc_feature_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers)
    for flag in ["pseudo",]:
        if flag in qualifiers:
            misc_feature_entry[flag] = True

# Feature Key           misc_feature


# Definition            region of biological interest which cannot be described
#                       by any other feature key; a new or rare feature;

# Optional qualifiers   /allele="text"
#                       /db_xref="<database>:<identifier>"
#                       /experiment="[CATEGORY:]text"
#                       /function="text"
#                       /gene="text"
#                       /gene_synonym="text"
#                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
#                       /locus_tag="text" (single token)
#                       /map="text"
#                       /note="text"
#                       /number=unquoted text (single token)
#                       /old_locus_tag="text" (single token)
#                       /phenotype="text"
#                       /product="text"
#                       /pseudo
#                       /pseudogene="TYPE"
#                       /standard_name="text"

# Comment               this key should not be used when the need is merely to 
#                       mark a region in order to comment on it or to use it in 
#                       another feature's location

    #  misc_feature    join(78488668..78488692,78499322..78499359)
    #                  /gene="Mogat1"
    #                  /gene_synonym="0610030A14Rik; 1110064N14Rik; Dgat2l;
    #                  Dgat2l1; mDC2; MGAT1; WI1-2612I11.1"
    #                  /note="propagated from UniProtKB/Swiss-Prot (Q91ZV4.2);
    #                  transmembrane region"

    #  misc_feature    78179419..78180585
    #                  /standard_name="Pax3 upstream hypaxial enhancer"
    #                  /note="Region: biological region; Derived by automated
    #                  computational analysis using gene prediction method:
    #                  RefSeqFE."
    #                  /function="regulatory_interactions: LOC107980439 | Pax3"
    #                  /db_xref="GeneID:107980442"    

    return misc_feature_entry

`convert_misc_rna_feature(feature, rec, id)`

Convert a GenBank misc_rna feature to a simplified Bakta-style 'misc_rna' feature.

Parameters:

Name	Type	Description	Default
`feature`		Bio.SeqFeature The assembly_gap feature from the GBK.	required
`rec`		Bio.SeqRecord The full GenBank record containing the sequence.	required

Returns:

Name	Type	Description
`dict`		Simplified Bakta-style gap feature.

Source code in src/baktfold/io/eukaryotic_to_json.py

def convert_misc_rna_feature(feature, rec, id):
    """
    Convert a GenBank misc_rna feature to a simplified Bakta-style 'misc_rna' feature.

    Parameters:
        feature: Bio.SeqFeature
            The assembly_gap feature from the GBK.
        rec: Bio.SeqRecord
            The full GenBank record containing the sequence.

    Returns:
        dict: Simplified Bakta-style gap feature.

    """

        # from ensemble genomes
        # misc_RNA        complement(437333..442742)
        #             /gene="YPL060C-A"
        #             /note="transposable_element"
        #             /standard_name="YPL060C-A"

    # Coordinates (1-based)

    # Extract location
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)         # already inclusive after conversion


    qualifiers = feature.qualifiers
    gene = qualifiers.get("gene", [None])[0]

# Feature Key           misc_RNA


# Definition            any transcript or RNA product that cannot be defined by
#                       other RNA keys (prim_transcript, precursor_RNA, mRNA,
#                       5'UTR, 3'UTR, exon, CDS, sig_peptide, transit_peptide,
#                       mat_peptide, intron, polyA_site, ncRNA, rRNA and tRNA);

# Optional qualifiers   /allele="text"
#                       /db_xref="<database>:<identifier>"
#                       /experiment="[CATEGORY:]text"
#                       /function="text"
#                       /gene="text"
#                       /gene_synonym="text"
#                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
#                       /locus_tag="text" (single token)
#                       /map="text"
#                       /note="text"
#                       /old_locus_tag="text" (single token)
#                       /operon="text"
#                       /product="text"
#                       /pseudo
#                       /pseudogene="TYPE"
#                       /standard_name="text"
#                       /trans_splicing

    misc_rna_entry = {
        "type": "misc_RNA", # expects lowercase 
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand, # matches Bakta and is required
        "gene": gene,
        "id": id
    }


    multi_valued = {"experiment", "function",  "gene_synonym",  "inference", "note" }
    single_valued = {"allele", "gene",  "locus_tag", "map",  "old_locus_tag", "operon", "product", "phenotype", "standard_name"}

    qualifiers = feature.qualifiers

    add_optional_qualifiers(misc_rna_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers)
    for flag in ["pseudo", "trans_splicing"]:
        if flag in qualifiers:
            misc_rna_entry[flag] = flag in qualifiers

    return misc_rna_entry

`convert_mobile_element_feature(feature, rec, id)`

Convert a GenBank mobile_element feature to a Bakta-style feature.

Source code in src/baktfold/io/eukaryotic_to_json.py

def convert_mobile_element_feature(feature, rec, id):
    """
    Convert a GenBank mobile_element feature to a Bakta-style feature.
    """

    # Extract location
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)         # already inclusive after conversion

    qualifiers = feature.qualifiers

    # Mandatory qualifier check (INSDC requirement)
    mobile_element_type = qualifiers.get("mobile_element_type", [None])[0]
    if mobile_element_type is None:
        raise ValueError(
            f"mobile_element feature {id} is missing mandatory "
            "/mobile_element_type qualifier"
        )

    so_code =  so.SO_MOBILE_ELEMENT.id

    # Get existing db_xref list or default to [so.SO_CDS.id]
    db_xrefs = feature.qualifiers.get("db_xref", [so_code])

    # Append only if it’s not already present
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)


# Feature Key           mobile_element


# Definition            region of genome containing mobile elements;

# Mandatory qualifiers  /mobile_element_type="<mobile_element_type>
#                       [:<mobile_element_name>]"

# Optional qualifiers   /allele="text"
#                       /db_xref="<database>:<identifier>" 
#                       /experiment="[CATEGORY:]text"
#                       /function="text"
#                       /gene="text"
#                       /gene_synonym="text"
#                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
#                       /locus_tag="text" (single token)
#                       /map="text"
#                       /note="text"
#                       /old_locus_tag="text" (single token)
#                       /rpt_family="text"
#                       /rpt_type=<repeat_type>
#                       /standard_name="text"


    # Extract commonly used INSDC qualifiers
    mobile_element_entry = {
            "type": "mobile_element",
            "sequence": rec.id,
            "start": start,
            "stop": stop,
            "strand": strand,
            "id": id,
            "db_xrefs": db_xrefs,
                    # Mandatory
            "mobile_element_type": mobile_element_type,
        }

    multi_valued = {"experiment", "function",  "gene_synonym",  "inference", "note" }
    single_valued = {"allele", "gene", "locus_tag", "map",    "old_locus_tag", "standard_name", "rpt_family", "rpt_type"}

    add_optional_qualifiers(mobile_element_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers)
    # for flag in ["pseudo"]:
    #   if flag in qualifiers:
    #     mobile_element_entry[flag] = True


    #  mobile_element  57369551..57369723
    #                  /note="Derived by automated computational analysis using
    #                  gene prediction method: RefSeqFE."
    #                  /mobile_element_type="SINE:AmnSINE1"
    #                  /db_xref="GeneID:106707176"

    return mobile_element_entry

`convert_mrna_feature(feature, rec, id)`

Convert a funannotate mrna feature to a Bakta-style feature.

Parameters:

Name	Type	Description	Default
`feature`		Bio.SeqFeature The mRNA feature from the GBK.	required
`rec`		Bio.SeqRecord The record containing the sequence.	required

Returns:

Name	Type	Description
`dict`		Bakta-style mRNA feature.

Source code in src/baktfold/io/eukaryotic_to_json.py

def convert_mrna_feature(feature, rec, id):
    """
    Convert a funannotate mrna feature to a Bakta-style feature.

    Parameters:
        feature: Bio.SeqFeature
            The mRNA feature from the GBK.
        rec: Bio.SeqRecord
            The record containing the sequence.

    Returns:
        dict: Bakta-style mRNA feature.
    """

    # seq = str(rec.seq)

    # Extract location
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)         # already inclusive after conversion

    # Handle CompoundLocation (join)
    starts = None
    stops = None

    if feature.location.__class__.__name__ == "CompoundLocation":
        starts = []
        stops = []
        for part in feature.location.parts:
            starts.append(int(part.start) + 1)
            stops.append(int(part.end))

    else:
        starts = None
        stops = None


    qualifiers = feature.qualifiers


    # fall back to start_stop_strand if there is no locus tag
    if 'locus_tag' in qualifiers and qualifiers['locus_tag']:
        locus_tag = qualifiers['locus_tag'][0]
    else:
        logger.warning(f"No locus_tag found for feature {id}")
        locus_tag = f"{GENOME_RANDOM_BACKUP_LOCUSTAG_STR}_{start}_{stop}"
        logger.warning(f"Generating a locus_tag: {locus_tag}")


    mrna_entry = {
        "type": "mRNA", 
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "starts": starts,
        "stops": stops,
        "strand": strand,
        "db_xrefs": [so.SO_MRNA.id],        
        "id": id,
        "locus": locus_tag
    }



# Feature Key           mRNA


# Definition            messenger RNA; includes 5'untranslated region (5'UTR),
#                       coding sequences (CDS, exon) and 3'untranslated region
#                       (3'UTR);

# Optional qualifiers   /allele="text"
#                       /artificial_location="[artificial_location_value]"
#                       /circular_RNA
#                       /db_xref="<database>:<identifier>"
#                       /experiment="[CATEGORY:]text"
#                       /function="text"
#                       /gene="text"
#                       /gene_synonym="text"
#                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
#                       /locus_tag="text" (single token)
#                       /map="text"
#                       /note="text"
#                       /old_locus_tag="text" (single token)
#                       /operon="text"
#                       /product="text"
#                       /pseudo
#                       /pseudogene="TYPE"
#                       /standard_name="text"
#                       /trans_splicing


    multi_valued = {"experiment", "function",  "gene_synonym",  "inference", "note" }
    single_valued = {"allele", "artificial_location", "gene",  "locus_tag", "map",  "old_locus_tag", "operon", "phenotype", "product", "standard_name"}

    qualifiers = feature.qualifiers

    add_optional_qualifiers(mrna_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers)
    for flag in ["circular_RNA", "pseudo", "trans_splicing"]:
        if flag in qualifiers:
            mrna_entry[flag] = flag in qualifiers

    return mrna_entry

`convert_ncrna_feature(feature, rec, id)`

Convert a ncrna feature to a Bakta-style feature.

Parameters:

Name	Type	Description	Default
`feature`		Bio.SeqFeature The mRNA feature from the GBK.	required
`rec`		Bio.SeqRecord The record containing the sequence.	required

Returns:

Name	Type	Description
`dict`		Bakta-style misc_RNA feature.

Source code in src/baktfold/io/eukaryotic_to_json.py

def convert_ncrna_feature(feature, rec, id):
    """
    Convert a ncrna feature to a Bakta-style feature.

    Parameters:
        feature: Bio.SeqFeature
            The mRNA feature from the GBK.
        rec: Bio.SeqRecord
            The record containing the sequence.

    Returns:
        dict: Bakta-style misc_RNA feature.
    """

    # seq = str(rec.seq)

    # Extract location
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)         # already inclusive after conversion

    # Handle CompoundLocation (join)
    starts = None
    stops = None

    if feature.location.__class__.__name__ == "CompoundLocation":
        starts = []
        stops = []
        for part in feature.location.parts:
            starts.append(int(part.start) + 1)
            stops.append(int(part.end))

    qualifiers = feature.qualifiers

    so_code =  so.SO_NCRNA_GENE.id

    # Get existing db_xref list or default to [so.SO_CDS.id]
    db_xrefs = feature.qualifiers.get("db_xref", [so_code])

    # Append so only if it’s not already present
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)

    # Mandatory qualifier (INSDC requirement)
    ncrna_class = qualifiers.get("ncRNA_class", [None])[0]
    if ncrna_class is None:
        raise ValueError(
            f"ncRNA feature {id} is missing mandatory /ncRNA_class qualifier"
        )

    ncrna_entry = {
        "type": "ncRNA",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "id": id,

        # Join support
        "starts": starts,
        "stops": stops,

        # Mandatory
        "ncRNA_class": ncrna_class,

        # Multi-valued qualifiers
        "db_xrefs": db_xrefs,


    }

# Feature Key           ncRNA

# Definition            a non-protein-coding gene, other than ribosomal RNA and
#                       transfer RNA, the functional molecule of which is the RNA
#                       transcript;

# Mandatory qualifiers  /ncRNA_class="TYPE"

# Optional qualifiers   /allele="text"
#                       /db_xref="<database>:<identifier>"
#                       /experiment="[CATEGORY:]text"
#                       /function="text"
#                       /gene="text"
#                       /gene_synonym="text"
#                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
#                       /locus_tag="text" (single token)
#                       /map="text"
#                       /note="text"
#                       /old_locus_tag="text" (single token)
#                       /operon="text"
#                       /product="text"
#                       /pseudo
#                       /pseudogene="TYPE"
#                       /standard_name="text"
#                       /trans_splicing

# Example               /ncRNA_class="miRNA"
#                       /ncRNA_class="siRNA"
#                       /ncRNA_class="scRNA"       

# Comment               the ncRNA feature is not used for ribosomal and transfer
#                       RNA annotation, for which the rRNA and tRNA feature keys
#                       should be used, respectively;

    multi_valued = {"experiment", "function",  "gene_synonym",  "inference", "note" }
    single_valued = {"allele", "gene", "locus_tag", "map",    "old_locus_tag", "operon", "product", "standard_name", "pseudogene"}

    add_optional_qualifiers(ncrna_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers)
    for flag in ["pseudo", "trans_splicing"]:
        if flag in qualifiers:
            ncrna_entry[flag] = flag in qualifiers

    #  ncRNA           join(189791085..189791793,189798997..189799081,
    #                  189819873..189820364,189821703..189822337)
    #                  /ncRNA_class="lncRNA"
    #                  /gene="Gm30446"
    #                  /product="predicted gene, 30446, transcript variant X6"
    #                  /note="Derived by automated computational analysis using
    #                  gene prediction method: Gnomon. Supporting evidence
    #                  includes similarity to: 100% coverage of the annotated
    #                  genomic feature by RNAseq alignments, including 2 samples
    #                  with support for all annotated introns"
    #                  /transcript_id="XR_001779629.1"
    #                  /db_xref="GeneID:102632350"
    #                  /db_xref="MGI:MGI:5589605"

    return ncrna_entry

`convert_precursor_rna_feature(feature, rec, id)`

Convert a GenBank precursor_RNA feature to a Bakta-style feature.

Source code in src/baktfold/io/eukaryotic_to_json.py

def convert_precursor_rna_feature(feature, rec, id):
    """
    Convert a GenBank precursor_RNA feature to a Bakta-style feature.
    """

    # Extract location
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)         # already inclusive after conversion

    qualifiers = feature.qualifiers

    so_code =  so.SO_PRECURSOR_RNA.id

    # Get existing db_xref list or default to [so.SO_CDS.id]
    db_xrefs = feature.qualifiers.get("db_xref", [so_code])

    # Append only if it’s not already present
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)

    precursor_rna_entry = {
            "type": "precursor_RNA",
            "sequence": rec.id,
            "start": start,
            "stop": stop,
            "strand": strand,
            "db_xrefs": db_xrefs,
            "id": id,
        }


    multi_valued = {"experiment", "function",  "gene_synonym",  "inference", "note"}
    single_valued = {"allele", "gene", "locus_tag", "map",  "operon",  "old_locus_tag", "product", "standard_name"}

    add_optional_qualifiers(precursor_rna_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers)
    for flag in ["trans_splicing"]:
        if flag in qualifiers:
            precursor_rna_entry[flag] = True

#     Feature Key           precursor_RNA


# Definition            any RNA species that is not yet the mature RNA product;
#                       may include ncRNA, rRNA, tRNA, 5' untranslated region
#                       (5'UTR), coding sequences (CDS, exon), intervening
#                       sequences (intron) and 3' untranslated region (3'UTR);

# Optional qualifiers   /allele="text"
#                       /db_xref="<database>:<identifier>"  
#                       /experiment="[CATEGORY:]text"
#                       /function="text"
#                       /gene="text"
#                       /gene_synonym="text"
#                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
#                       /locus_tag="text" (single token)
#                       /map="text"
#                       /note="text"
#                       /old_locus_tag="text" (single token)
#                       /operon="text"
#                       /product="text"
#                       /standard_name="text"
#                       /trans_splicing


    #  precursor_RNA   194719348..194719428
    #                  /gene="Mir29b-2"
    #                  /gene_synonym="mir-29b-2; Mirn29b-2"
    #                  /product="microRNA 29b-2"
    #                  /note="Derived by automated computational analysis using
    #                  gene prediction method: BestRefSeq."
    #                  /transcript_id="NR_029809.1"
    #                  /db_xref="GeneID:723963"
    #                  /db_xref="MGI:MGI:3619047"
    #                  /db_xref="miRBase:MI0000712"

    return precursor_rna_entry

`convert_proprotein_propeptide_feature(feature, rec, id)`

Convert a proprotein or propeptide feature to a Bakta-style feature.

Parameters:

Name	Type	Description	Default
`feature`		Bio.SeqFeature The mRNA feature from the GBK.	required
`rec`		Bio.SeqRecord The record containing the sequence.	required

Returns:

Name	Type	Description
`dict`		Bakta-style proprotein feature.

Source code in src/baktfold/io/eukaryotic_to_json.py

def convert_proprotein_propeptide_feature(feature, rec, id):
    """
    Convert a proprotein or propeptide feature to a Bakta-style feature.

    Parameters:
        feature: Bio.SeqFeature
            The mRNA feature from the GBK.
        rec: Bio.SeqRecord
            The record containing the sequence.

    Returns:
        dict: Bakta-style proprotein feature.
    """

    # seq = str(rec.seq)

    # Extract location
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)         # already inclusive after conversion

    # Handle CompoundLocation (join)
    starts = None
    stops = None

    if feature.location.__class__.__name__ == "CompoundLocation":
        starts = []
        stops = []
        for part in feature.location.parts:
            starts.append(int(part.start) + 1)
            stops.append(int(part.end))


    qualifiers = feature.qualifiers

    so_code =  so.SO_PROPEPTIDE.id

    # Get existing db_xref list or default to [so.SO_CDS.id]
    db_xrefs = feature.qualifiers.get("db_xref", [so_code])

    # Append so.SO_CDS.id only if it’s not already present
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)


    propeptide_entry = {
        "type": "propeptide",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "id": id,

        # Join support
        "starts": starts,
        "stops": stops,

        # Multi-valued
        "db_xrefs": qualifiers.get("db_xref", []),

    }


# Feature Key           propeptide


# Definition            propeptide coding sequence; coding sequence for the domain of a 
#                       proprotein that is cleaved to form the mature protein product.

# Optional qualifiers   /allele="text"
#                       /db_xref="<database>:<identifier>"
#                       /experiment="[CATEGORY:]text"
#                       /function="text"
#                       /gene="text"
#                       /gene_synonym="text"
#                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
#                       /locus_tag="text" (single token)
#                       /map="text"
#                       /note="text"
#                       /old_locus_tag="text" (single token)
#                       /product="text"
#                       /pseudo
#                       /pseudogene="TYPE"
#                       /standard_name="text"


    multi_valued = {"experiment", "function",  "gene_synonym",  "inference", "note"}
    single_valued = {"allele", "gene", "locus_tag", "map",    "old_locus_tag", "product", "standard_name", "pseudogene"}

    add_optional_qualifiers(propeptide_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers)
    for flag in ["pseudo",]:
        if flag in qualifiers:
            propeptide_entry[flag] = True

    #  proprotein      join(171053237..171053367,171053712..171053832)
    #                  /gene="Apoa2"
    #                  /gene_synonym="Alp-2; Apo-AII; Apoa-2; ApoA-II; ApoAII;
    #                  Hdl-1"
    #                  /product="apolipoprotein A-II proprotein"  

    return propeptide_entry

`convert_protein_bind_feature(feature, rec, id)`

Convert a GenBank protein_bind feature to a Bakta-style feature.

Source code in src/baktfold/io/eukaryotic_to_json.py

def convert_protein_bind_feature(feature, rec, id):
    """
    Convert a GenBank protein_bind feature to a Bakta-style feature.
    """

    # Extract location
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)         # already inclusive after conversion

    qualifiers = feature.qualifiers

    # Mandatory qualifier
    bound_moiety = qualifiers.get("bound_moiety", [None])[0]
    if bound_moiety is None:
        raise ValueError(
            f"protein_bind feature {id} is missing mandatory /bound_moiety qualifier"
        )

    so_code =  so.SO_PROTEINBIND.id

    # Get existing db_xref list or default to [so.SO_CDS.id]
    db_xrefs = feature.qualifiers.get("db_xref", [so_code])

    # Append only if it’s not already present
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)

    protein_bind_entry = {
        "type": "protein_bind",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "bound_moiety": bound_moiety,
        "db_xrefs": db_xrefs,
        "id": id,
    }


# Feature Key           protein_bind


# Definition            non-covalent protein binding site on nucleic acid;

# Mandatory qualifiers  /bound_moiety="text"

# Optional qualifiers   /allele="text"
#                       /db_xref="<database>:<identifier>"
#                       /experiment="[CATEGORY:]text"
#                       /function="text"
#                       /gene="text"
#                       /gene_synonym="text"
#                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
#                       /locus_tag="text" (single token)
#                       /map="text"
#                       /note="text"
#                       /old_locus_tag="text" (single token)
#                       /operon="text"
#                       /standard_name="text"

# Comment               note that feature key regulatory with /regulatory_class="ribosome_binding_site"
#                       should be used for ribosome binding sites.


    multi_valued = {"experiment", "function",  "gene_synonym",  "inference", "note"}
    single_valued = {"allele", "gene", "locus_tag", "map",  "operon",  "old_locus_tag", "product", "standard_name"}

    add_optional_qualifiers(protein_bind_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers)
    # for flag in ["trans_splicing"]:
    #     if flag in qualifiers:
    #         protein_bind_entry[flag] = True

    return protein_bind_entry

`convert_regulatory_feature(feature, rec, id)`

Convert a GenBank regulatory feature to a Bakta-style feature.

Source code in src/baktfold/io/eukaryotic_to_json.py

def convert_regulatory_feature(feature, rec, id):
    """
    Convert a GenBank regulatory feature to a Bakta-style feature.
    """

    # Extract location
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)         # already inclusive after conversion

    qualifiers = feature.qualifiers

    # Mandatory qualifier
    regulatory_class = qualifiers.get("regulatory_class", [None])[0]
    if regulatory_class is None:
        raise ValueError(
            f"regulatory feature {id} is missing mandatory /regulatory_class qualifier"
        )

    so_code =  so.SO_REGULATORY_REGION.id

    # Get existing db_xref list or default to [so.SO_CDS.id]
    db_xrefs = feature.qualifiers.get("db_xref", [so_code])

    # Append only if it’s not already present
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)


    regulatory_entry = {
            "type": "regulatory",
            "sequence": rec.id,
            "start": start,
            "stop": stop,
            "strand": strand,
            "regulatory_class": regulatory_class,
            "db_xrefs": db_xrefs,
            "id": id,
        }


# Feature Key           regulatory


# Definition            any region of sequence that functions in the regulation of
#                       transcription, translation, replication, recombination, or chromatin structure;

# Mandatory qualifiers  /regulatory_class="TYPE"

# Optional qualifiers   /allele="text"
#                       /bound_moiety="text"
#                       /db_xref="<database>:<identifier>"
#                       /experiment="[CATEGORY:]text"
#                       /function="text"
#                       /gene="text"
#                       /gene_synonym="text"
#                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
#                       /locus_tag="text" (single token)
#                       /map="text"
#                       /note="text"
#                       /old_locus_tag="text" (single token)
#                       /operon="text"
#                       /phenotype="text"
#                       /pseudo
#                       /pseudogene="TYPE"
#                       /standard_name="text"

# Comment	              This feature has replaced the following Feature Keys on 15-DEC-2014:
#                       enhancer, promoter, CAAT_signal, TATA_signal, -35_signal, -10_signal,
#                       RBS, GC_signal, polyA_signal, attenuator, terminator, misc_signal.

    multi_valued = {"experiment", "function",  "gene_synonym",  "inference", "note"}
    single_valued = {"allele", "bound_moiety", "gene", "locus_tag", "map",  "operon",  "old_locus_tag", "phenotype", "product", "pseudogene", "standard_name"}

    add_optional_qualifiers(regulatory_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers)
    for flag in ["pseudo"]:
        if flag in qualifiers:
            regulatory_entry[flag] = True

    #  regulatory      195030925..195032349
    #                  /regulatory_class="enhancer"
    #                  /experiment="EXISTENCE:reporter gene assay evidence
    #                  [ECO:0000049][PMID:32912294]"
    #                  /note="C2 STARR-seq-only enhancer starr_03508"
    #                  /function="activates a minimal SCP1 promoter by STARR-seq
    #                  in ground-state (2iL) and metastable (SL) mouse embryonic
    #                  stem cells {active_cell/tissue: mESC(E14 +2i+LIF or
    #                  +serum+LIF)}"
    #                  /db_xref="GeneID:131296982"

    return regulatory_entry

`convert_repeat_region_feature(feature, rec, id)`

Convert a Prokka GenBank repeat_region (CRISPR) feature to a simplified Bakta-style feature.

Parameters:

Name	Type	Description	Default
`feature`		Bio.SeqFeature The repeat_region feature (crispr) from the Prokka GBK.	required
`rec`		Bio.SeqRecord The full GenBank record containing the sequence.	required

Returns:

Name	Type	Description
`dict`		Simplified Bakta-style CRISPR feature.

Source code in src/baktfold/io/eukaryotic_to_json.py

def convert_repeat_region_feature(feature, rec, id):
    """
    Convert a Prokka GenBank repeat_region (CRISPR) feature to a simplified Bakta-style feature.

    Parameters:
        feature: Bio.SeqFeature
            The repeat_region feature (crispr) from the Prokka GBK.
        rec: Bio.SeqRecord
            The full GenBank record containing the sequence.

    Returns:
        dict: Simplified Bakta-style CRISPR feature.
    """

    # Coordinates (Bakta uses 1-based)
    strand = "."
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)         # already inclusive after conversion


    qualifiers = feature.qualifiers
    note = qualifiers.get("note", [None])[0]
    rpt_family = qualifiers.get("rpt_family", [None])[0]
    rpt_type = qualifiers.get("rpt_type", [None])[0]
    rpt_unit_seq = qualifiers.get("rpt_unit_seq", [None])[0]

    # always just take the positive strand to get the NT seq (crispr repeat region)
    seq =  str(rec.seq)
    nt_seq = seq[start-1:stop]


# Feature Key           repeat_region


# Definition            region of genome containing repeating units;

# Optional qualifiers   /allele="text"
#                       /db_xref="<database>:<identifier>" 
#                       /experiment="[CATEGORY:]text"
#                       /function="text"
#                       /gene="text"
#                       /gene_synonym="text"
#                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
#                       /locus_tag="text" (single token)
#                       /map="text"
#                       /note="text"
#                       /old_locus_tag="text" (single token)
#                       /rpt_family="text"
#                       /rpt_type=<repeat_type>
#                       /rpt_unit_range=<base_range>
#                       /rpt_unit_seq="text"
#                       /satellite="<satellite_type>[:<class>][ <identifier>]"
#                       /standard_name="text"

    so_code =  so.SO_REPEAT.id

    # Get existing db_xref list or default to [so.SO_CDS.id]
    db_xrefs = feature.qualifiers.get("db_xref", [so_code])

    # Append only if it’s not already present
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)

    # Minimal Bakta-like CRISPR structure
    repeat_region_entry = {
        "type": "repeat_region",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand, # matches Bakta and is required
        "family": rpt_family,       # e.g., "LINE1" - should always be there
        "rpt_type": rpt_type,   
        "repeat_unit": rpt_unit_seq, # the actual consensus repeat if crispr
        "product": note, # won't be the same as Bakta as different lookup method used - but needed for the gff writing
        "nt": nt_seq, # needed for batka .ffn writeout
        "id": id, # bakta_id needed 
        # "locus": None, # no locus tag like Bakta
        "db_xrefs": db_xrefs
    }

    multi_valued = {"experiment", "function",  "gene_synonym",  "inference", "note" }
    single_valued = {"satellite", "gene",  "locus_tag", "map",  "old_locus_tag", "operon", "phenotype", "product", "standard_name"}

    qualifiers = feature.qualifiers

    add_optional_qualifiers(repeat_region_entry, qualifiers, single_valued, multi_valued)


    return repeat_region_entry

`convert_rrna_feature(feature, rec, id)`

Convert a GenBank rRNA feature to a Bakta-style feature.

Source code in src/baktfold/io/eukaryotic_to_json.py

def convert_rrna_feature(feature, rec, id):
    """
    Convert a GenBank rRNA feature to a Bakta-style feature.
    """

    # Extract location
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)         # already inclusive after conversion

    qualifiers = feature.qualifiers

    so_code =  so.SO_RRNA.id

    # Get existing db_xref list or default to [so.SO_CDS.id]
    db_xrefs = feature.qualifiers.get("db_xref", [so_code])

    # Append only if it’s not already present
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)

# Feature Key           rRNA


# Definition            mature ribosomal RNA; RNA component of the
#                       ribonucleoprotein particle (ribosome) which assembles
#                       amino acids into proteins.

# Optional qualifiers   /allele="text"
#                       /db_xref="<database>:<identifier>"
#                       /experiment="[CATEGORY:]text"
#                       /function="text"
#                       /gene="text"
#                       /gene_synonym="text"
#                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
#                       /locus_tag="text" (single token)
#                       /map="text"
#                       /note="text"
#                       /old_locus_tag="text" (single token)
#                       /operon="text"
#                       /product="text"
#                       /pseudo
#                       /pseudogene="TYPE"
#                       /standard_name="text"

# Comment               rRNA sizes should be annotated with the /product
#                       qualifier.  


    rrna_entry = {
            "type": "rRNA",
            "sequence": rec.id,
            "start": start,
            "stop": stop,
            "strand": strand,
            "db_xrefs": db_xrefs,
            "id": id,
        }

    multi_valued = {"experiment", "function",  "gene_synonym",  "inference", "note"}
    single_valued = {"allele", "gene", "locus_tag", "map",  "operon",  "old_locus_tag", "product", "pseudogene", "standard_name"}

    add_optional_qualifiers(rrna_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers)
    for flag in ["pseudo"]:
        if flag in qualifiers:
            rrna_entry[flag] = True


    #  rRNA            46413357..46413475
    #                  /gene="n-R5s211"
    #                  /product="5S ribosomal RNA"
    #                  /inference="COORDINATES: nucleotide
    #                  motif:Rfam:12.0:RF00001"
    #                  /inference="COORDINATES: profile:INFERNAL:1.1.1"
    #                  /note="Derived by automated computational analysis using
    #                  gene prediction method: cmsearch."
    #                  /transcript_id="XR_004936691.1"
    #                  /db_xref="GeneID:115487577"
    #                  /db_xref="RFAM:RF00001"
    #                  /db_xref="MGI:MGI:4422076"

    return rrna_entry

`convert_sig_peptide_feature(feature, rec, id)`

Convert a sig_peptide feature to a Bakta-style feature.

mus musculus chrom 1 NC_000067

Parameters:

Name	Type	Description	Default
`feature`		Bio.SeqFeature The mRNA feature from the GBK.	required
`rec`		Bio.SeqRecord The record containing the sequence.	required

Returns:

Name	Type	Description
`dict`		Bakta-style sig_peptide feature.

Source code in src/baktfold/io/eukaryotic_to_json.py

def convert_sig_peptide_feature(feature, rec, id):
    """
    Convert a sig_peptide feature to a Bakta-style feature.

    mus musculus chrom 1 NC_000067

    Parameters:
        feature: Bio.SeqFeature
            The mRNA feature from the GBK.
        rec: Bio.SeqRecord
            The record containing the sequence.

    Returns:
        dict: Bakta-style sig_peptide feature.
    """

    # Extract location
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)         # already inclusive after conversion

    # Handle CompoundLocation (join)
    starts = None
    stops = None

    if feature.location.__class__.__name__ == "CompoundLocation":
        starts = []
        stops = []
        for part in feature.location.parts:
            starts.append(int(part.start) + 1)
            stops.append(int(part.end))


    qualifiers = feature.qualifiers

    so_code =  so.SO_SIGNAL_PEPTIDE.id

    # Get existing db_xref list or default to [so.SO_CDS.id]
    db_xrefs = feature.qualifiers.get("db_xref", [so_code])

    # Append only if it’s not already present
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)


    sig_peptide_entry = {
        "type": "sig_peptide",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "id": id,

        # Join support
        "starts": starts,
        "stops": stops,

        # Multi-valued
        "db_xrefs": qualifiers.get("db_xref", []),

    }




# Feature Key           sig_peptide


# Definition            signal peptide coding sequence; coding sequence for an
#                       N-terminal domain of a secreted protein; this domain is
#                       involved in attaching nascent polypeptide to the
#                       membrane leader sequence;

# Optional qualifiers   /allele="text"
#                       /db_xref="<database>:<identifier>"
#                       /experiment="[CATEGORY:]text"
#                       /function="text"
#                       /gene="text"
#                       /gene_synonym="text"
#                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
#                       /locus_tag="text" (single token)
#                       /map="text"
#                       /note="text"
#                       /old_locus_tag="text" (single token)
#                       /product="text"
#                       /pseudo
#                       /pseudogene="TYPE"
#                       /standard_name="text"


    multi_valued = {"experiment", "function",  "gene_synonym",  "inference", "note"}
    single_valued = {"allele", "gene", "locus_tag", "map",  "operon",  "old_locus_tag", "phenotype", "product", "pseudogene", "standard_name"}

    add_optional_qualifiers(sig_peptide_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers)
    for flag in ["pseudo"]:
        if flag in qualifiers:
            sig_peptide_entry[flag] = True


    #  sig_peptide     complement(join(194768584..194768588,
    #                  194774407..194774533))
    #                  /gene="Cd46"
    #                  /gene_synonym="Mcp"
    #                  /inference="COORDINATES: ab initio prediction:SignalP:6.0"

    return sig_peptide_entry

`convert_transit_peptide_feature(feature, rec, id)`

Convert a transit_peptide feature to a Bakta-style feature.

mus musculus chrom 1 NC_000067

Parameters:

Name	Type	Description	Default
`feature`		Bio.SeqFeature The mRNA feature from the GBK.	required
`rec`		Bio.SeqRecord The record containing the sequence.	required

Returns:

Name	Type	Description
`dict`		Bakta-style transit_peptide feature.

Source code in src/baktfold/io/eukaryotic_to_json.py

def convert_transit_peptide_feature(feature, rec, id):
    """
    Convert a transit_peptide feature to a Bakta-style feature.

    mus musculus chrom 1 NC_000067

    Parameters:
        feature: Bio.SeqFeature
            The mRNA feature from the GBK.
        rec: Bio.SeqRecord
            The record containing the sequence.

    Returns:
        dict: Bakta-style transit_peptide feature.
    """

    # seq = str(rec.seq)

    # Extract location
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)         # already inclusive after conversion

    # Handle CompoundLocation (join)
    starts = None
    stops = None

    if feature.location.__class__.__name__ == "CompoundLocation":
        starts = []
        stops = []
        for part in feature.location.parts:
            starts.append(int(part.start) + 1)
            stops.append(int(part.end))


    qualifiers = feature.qualifiers

    so_code =  so.SO_TRANSIT_PEPTIDE.id

    # Get existing db_xref list or default to [so.SO_CDS.id]
    db_xrefs = feature.qualifiers.get("db_xref", [so_code])

    # Append only if it’s not already present
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)


    transit_peptide_entry = {
            "type": "transit_peptide",
            "sequence": rec.id,
            "start": start,
            "stop": stop,
            "strand": strand,
            "id": id,

            # Join support
            "starts": starts,
            "stops": stops,

            # Multi-valued
            "db_xrefs": qualifiers.get("db_xref", []),

        }



# Feature Key           transit_peptide


# Definition            transit peptide coding sequence; coding sequence for an
#                       N-terminal domain of a nuclear-encoded organellar
#                       protein; this domain is involved in post-translational
#                       import of the protein into the organelle;

# Optional qualifiers   /allele="text"
#                       /db_xref="<database>:<identifier>"
#                       /experiment="[CATEGORY:]text"
#                       /function="text"
#                       /gene="text"
#                       /gene_synonym="text"
#                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
#                       /locus_tag="text" (single token)
#                       /map="text"
#                       /note="text"
#                       /old_locus_tag="text" (single token)
#                       /product="text"
#                       /pseudo
#                       /pseudogene="TYPE"
#                       /standard_name="text"

    multi_valued = {"experiment", "function",  "gene_synonym",  "inference", "note"}
    single_valued = {"allele", "gene", "locus_tag", "map",  "operon",  "old_locus_tag", "phenotype", "product", "pseudogene", "standard_name"}

    add_optional_qualifiers(transit_peptide_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers)
    for flag in ["pseudo"]:
        if flag in qualifiers:
            transit_peptide_entry[flag] = True

    #  transit_peptide complement(join(180006550..180006849,
    #                  180009627..180009803))
    #                  /gene="Coq8a"
    #                  /gene_synonym="4632432J16Rik; Adck3; Cabc1; mKIAA0451"
    #                  /note="Mitochondrion.
    #                  /evidence=ECO:0000250|UniProtKB:Q8NI60; propagated from
    #                  UniProtKB/Swiss-Prot (Q60936.2)"

    return transit_peptide_entry

`convert_trna_feature(feature, seq_record, id)`

Convert a funannotate tRNA SeqFeature to a Bakta tRNA JSON entry.

Source code in src/baktfold/io/eukaryotic_to_json.py

def convert_trna_feature(feature, seq_record, id):
    """
    Convert a funannotate tRNA SeqFeature to a Bakta tRNA JSON entry.
    """

    # ------------ Location ------------

    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)         # already inclusive after conversion

    # Handle CompoundLocation (join)
    starts = None
    stops = None

    if feature.location.__class__.__name__ == "CompoundLocation":
        starts = []
        stops = []
        for part in feature.location.parts:
            starts.append(int(part.start) + 1)
            stops.append(int(part.end))



    # ------------ Extract nt sequence ------------
    nt_seq = feature.extract(seq_record.seq)
    nt = str(nt_seq)

    # ------------ Basic qualifiers ------------
    product = feature.qualifiers.get("product", [None])[0]

    qualifiers = feature.qualifiers

    # fall back to start_stop_strand if there is no locus tag
    if 'locus_tag' in qualifiers and qualifiers['locus_tag']:
        locus_tag = qualifiers['locus_tag'][0]
    else:
        logger.warning(f"No locus_tag found for feature {id}")
        locus_tag = f"{GENOME_RANDOM_BACKUP_LOCUSTAG_STR}_{start}_{stop}"
        logger.warning(f"Generating a locus_tag: {locus_tag}")

    # ------------ amino acid ------------
    # Prokka product examples:
    #   "tRNA-Trp"
    #   "tRNA-Leu"
    amino_acid = None
    if product and product.startswith("tRNA-"):
        amino_acid = product.split("-")[1]


    # ------------ anticodon ------------
    anti_codon = None

    # anticodons are in notes

    notes = feature.qualifiers.get("note", [])

    # Expect a note like: "tRNA-Ser(gga)"
    for note in notes:
        # Remove spaces for safety
        n = note.replace(" ", "")

        # Extract part inside parentheses (anticodon)
        if "(" in n and ")" in n:
            anti_codon = n.split("(")[1].split(")")[0].lower()

        # Extract amino acid:
        # tRNA-Ser(gga) → "Ser"
        if "tRNA-" in n:
            try:
                # tRNA-Ser(gga) → "Ser(gga)" → split('(')[0] → "Ser"
                aa_section = n.split("tRNA-")[1]
                aa_clean = aa_section.split("(")[0]
                amino_acid = aa_clean
            except Exception:
                pass

    # ------------ Anti-codon position detection ------------
    # Prokka doesnt have it - dont include
    # anti_codon_pos = None

    # ------------ score ------------
    # nothing in prokka
    score = None

    # ------------ db_xrefs ------------
    # doesnt exist for prokka
    db_xrefs = feature.qualifiers.get("db_xref", [])
    # add so_term
    so_term = AMINO_ACID_DICT.get(amino_acid.lower(), ('', None))[1]

    if (so_term):
        db_xrefs.append(so_term.id)

    # ------------ final Bakta-form dict ------------
    bakta_trna_entry = {
        "type": "tRNA",
        "sequence": seq_record.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "gene": "trn" + (amino_acid[0].lower() if amino_acid else "?"),
        "product": product,
        "amino_acid": amino_acid,
        "anti_codon": anti_codon,
        "score": score,
        "nt": nt,
        "db_xrefs": db_xrefs,
       #  "anti_codon_pos": anti_codon_pos,  dont include, not in output
        "locus": locus_tag,
        "id": id,
    }

# Feature Key           tRNA


# Definition            mature transfer RNA, a small RNA molecule (75-85 bases
#                       long) that mediates the translation of a nucleic acid
#                       sequence into an amino acid sequence;

# Optional qualifiers   /allele="text"
#                       /anticodon=(pos:<location>,aa:<amino_acid>,seq:<text>)
#                       /circular_RNA
#                       /db_xref="<database>:<identifier>"
#                       /experiment="[CATEGORY:]text"
#                       /function="text"
#                       /gene="text"
#                       /gene_synonym="text"
#                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
#                       /locus_tag="text" (single token)
#                       /map="text"
#                       /note="text"
#                       /old_locus_tag="text" (single token)
#                       /operon="text"
#                       /product="text"
#                       /pseudo
#                       /pseudogene="TYPE"
#                       /standard_name="text"
#                       /trans_splicing

    multi_valued = {"experiment", "function",  "gene_synonym",  "inference", "note" }
    single_valued = {"allele",  "map",    "old_locus_tag", "standard_name"}

    qualifiers = feature.qualifiers

    add_optional_qualifiers(bakta_trna_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers)
    for flag in ["circular_RNA", "pseudo", "trans_splicing"]:
        if flag in qualifiers:
            bakta_trna_entry[flag] = flag in qualifiers

    return bakta_trna_entry

`convert_utr_region_feature(feature, rec, id, three)`

Convert a UTR GenBank feature to a simplified Bakta-style feature.

Parameters:

Name	Type	Description	Default
`feature`		Bio.SeqFeature The UTR feature from the GBK.	required
`rec`		Bio.SeqRecord The full GenBank record containing the sequence.	required

Returns:

Name	Type	Description
`dict`		Simplified Bakta-style feature.

Source code in src/baktfold/io/eukaryotic_to_json.py

def convert_utr_region_feature(feature, rec, id, three):
    """
    Convert a UTR GenBank feature to a simplified Bakta-style feature.

    Parameters:
        feature: Bio.SeqFeature
            The UTR feature from the GBK.
        rec: Bio.SeqRecord
            The full GenBank record containing the sequence.

    Returns:
        dict: Simplified Bakta-style feature.
    """

    if three:
        type = "3'UTR"
        so_code =  so.SO_3UTR.id
    else:
        type = "5'UTR"
        so_code =  so.SO_5UTR.id

    # Extract location
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)         # already inclusive after conversion


    qualifiers = feature.qualifiers
    note = qualifiers.get("note", [None])[0]


    # fall back to start_stop_strand if there is no locus tag
    if 'locus_tag' in qualifiers and qualifiers['locus_tag']:
        locus_tag = qualifiers['locus_tag'][0]
    else:
        logger.warning(f"No locus_tag found for feature {id}")
        locus_tag = f"{GENOME_RANDOM_BACKUP_LOCUSTAG_STR}_{start}_{stop}"
        logger.warning(f"Generating a locus_tag: {locus_tag}")

    # always just take the positive strand to get the NT seq (UTR region)
    seq =  str(rec.seq)
    nt_seq = seq[start-1:stop]


# Feature Key           3'UTR


# Definition            1) region at the 3' end of a mature transcript (following 
#                       the stop codon) that is not translated into a protein;
#                       2) region at the 3' end of an RNA virus (following the last stop
#                       codon) that is not translated into a protein;

# Optional qualifiers   /allele="text"
#                       /db_xref="<database>:<identifier>"
#                       /experiment="[CATEGORY:]text"
#                       /function="text"
#                       /gene="text"
#                       /gene_synonym="text"
#                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
#                       /locus_tag="text" (single token)
#                       /map="text"
#                       /note="text"
#                       /old_locus_tag="text" (single token)
#                       /standard_name="text"
#                       /trans_splicing



# Feature Key           5'UTR


# Definition            1) region at the 5' end of a mature transcript (preceding 
#                       the initiation codon) that is not translated into a protein;
#                       2) region at the 5' end of an RNA virus genome (preceding the first 
#                       initiation codon) that is not translated into a protein;

# Optional qualifiers   /allele="text"
#                       /db_xref="<database>:<identifier>"
#                       /experiment="[CATEGORY:]text"
#                       /function="text"
#                       /gene="text"
#                       /gene_synonym="text"
#                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
#                       /locus_tag="text" (single token)
#                       /map="text"
#                       /note="text"
#                       /old_locus_tag="text" (single token)
#                       /standard_name="text"
#                       /trans_splicing

    so_code =  so.SO_REPEAT.id

    # Get existing db_xref list or default to [so.SO_CDS.id]
    db_xrefs = feature.qualifiers.get("db_xref", [so_code])

    # Append only if it’s not already present
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)


    # Minimal Bakta-like structure
    utr_entry = {
        "type": type,
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand, # matches Bakta and is required
        "product": note, 
        "nt": nt_seq, # needed for batka .ffn writeout
        "id": id, # bakta_id needed 
        "db_xrefs": db_xrefs,
        "locus": locus_tag
    }

    multi_valued = {"experiment", "function",  "gene_synonym",  "inference", "note" }
    single_valued = {"allele", "gene",   "map",  "old_locus_tag", "operon", "phenotype", "standard_name"}

    qualifiers = feature.qualifiers

    add_optional_qualifiers(utr_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers)
    for flag in ["pseudo", "trans_splicing"]:
        if flag in qualifiers:
            utr_entry[flag] = flag in qualifiers

    return utr_entry

`get_bakta_style_id_from_locus_tag(records)`

Gets 10 char bakta-style ID tag based off the 8 char locus tag in first CDS on the first record + 2 random chars

Assumes all records will have the same locus tag prefix

Will always add 2 chars to make ID unique vs locus tag

Source code in src/baktfold/io/eukaryotic_to_json.py

def get_bakta_style_id_from_locus_tag(records):
    """
    Gets 10 char bakta-style ID tag based off the 8 char locus tag in first CDS on the first  record + 2 random chars

    Assumes all records will have the same locus tag prefix

    Will always add 2 chars to make ID unique vs locus tag
    """

    if not records:
        raise ValueError("No GenBank records found.")

    for record in records:

        for feat in record.features:
            if feat.type == "CDS":
                locus_tag_list = feat.qualifiers.get("locus_tag") # returns None if doesn't exist

                if locus_tag_list:
                    locus_tag = locus_tag_list[0]

                    if len(locus_tag) > 7:

                        locus_tag_prefix = locus_tag[:-7] # trims off _000001 from CDS

                        rand_two_chars = random_n_letter_id(2)

                        # by default  locus tag is 8 chars. So this returns a 10 char string (same as bakta defaults)

                        id_tag = f"{locus_tag_prefix}{rand_two_chars}"

                        return id_tag


                    else:
                        return random_n_letter_id(10)

                # fallback if locus_tag missing or too short
                return random_n_letter_id(10)

    # No CDS feature found at all (shouldn't happen)
    return random_n_letter_id(10)

`random_n_letter_id(n=4)`

generates a n letter id prefix

n=2 to append to locus tag for bakta id to make it different n=10 if the locus tag is somehow missing (should never happen)

Source code in src/baktfold/io/eukaryotic_to_json.py

def random_n_letter_id(n=4):
    """
    generates a n letter id prefix 

    n=2 to append to   locus tag  for bakta id to make it different
    n=10 if the locus tag is somehow missing (should never happen) 
    """
    return ''.join(random.choices(string.ascii_uppercase, k=n))

`write_bakta_outputs(data, features, features_by_sequence, output, prefix, custom_db, euk, has_duplicate_locus, fast, translation_table, prokka, other_genbank, cds_program, trna_program, rrna_program, tmrna_program, ncrna_program, bakta_version)`

Writes the bakta outputs to a given path.

Parameters:

Name	Type	Description	Default
`data`	`dict`	The dictionary containing the bakta outputs.	required
`features`	`Sequence[dict]`	The sequence of dictionaries containing the features.	required
`features_by_sequence`	`Sequence[dict]`	The sequence of dictionaries containing the features by sequence.	required
`output`	`Path`	The path to save the bakta outputs to.	required
`prefix`	`str`	The prefix to use for the bakta outputs.	required
`custom_db`	`bool`	A boolean indicating whether a custom database is used.	required
`euk`	`bool`	A boolean indicating whether the sequences are eukaryotic.	required
`has_duplicate_locus`	`bool`	A boolean indicating whether there are duplicate loci.	required
`fast`	`bool`	If True, skips AFDB step	required
`translation_table`	`str`	Translation table inferred from input JSON	required
`prokka`	`bool`	boolean indicating if prokka was used to do initial annotation	required
`other_genbank`	`bool`	boolean indicating if other genbank (prokaryotic, genbank_to) was used to do initial annotation	required
`bakta_version`	`dict`	Dictionary of Bakta (or whatever other program) was used for the initial annotation	required

Returns:

Type	Description
	None.

Examples:

>>> write_bakta_outputs(data, features, features_by_sequence, output, prefix, custom_db, euk, has_duplicate_locus, fast, translation_table, prokka, other_genbank, bakta_version)

Source code in src/baktfold/io/io.py

def write_bakta_outputs(data: dict, features: Sequence[dict], features_by_sequence: Sequence[dict] , 
                        output: Path, prefix: str, custom_db: bool, euk: bool, has_duplicate_locus: bool,
                        fast: bool, translation_table: int, prokka: bool, other_genbank: bool,
                        cds_program: str ,trna_program: str, rrna_program: str, tmrna_program: str, ncrna_program: str, bakta_version: dict):
    """
    Writes the bakta outputs to a given path.

    Args:
      data (dict): The dictionary containing the bakta outputs.
      features (Sequence[dict]): The sequence of dictionaries containing the features.
      features_by_sequence (Sequence[dict]): The sequence of dictionaries containing the features by sequence.
      output (Path): The path to save the bakta outputs to.
      prefix (str): The prefix to use for the bakta outputs.
      custom_db (bool): A boolean indicating whether a custom database is used.
      euk (bool): A boolean indicating whether the sequences are eukaryotic.
      has_duplicate_locus (bool): A boolean indicating whether there are duplicate loci.
      fast (bool): If True, skips AFDB step
      translation_table (str): Translation table inferred from input JSON
      prokka (bool): boolean indicating if prokka was used to do initial annotation
      other_genbank (bool): boolean indicating if other genbank (prokaryotic, genbank_to) was used to do initial annotation
      bakta_version (dict): Dictionary of Bakta (or whatever other program) was used for the initial annotation

    Returns:
      None.

    Examples:
      >>> write_bakta_outputs(data, features, features_by_sequence, output, prefix, custom_db, euk, has_duplicate_locus, fast, translation_table, prokka, other_genbank, bakta_version)
    """

    #logger.info(f'selected features={len(features)}')

    logger.info('writing human readable TSV...')
    tsv_path: Path = Path(output) / f"{prefix}.tsv"
    tsv.write_features(data['sequences'], features_by_sequence, tsv_path)

    logger.info('writing GFF3...')
    gff3_path: Path = Path(output) / f"{prefix}.gff3"
    # fix later prokka
    gff.write_features(data, features_by_sequence, gff3_path, prokka, euk, other_genbank, cds_program, trna_program, tmrna_program, rrna_program, ncrna_program)

    logger.info('writing INSDC GenBank & EMBL...')
    genbank_path: Path = Path(output) / f"{prefix}.gbff"
    embl_path: Path = Path(output) / f"{prefix}.embl"
    # arg order must match write_features(...): translation_table BEFORE other_genbank.
    insdc.write_features(data, features, genbank_path, embl_path, prokka, euk, translation_table, other_genbank, cds_program, trna_program, tmrna_program, rrna_program, ncrna_program)

    logger.info('writing genome sequences...')
    fna_path: Path = Path(output) / f"{prefix}.fna"
    fasta.export_sequences(data['sequences'], fna_path, description=True, wrap=True)

    logger.info('writing feature nucleotide sequences...')
    ffn_path: Path = Path(output) / f"{prefix}.ffn"
    fasta.write_ffn(features, ffn_path)

    logger.info('writing translated CDS sequences...')
    faa_path: Path = Path(output) / f"{prefix}.faa"
    fasta.write_faa(features, faa_path)

    # inference here is the different databases?
    annotations_path: Path = Path(output) / f"{prefix}.inference.tsv"
    if custom_db:
        header_columns = ['Locus', 'Length', 'Product', 'Swissprot', 'AFDBClusters', 'PDB', 'CATH', 'Custom_DB']
        if has_duplicate_locus:
            header_columns = ['Locus', 'ID', 'Product', 'Swissprot', 'AFDBClusters', 'PDB', 'CATH', 'Custom_DB']
    else:
        header_columns = ['Locus', 'Length', 'Product', 'Swissprot', 'AFDBClusters', 'PDB', 'CATH']
        if has_duplicate_locus:
            header_columns = ['Locus', 'ID', 'Product', 'Swissprot', 'AFDBClusters', 'PDB', 'CATH']

    # Remove 'AFDBClusters' if fast is True
    if fast:
        header_columns = [col for col in header_columns if col != 'AFDBClusters']

    # annotation confidence column goes right after Product
    prod_idx = header_columns.index('Product')
    header_columns = header_columns[:prod_idx + 1] + ['Annotation_Confidence'] + header_columns[prod_idx + 1:]

    # flatten all features across sequences
    all_features = [
        feat
        for features in features_by_sequence.values()
        for feat in features
    ]

    logger.info(f'Exporting annotations (TSV) to: {annotations_path}')

    selected_features = []


    for seq_id, features in features_by_sequence.items():
        for feat in features:
            # get() ensures we don't crash if the key doesn't exist
            if 'hypothetical' in feat or 'baktfold' in feat:
                selected_features.append(feat)

    # structure input carries Foldseek TM-score/LDDT on its hits -> extra columns
    structures = any('tmscore' in feat for feat in selected_features)
    if structures:
        header_columns = header_columns + ['TMscore', 'LDDT']

    tsv.write_protein_features(selected_features, header_columns, annotations_path, custom_db, has_duplicate_locus, fast=fast, structures=structures)

    # write summary file

    write_summary_txt_file(output, prefix, all_features)

    logger.info('write machine readable JSON...')
    json_path: Path = Path(output) / f"{prefix}.json"
    # Provenance block: lets ``baktfold json`` reconstitute these outputs later
    # without re-supplying runtime flags. See io/json.py:write_json.
    baktfold_run = {
        'mode': 'genome',
        'euk': euk,
        'custom_db': custom_db,
        'fast': fast,
        'has_duplicate_locus': has_duplicate_locus,
        'translation_table': translation_table,
        'prokka': prokka,
        'other_genbank': other_genbank,
        'cds_program': cds_program,
        'trna_program': trna_program,
        'rrna_program': rrna_program,
        'tmrna_program': tmrna_program,
        'ncrna_program': ncrna_program,
    }
    json.write_json(data, features, json_path, bakta_version, baktfold_run)

`write_bakta_proteins_outputs(aas, output, prefix, custom_db, fast, bakta_version)`

Writes the bakta protein outputs to a given path.

Parameters:

Name	Type	Description	Default
`aas`	`Sequence[dict]`	The sequence of dictionaries containing the amino acids.	required
`output`	`Path`	The path to save the bakta protein outputs to.	required
`prefix`	`str`	The prefix to use for the bakta protein outputs.	required
`custom_db`	`bool`	A boolean indicating whether a custom database is used.	required
`fast`	`bool`	If True, skips AFDB step	required
`bakta_version`	`dict`	Original Bakta version	required

Returns:

Type	Description
	None.

Examples:

>>> write_bakta_proteins_outputs(aas, output, prefix, custom_db)

Source code in src/baktfold/io/io.py

def write_bakta_proteins_outputs(aas: Sequence[dict], output: Path, prefix: str, custom_db: bool, fast: bool, bakta_version: dict):
    """
    Writes the bakta protein outputs to a given path.

    Args:
      aas (Sequence[dict]): The sequence of dictionaries containing the amino acids.
      output (Path): The path to save the bakta protein outputs to.
      prefix (str): The prefix to use for the bakta protein outputs.
      custom_db (bool): A boolean indicating whether a custom database is used.
      fast (bool): If True, skips AFDB step
      bakta_version (dict): Original Bakta version

    Returns:
      None.

    Examples:
      >>> write_bakta_proteins_outputs(aas, output, prefix, custom_db)
    """

    # remove fields that were mocked to avoid baktfold crashing but not in the bakta protein JSON outputs
    fields_to_remove = ['sequence', 'start', 'stop', 'strand', 'frame']

    for aa in aas:
        for f in fields_to_remove:
            aa.pop(f, None)

    annotations_path: Path = Path(output) / f"{prefix}.tsv"
    if custom_db:
        header_columns = ['ID', 'Length', 'Product', 'Swissprot', 'AFDBClusters', 'PDB', 'CATH', 'Custom_DB']
    else:
        header_columns = ['ID', 'Length', 'Product', 'Swissprot', 'AFDBClusters', 'PDB', 'CATH']

    if fast:
        header_columns = [col for col in header_columns if col != 'AFDBClusters']

    # annotation confidence column goes right after Product
    prod_idx = header_columns.index('Product')
    header_columns = header_columns[:prod_idx + 1] + ['Annotation_Confidence'] + header_columns[prod_idx + 1:]

    # structure input carries Foldseek TM-score/LDDT on its hits -> extra columns
    structures = any('tmscore' in aa for aa in aas)
    if structures:
        header_columns = header_columns + ['TMscore', 'LDDT']

    logger.info(f'Exporting annotations (TSV) to: {annotations_path}')
    tsv.write_protein_features(aas, header_columns, annotations_path, custom_db, has_duplicate_locus=False, fast=fast, structures=structures)


    # do i combine the tophits tsvs, sort by column, add a column for db and put out as one tsv

    full_annotations_path: Path = Path(output) / f"{prefix}.json"
    logger.info(f'Full annotations (JSON): {full_annotations_path}')
    # Provenance block for ``baktfold json`` (proteins mode). See io/json.py.
    baktfold_run = {
        'mode': 'proteins',
        'custom_db': custom_db,
        'fast': fast,
    }
    json.write_json({'features': aas}, aas, full_annotations_path, bakta_version, baktfold_run)


    #### don't write hyps I think as tsv

    # hypotheticals_path = output_path.joinpath(f'{cfg.prefix}.hypotheticals.tsv')
    # header_columns = ['ID', 'Length', 'Mol Weight [kDa]', 'Iso El. Point', 'Pfam hits']
    # hypotheticals = hypotheticals = [aa for aa in aas if 'hypothetical' in aa]
    # print(f'\tinformation on hypotheticals (TSV): {hypotheticals_path}')
    # tsv.write_protein_features(hypotheticals, header_columns, map_hypothetical_columns, hypotheticals_path)

    aa_output_path: Path = Path(output) / f"{prefix}.faa"
    logger.info(f'Annotated sequences (Fasta): {aa_output_path}')
    fasta.write_faa(aas, aa_output_path)

    write_summary_txt_file(output, prefix, aas)

`write_foldseek_tophit(tophit_df, pdb_tophit_path)`

Writes the foldseek tophits to a given path.

Parameters:

Name	Type	Description	Default
`tophit_df`	`pl.DataFrame`	The dataframe containing the foldseek tophits.	required
`pdb_tophit_path`	`Path`	The path to save the foldseek tophits to.	required

Returns:

Type	Description
	None.

Examples:

>>> write_foldseek_tophit(tophit_df, pdb_tophit_path)

Source code in src/baktfold/io/io.py

def write_foldseek_tophit(tophit_df: pl.DataFrame, pdb_tophit_path: Path):
    """
    Writes the foldseek tophits to a given path.

    Args:
      tophit_df (pl.DataFrame): The dataframe containing the foldseek tophits.
      pdb_tophit_path (Path): The path to save the foldseek tophits to.

    Returns:
      None.

    Examples:
      >>> write_foldseek_tophit(tophit_df, pdb_tophit_path)
    """
    logger.info(f"Saving foldseek tophits to {pdb_tophit_path}")
    tophit_df.write_csv(pdb_tophit_path, separator="\t")

`map_aa_columns(feat, custom_db, has_duplicate_locus, fast, structures=False)`

Maps amino acid columns.

Parameters:

Name	Type	Description	Default
`feat`	`dict`	The dictionary containing the features.	required
`custom_db`	`bool`	A boolean indicating whether a custom database is used.	required
`has_duplicate_locus`	`bool`	A boolean indicating whether there are duplicate loci.	required
`fast`	`bool`	A boolean indicating whether AFDBclusters Foldseek search should be skipped	required

Returns:

Type	Description
`Sequence[str]`	Sequence[str]: A sequence of strings containing the mapped amino acid columns.

Examples:

>>> map_aa_columns({'locus': 'ABC', 'length': 100, 'product': 'protein'}, False, False)
['ABC', '100', 'protein', '', '', '', '']

Source code in src/baktfold/io/tsv.py

def map_aa_columns(feat: dict, custom_db: bool, has_duplicate_locus: bool, fast: bool, structures: bool = False) -> Sequence[str]:
    """
    Maps amino acid columns.

    Args:
      feat (dict): The dictionary containing the features.
      custom_db (bool): A boolean indicating whether a custom database is used.
      has_duplicate_locus (bool): A boolean indicating whether there are duplicate loci.
      fast (bool): A boolean indicating whether AFDBclusters Foldseek search should be skipped

    Returns:
      Sequence[str]: A sequence of strings containing the mapped amino acid columns.

    Examples:
      >>> map_aa_columns({'locus': 'ABC', 'length': 100, 'product': 'protein'}, False, False)
      ['ABC', '100', 'protein', '', '', '', '']
    """
    # Ensure length exists
    if 'length' not in feat:
        feat['length'] = int(len(feat['nt']) / 3)

    xrefs = feat.get('db_xrefs', [])

    # Extract dbxref groups once
    def join_filtered(prefix: str, replacement: str = None):
        """
    Joins filtered database cross-references.

    Args:
      prefix (str): The prefix to filter by.
      replacement (str): The string to replace the prefix with. Defaults to None.

    Returns:
      str: The joined filtered database cross-references.

    Examples:
      >>> join_filtered('swissprot', 'afdb_v6:')
      'afdb_v6:'
    """
        if replacement is None:
            replacement = prefix
        return ','.join(
            db.replace(replacement, '') for db in xrefs
            if prefix in db
        )

    swissprot   = join_filtered('swissprot', 'afdb_v6:')
    afdbclust   = join_filtered('afdbclusters_', 'afdb_v6:')
    pdb         = join_filtered('pdb:')
    cath        = join_filtered('cath:')
    custom_refs = join_filtered('custom:', 'custom:custom_')

    # Build the output row
    row = [feat['locus']]

    # add id if multiple CDS per Locus in that record (euks)
    if has_duplicate_locus:
        row.append(feat['id'])

    row.extend([
        str(feat['length']),
        feat['product'],
        feat.get('annotation_confidence', ''),
        swissprot,
    ])

    # Only add AFDBClusters if not in fast mode
    if not fast:
        row.append(afdbclust)

    # Always add these
    row.extend([
        pdb,
        cath,
    ])

    if custom_db:
        row.append(custom_refs)

    # structure input also reports Foldseek TM-score and LDDT
    if structures:
        row.extend([
            '' if feat.get('tmscore') is None else str(feat['tmscore']),
            '' if feat.get('lddt') is None else str(feat['lddt']),
        ])

    return row

`write_feature_inferences(sequences, features_by_sequence, tsv_path)`

Export feature inference statistics in TSV format.

Source code in src/baktfold/io/tsv.py

def write_feature_inferences(sequences: Sequence[dict], features_by_sequence: Dict[str, dict], tsv_path: Path):
    """Export feature inference statistics in TSV format."""
    logger.info('write tsv: path=%s', tsv_path)

    with tsv_path.open('wt') as fh:
        fh.write('# Annotated with Baktfold\n')
        fh.write(f'# Software: v{cfg.version}\n')
        fh.write(f"# Database: v{cfg.version}\n") # fix later
        #fh.write(f"# Database: v{cfg.db_info['major']}.{cfg.db_info['minor']}, {cfg.db_info['type']}\n")
        fh.write(f'# DOI: {bc.BAKTFOLD_DOI}\n')
        fh.write(f'# URL: {bc.BAKTFOLD_URL}\n')
        fh.write('#Sequence Id\tType\tStart\tStop\tStrand\tLocus Tag\tScore\tEvalue\tQuery Cov\tSubject Cov\tId\tAccession\n')

        for seq in sequences:
            for feat in features_by_sequence[seq['id']]:
                if(feat['type'] in [bc.FEATURE_CDS, bc.FEATURE_SORF]):
                    score, evalue, query_cov, subject_cov, identity, accession = None, None, None, None, None, '-'
                    if('ups' in feat or 'ips' in feat):
                        query_cov = 1
                        subject_cov = 1
                        identity = 1
                        evalue = 0
                        accession = f"{bc.DB_XREF_UNIREF}:{feat['ips'][DB_IPS_COL_UNIREF100]}" if 'ips' in feat else f"{bc.DB_XREF_UNIPARC}:{feat['ups'][DB_UPS_COL_UNIPARC]}"
                    elif('psc' in feat or 'pscc' in feat):
                        psc_type = 'psc' if 'psc' in feat else 'pscc'
                        query_cov = feat[psc_type]['query_cov']
                        subject_cov = feat[psc_type].get('subject_cov', -1)
                        identity = feat[psc_type]['identity']
                        score = feat[psc_type].get('score', -1)
                        evalue = feat[psc_type].get('evalue', -1)
                        accession = f"{bc.DB_XREF_UNIREF}:{feat['psc'][DB_PSC_COL_UNIREF90]}" if 'psc' in feat else f"{bc.DB_XREF_UNIREF}:{feat['pscc'][DB_PSCC_COL_UNIREF50]}"
                    fh.write('\t'.join(
                        [
                            feat['sequence'] if 'sequence' in feat else feat['contig'],  # <1.10.0 compatibility
                            feat['type'],
                            str(feat['start']),
                            str(feat['stop']),
                            feat['strand'],
                            feat['locus'],
                            f"{score:0.1f}" if score != None else '-',
                            ('0.0' if evalue == 0 else f"{evalue:1.1e}") if evalue != None else '-',
                            ('1.0' if query_cov == 1 else f"{query_cov:0.3f}") if query_cov != None else '-',
                            ('1.0' if subject_cov == 1 else f"{subject_cov:0.3f}") if subject_cov != None else '-',
                            ('1.0' if identity == 1 else f"{identity:0.3f}") if identity != None else '-',
                            accession
                        ])
                    )
                    fh.write('\n')
                elif(feat['type'] in [bc.FEATURE_T_RNA, bc.FEATURE_R_RNA, bc.FEATURE_NC_RNA, bc.FEATURE_NC_RNA_REGION]):
                    accession = '-' if feat['type'] == bc.FEATURE_T_RNA else [xref for xref in feat['db_xrefs'] if bc.DB_XREF_RFAM in xref][0]
                    fh.write('\t'.join(
                        [
                            feat['sequence'] if 'sequence' in feat else feat['contig'],  # <1.10.0 compatibility
                            feat['type'],
                            str(feat['start']),
                            str(feat['stop']),
                            feat['strand'],
                            feat['locus'] if 'locus' in feat else '-',
                            f"{feat['score']:0.1f}",
                            ('0.0' if feat['evalue'] == 0 else f"{feat['evalue']:1.1e}") if 'evalue' in feat else '-',
                            ('1.0' if feat['query_cov'] == 1 else f"{feat['query_cov']:0.3f}") if 'query_cov' in feat else '-',
                            ('1.0' if feat['subject_cov'] == 1 else f"{feat['subject_cov']:0.3f}") if 'subject_cov' in feat else '-',
                            ('1.0' if feat['identity'] == 1 else f"{feat['identity']:0.3f}") if 'identity' in feat else '-',
                            accession
                        ])
                    )
                    fh.write('\n')
    return

`write_features(sequences, features_by_sequence, tsv_path)`

Export features in TSV format.

Source code in src/baktfold/io/tsv.py

def write_features(sequences: Sequence[dict], features_by_sequence: Dict[str, dict], tsv_path: Path):
    """Export features in TSV format."""
    logger.info(f'write feature tsv: path={tsv_path}')

    with tsv_path.open('wt') as fh:
        fh.write('# Annotated with Baktfold\n')
        fh.write(f'# Software: v{cfg.version}\n')
        fh.write(f"# Database: v{cfg.version}\n") # fix later
        #fh.write(f"# Database: v{cfg.db_info['major']}.{cfg.db_info['minor']}, {cfg.db_info['type']}\n")
        fh.write(f'# DOI: {bc.BAKTFOLD_DOI}\n')
        fh.write(f'# URL: {bc.BAKTFOLD_URL}\n')
        fh.write('#Sequence Id\tType\tStart\tStop\tStrand\tLocus Tag\tGene\tProduct\tDbXrefs\n')

        for seq in sequences:
            for feat in features_by_sequence[seq['id']]:
                seq_id = feat['sequence'] if 'sequence' in feat else feat['contig']  # <1.10.0 compatibility
                feat_type = feat['type']
                if(feat_type == bc.FEATURE_GAP):
                    feat_type = bc.INSDC_FEATURE_ASSEMBLY_GAP if feat['length'] >= 100 else bc.INSDC_FEATURE_GAP

                gene = feat['gene'] if feat.get('gene', None) else ''
                product = feat.get('product', '')
                if(bc.PSEUDOGENE in feat):
                    product = f"(pseudo) {product}"
                elif(feat.get('truncated', '') == bc.FEATURE_END_5_PRIME):
                    product = f"(5' truncated) {product}"
                elif(feat.get('truncated', '') == bc.FEATURE_END_3_PRIME):
                    product = f"(3' truncated) {product}"
                elif(feat.get('truncated', '') == bc.FEATURE_END_BOTH):
                    product = f"(partial) {product}"

                def s(x):
                    return '' if x is None else str(x)

                fh.write('\t'.join(
                    [
                        seq_id,
                        feat_type,
                        str(feat['start']),
                        str(feat['stop']),
                        str(feat['strand']),
                        s(feat.get('locus')), # handles None → ''
                        s(gene),        # handles None → ''
                        s(product),     # handles None → ''
                        ', '.join(sorted(feat.get('db_xrefs', [])))
                    ])
                )
                fh.write('\n')
                if(feat_type == bc.FEATURE_CRISPR):
                    i = 0
                    # spacers and repeats wont exist if Prokka input
                    spacers = feat.get('spacers', [])
                    repeat = feat.get('repeat', [])

                    if len(spacers) > 0 and len(repeat) > 0: 
                    # if not - will just skip
                        while i < len(feat['spacers']):
                            repeat = feat['repeats'][i]
                            fh.write('\t'.join([seq_id, bc.FEATURE_CRISPR_REPEAT, str(repeat['start']), str(repeat['stop']), repeat['strand'], '', '', f"CRISPR repeat", '']))
                            fh.write('\n')
                            spacer = feat['spacers'][i]
                            fh.write('\t'.join([seq_id, bc.FEATURE_CRISPR_SPACER, str(spacer['start']), str(spacer['stop']), spacer['strand'], '', '', f"CRISPR spacer, sequence {spacer['sequence']}", '']))
                            fh.write('\n')
                            i += 1
                        if(len(feat['repeats']) - 1 == i):
                            repeat = feat['repeats'][i]
                            fh.write('\t'.join([seq_id, bc.FEATURE_CRISPR_REPEAT, str(repeat['start']), str(repeat['stop']), repeat['strand'], '', '', f"CRISPR repeat", '']))
                            fh.write('\n')
    return

`write_hypotheticals(hypotheticals, tsv_path)`

Export hypothetical information in TSV format.

Source code in src/baktfold/io/tsv.py

def write_hypotheticals(hypotheticals: Sequence[dict], tsv_path: Path):
    """Export hypothetical information in TSV format."""
    logger.info('write hypothetical tsv: path=%s', tsv_path)

    with tsv_path.open('wt') as fh:
        fh.write(f'#Annotated with Baktfold v{cfg.version}, https://github.com/oschwengers/bakta\n')
        #fh.write(f"#Database v{cfg.db_info['major']}.{cfg.db_info['minor']}, https://doi.org/10.5281/zenodo.4247252\n")
        fh.write('#Sequence Id\tStart\tStop\tStrand\tLocus Tag\tMol Weight [kDa]\tIso El. Point\tPfam hits\tDbxrefs\n')
        for hypo in hypotheticals:
            pfams = [f"{pfam['id']}|{pfam['name']}" for pfam in hypo.get('pfams', [])]
            seq_stats = hypo['seq_stats']
            mol_weight = f"{(seq_stats['molecular_weight']/1000):.1f}" if seq_stats['molecular_weight'] else 'NA'
            iso_point = f"{seq_stats['isoelectric_point']:.1f}" if seq_stats['isoelectric_point'] else 'NA'
            seq_id = hypo['sequence'] if 'sequence' in hypo else hypo['contig']  # <1.10.0 compatibility
            fh.write(f"{seq_id}\t{hypo['start']}\t{hypo['stop']}\t{hypo['strand']}\t{hypo.get('locus', '')}\t{mol_weight}\t{iso_point}\t{', '.join(sorted(pfams))}\t{', '.join(sorted(hypo.get('db_xrefs', [])))}\n")
    return

`write_protein_features(features, header_columns, tsv_path, custom_db, has_duplicate_locus, fast, structures=False)`

Export protein features in TSV format.

Source code in src/baktfold/io/tsv.py

def write_protein_features(features: Sequence[dict], header_columns: Sequence[str], tsv_path: Path, custom_db: bool, has_duplicate_locus: bool, fast: bool, structures: bool = False):
    """Export protein features in TSV format."""
    logger.info(f'write protein feature tsv: path={tsv_path}')

    with tsv_path.open('wt') as fh:
        fh.write(f'#Annotated with Baktfold (v{cfg.version}): https://github.com/gbouras13/baktfold\n')
        #fh.write(f"#Database (v{cfg.db_info['major']}.{cfg.db_info['minor']}): https://doi.org/10.5281/zenodo.4247252\n")
        fh.write('\t'.join(header_columns))
        fh.write('\n')
        for feat in features:
            columns = map_aa_columns(feat, custom_db, has_duplicate_locus, fast, structures)
            fh.write('\t'.join(columns))
            fh.write('\n')
    return

`parse_protein_input(input_path, faa_path)`

handles regular FASTA and gzipped returns cds_dict

Source code in src/baktfold/io/fasta_in.py

def parse_protein_input(input_path, faa_path):
    """
    handles regular FASTA and gzipped 
    returns cds_dict
    """

    # handles regular FASTA and gzipped
    fasta_flag = False
    try:
        if input_path == '':
            raise ValueError('File path argument must be non-empty')
        input_path = Path(input_path).resolve()
        fasta_flag = is_fasta(input_path)
        if fasta_flag:
            logger.info('FASTA input format detected.')
        else:
            logger.info('Bakta JSON input format detected. Only hypothetical proteins from the Bakta JSON input file will be annotated.')
    except Exception as e:
        logger.error(f'ERROR: annotation file {input_path} not valid! {e}')
        sys.exit(1)

    aas = []
    hypotheticals = []
    bakta_version = {}
    try:
        if fasta_flag:
            logger.info('Attempting to parse input protein sequences as .faa format ...')
            aas = fasta.import_sequences(input_path, False, False)
        else:
            logger.info('Attempting to parse input protein sequences as Bakta JSON format ...')
            aas, hypotheticals, bakta_version = parse_json_input(input_path, False, False, protein_json_flag=True)
        logger.info(f'Imported sequences={len(aas)}')
    except Exception as e:
        logger.error(f'ERROR: wrong file format or unallowed characters in amino acid sequences! {e}')
        sys.exit(1)

    mock_start = 1
    for aa in aas:  # rename and mock feature attributes to reuse existing functions
        aa['type'] = bc.FEATURE_CDS
        aa['locus'] = aa['id']
        aa['sequence'] = '-'
        aa['start'] = mock_start
        aa['stop'] = mock_start + aa['length'] - 1
        aa['strand'] = bc.STRAND_UNKNOWN
        aa['frame'] = 1
        mock_start += 100


    if fasta_flag:
        with faa_path.open('wt') as fh:
            for aa in aas:
                fh.write(f">{aa['locus']}\n{aa['aa']}\n")
    else: # write hypothetical proteins to file if JSON input
        with faa_path.open('wt') as fh:
            for aa in hypotheticals:
                fh.write(f">{aa['locus']}\n{aa['aa']}\n")

    logger.info('Parsing complete')

    return aas, bakta_version

`encode_annotations(annotations)`

Encodes annotations into a string.

Parameters:

Name	Type	Description	Default
`annotations`	`dict`	A dictionary containing the annotations.	required

Returns:

Name	Type	Description
`str`	`str`	The encoded annotations.

Examples:

>>> encode_annotations({
    'ID': 'EHICP_3230_sigpep',
    'Name': 'signal peptide',
    'product': 'signal peptide',
    'score': 0.5,
    'Parent': 'EHICP_3230'
})
'ID=EHICP_3230_sigpep;Name=signal peptide;product=signal peptide;score=0.5;Parent=EHICP_3230'

Source code in src/baktfold/io/gff.py

def encode_annotations(annotations: Dict[str, Union[str, Sequence[str]]]) -> str:
    """
    Encodes annotations into a string.

    Args:
      annotations (dict): A dictionary containing the annotations.

    Returns:
      str: The encoded annotations.

    Examples:
      >>> encode_annotations({
          'ID': 'EHICP_3230_sigpep',
          'Name': 'signal peptide',
          'product': 'signal peptide',
          'score': 0.5,
          'Parent': 'EHICP_3230'
      })
      'ID=EHICP_3230_sigpep;Name=signal peptide;product=signal peptide;score=0.5;Parent=EHICP_3230'
    """
    annotation_strings = []
    for key, val in annotations.items():
        if(type(val) is list):
            if(len(val) >= 1):
                val = [encode_attribute(k) for k in val]
                annotation = f"{key}={','.join(val)}"
                annotation_strings.append(annotation)
        else:
            annotation_strings.append(f'{key}={encode_attribute(val)}')
    return ';'.join(annotation_strings)

`encode_attribute(product)`

Replace special characters forbidden in column 9 of the GFF3 format: https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md

Source code in src/baktfold/io/gff.py

def encode_attribute(product: str) -> str:
    """Replace special characters forbidden in column 9 of the GFF3 format: https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md"""
    product = str(product)
    product = product.replace('%', '%25')
    product = product.replace(';', '%3B')
    product = product.replace('=', '%3D')
    product = product.replace('&', '%26')
    product = product.replace(',', '%2C')
    return product

`write_euk_cds_feature(fh, seq_id, feat)`

Write a eukaryotic CDS feature to GFF3 with multiple CDS parts.

Parameters

fh : file-handle seq_id : str

dict-like feature with keys:

"start", "stop", "strand", "locus", "starts", "stops"

Source code in src/baktfold/io/gff.py

def write_euk_cds_feature(fh, seq_id, feat):
    """
    Write a eukaryotic CDS feature to GFF3 with multiple CDS parts.

    Parameters
    ----------
    fh : file-handle
    seq_id : str
    feat : dict-like feature with keys:
            "start", "stop", "strand", "locus", "starts", "stops"
    """

    strand = feat.get("strand", "+")
    locus = feat.get("locus", "unknown")

    transcript_id = f"{locus}-T1"
    cds_id = f"{transcript_id}.cds"

    starts = feat.get("starts")
    stops = feat.get("stops")

    # -------------------------------
    # 1. Determine CDS sub-coordinates
    # -------------------------------
    if (
        isinstance(starts, list)
        and isinstance(stops, list)
        and len(starts) == len(stops)
        and len(starts) > 0
    ):
        cds_coords = list(zip(starts, stops))
    else:
        cds_coords = [(feat["start"], feat["stop"])]

    # -------------------------------
    # 2. Reverse order for negative strand
    # -------------------------------
    if strand == "-":
        cds_coords.reverse()

    # -------------------------------
    # 3. Emit CDS lines with correct phase
    # -------------------------------
    offset = 0

    for i, (cds_start, cds_stop) in enumerate(cds_coords, start=1):

        length = cds_stop - cds_start + 1
        phase = offset % 3
        offset += length

        attr = f"ID={cds_id}-{i};Parent={transcript_id}"

        fh.write(
            f"{seq_id}\tbaktfold\tCDS\t{cds_start}\t{cds_stop}"
            f"\t.\t{strand}\t{phase}\t{attr}\n"
        )

`write_euk_repeat_region_feature(fh, seq_id, feat)`

Writes a repeat region feature to a file.

Parameters:

Name	Type	Description	Default
`fh`	`file`	The file handle to write to.	required
`seq_id`	`str`	The sequence ID.	required
`feat`	`dict`	A dictionary containing the feature information.	required

Returns:

Type	Description
	None

Examples:

>>> write_euk_repeat_region_feature(fh, 'DS572673.1', {
    "type": "repeat_region",
    "sequence": "DS571531.1",
    "start": 1470,
    "stop": 1716,
    "strand": "?",
    "family": "LINE2",
    "rpt_type": null,
    "repeat_unit": null,
    "product": null,
    "nt": "AATAAAATCATATCAGAAATAAAAAGAATGAAAATAAACAAATTAAAGAAAATAATTATAAAATTAATAAACGATATTTAAATGAAAGAAAATAGAGAATATGTAATAAGTACAAATGGTTCATTCATTAATAAGAAATTAACAATAATAAAATAGAGAATATTGATTATAAAAAGAAATATATTTCTCAAAACAGTAGAGATACAAAAAGAATAGATATGAAATAAATATTAATTCTAAAATACTC",
    "id": "EHICP_3230",
    "db_xrefs": [
        "SO:0000657"
    ]
})

Source code in src/baktfold/io/gff.py

def write_euk_repeat_region_feature(fh, seq_id, feat):
    """
    Writes a repeat region feature to a file.

    Args:
      fh (file): The file handle to write to.
      seq_id (str): The sequence ID.
      feat (dict): A dictionary containing the feature information.

    Returns:
      None

    Examples:
      >>> write_euk_repeat_region_feature(fh, 'DS572673.1', {
          "type": "repeat_region",
          "sequence": "DS571531.1",
          "start": 1470,
          "stop": 1716,
          "strand": "?",
          "family": "LINE2",
          "rpt_type": null,
          "repeat_unit": null,
          "product": null,
          "nt": "AATAAAATCATATCAGAAATAAAAAGAATGAAAATAAACAAATTAAAGAAAATAATTATAAAATTAATAAACGATATTTAAATGAAAGAAAATAGAGAATATGTAATAAGTACAAATGGTTCATTCATTAATAAGAAATTAACAATAATAAAATAGAGAATATTGATTATAAAAAGAAATATATTTCTCAAAACAGTAGAGATACAAAAAGAATAGATATGAAATAAATATTAATTCTAAAATACTC",
          "id": "EHICP_3230",
          "db_xrefs": [
              "SO:0000657"
          ]
      })
    """

    start = int(feat['start'])
    stop  = int(feat['stop'])
    strand = feat['strand']

    id = feat['sequence']

    attrs = {
        "ID": f"{id}:{start}..{stop}",
        "gbkey": "repeat_region"
    }

    if feat.get('family') is not None:
        attrs["rpt_family"] = feat.get('family')

    attr_str = ";".join(f"{k}={v}" for k, v in attrs.items())

    fh.write(f"{seq_id}\tbaktfold\trepeat_region\t{start}\t{stop}\t.\t{strand}\t.\t{attr_str}\n")

`write_euk_trna_feature(fh, seq_id, feat)`

Write a tRNA feature to GFF3 with a top-level line and single exon.

Parameters

file-like

Open file handle to write GFF lines.

str

Sequence/contig ID.

SeqFeature

Biopython SeqFeature object of type 'tRNA'.

Notes

Generates one tRNA line and one exon line.
Includes optional 'product' qualifier.

Source code in src/baktfold/io/gff.py

def write_euk_trna_feature(fh, seq_id, feat):
    """
    Write a tRNA feature to GFF3 with a top-level line and single exon.

    Parameters
    ----------
    fh : file-like
        Open file handle to write GFF lines.
    seq_id : str
        Sequence/contig ID.
    feat : SeqFeature
        Biopython SeqFeature object of type 'tRNA'.

    Notes
    -----
    - Generates one tRNA line and one exon line.
    - Includes optional 'product' qualifier.
    """
    start = int(feat['start'])
    stop  = int(feat['stop'])

    strand = feat['strand']

    locus = feat['locus']

    trna_id = f"{locus}-T1"

    # Top-level tRNA attributes
    attrs = {
        "ID": trna_id,
        "Parent": locus
    }

    attrs = {}

    product = feat.get("product", [])

    if product:

        key = "product"         
        if isinstance(product, list):
            if len(product) == 1:
                attrs[key] = str(product[0])
            else:
                attrs[key] = ",".join(str(v) for v in product)
        else:
            attrs[key] = str(product)


    attr_str = ";".join(f"{k}={v}" for k, v in attrs.items())

    # Write top-level tRNA line
    fh.write(f"{seq_id}\tbaktfold\ttRNA\t{start}\t{stop}\t.\t{strand}\t.\t{attr_str}\n")

    # Write exon line (tRNA single-exon)
    exon_id = f"{trna_id}.exßon1"
    exon_attrs = f"ID={exon_id};Parent={trna_id}"
    fh.write(f"{seq_id}\tbaktfold\texon\t{start}\t{stop}\t.\t{strand}\t.\t{exon_attrs}\n")

`write_euk_utr_feature(fh, seq_id, feat, locus_counter, three=False)`

Write a 'utr' feature.

Source code in src/baktfold/io/gff.py

def write_euk_utr_feature(fh, seq_id, feat, locus_counter, three=False):
    """Write a 'utr' feature."""
    start = int(feat['start'])
    stop  = int(feat['stop'])
    strand = feat['strand']

    locus = feat['locus']

    # Count occurrences for this locus
    count = locus_counter.get(locus, 0) + 1
    locus_counter[locus] = count

    # Construct ID with suffix -2, -3, etc.
    # For first entry we keep ID=locus (no -1)
    if count == 1:
        utr_id = locus
    else:
        utr_id = f"{locus}-{count}"

    # Top-level mRNA line
    attrs = {
        "ID": f"{utr_id}",
        "Parent": f"{locus}",
    }

# CAMXCT020000566.1	EMBL	three_prime_UTR	84568	84617	.	-	.	ID=id-C1SCF055_LOCUS8420;Parent=gene-C1SCF055_LOCUS8420;Note=ID:SCF055_s1507_g28601.utr3p1%3B~source:feature;gbkey=3'UTR;locus_tag=C1SCF055_LOCUS8420
# CAMXCT020000566.1	EMBL	five_prime_UTR	136251	136259	.	-	.	ID=id-C1SCF055_LOCUS8420-2;Parent=gene-C1SCF055_LOCUS8420;Note=ID:SCF055_s1507_g28601.utr5p1%3B~source:feature;gbkey=5'UTR;locus_tag=C1SCF055_LOCUS8420

    if feat.get('Note') is not None:
        attrs["Note"] = feat.get('note')


    attrs["gbkey"] = "3'UTR" if three else "5'UTR"

    if feat.get('Note') is not None:
        attrs["locus_tag"] = feat.get('locus')

    attr_str = ";".join(f"{k}={v}" for k, v in attrs.items())

    if three:
        gene_tag = 'three_prime_UTR'
    else:
        gene_tag = 'five_prime_UTR'

    fh.write(f"{seq_id}\tbaktfold\t{gene_tag}\t{start}\t{stop}\t.\t{strand}\t.\t{attr_str}\n")

`write_features(data, features_by_sequence, gff3_path, prokka=False, euk=False, other_genbank=False, cds_tool='Prodigal:2.6', trna_program='tRNAscan-SE:2.0.12', tmrna_program='Aragorn', rrna_program='Infernal', ncrna_program='Infernal')`

Export features in GFF3 format.

Source code in src/baktfold/io/gff.py

def write_features(data: dict, features_by_sequence: Dict[str, dict], gff3_path: Path, prokka: bool = False, euk: bool = False, other_genbank: bool = False, cds_tool: str = "Prodigal:2.6", trna_program: str = "tRNAscan-SE:2.0.12", tmrna_program: str = "Aragorn", rrna_program: str = "Infernal", ncrna_program: str = "Infernal"):
    """Export features in GFF3 format."""
    logger.info(f'write features: path={gff3_path}')

    with gff3_path.open('wt') as fh:
        fh.write('##gff-version 3\n')  # GFF version
        fh.write('##feature-ontology https://github.com/The-Sequence-Ontology/SO-Ontologies/blob/v3.1/so.obo\n')  # SO feature version

        if(data['genome'].get('taxon', None)):  # write organism info
            fh.write(f"# organism {data['genome']['taxon']}\n")

        fh.write('# Annotated with Baktfold\n')
        fh.write(f'# Software: v{cfg.version}\n')
        fh.write(f"# Database: v{cfg.version}\n") # fix later
        #fh.write(f"# Database: v{cfg.db_info['major']}.{cfg.db_info['minor']}, {cfg.db_info['type']}\n")
        fh.write(f'# DOI: {bc.BAKTFOLD_DOI}\n')
        fh.write(f'# URL: {bc.BAKTFOLD_URL}\n')

        for seq in data['sequences']:  # write features
            if euk:
                locus_counter = {} # for UTRs

            fh.write(f"##sequence-region {seq['id']} 1 {seq['length']}\n")  # sequence region

            # write landmark region
            annotations = {
                'ID': seq['id'],
                'Name': seq['id']
            }
            if(seq['topology'] == bc.TOPOLOGY_CIRCULAR):
                annotations['Is_circular'] = 'true'
            annotations = encode_annotations(annotations)
            fh.write(f"{seq['id']}\tBaktfold\tregion\t1\t{str(seq['length'])}\t.\t+\t.\t{annotations}\n")

            for feat in features_by_sequence[seq['id']]:
                seq_id = feat['sequence'] if 'sequence' in feat else feat['contig']  # <1.10.0 compatibility
                start = feat['start']
                stop = feat['stop']
                if('edge' in feat):
                    stop += seq['length']

                # euks
                if euk:
                    if(feat['type'] == bc.FEATURE_REPEAT):
                        write_euk_repeat_region_feature(fh, seq_id, feat)

                    if(feat['type'] == bc.FEATURE_5UTR or feat['type'] == bc.FEATURE_3UTR):
                        if feat['type'] == bc.FEATURE_3UTR:
                            write_euk_utr_feature(fh, seq_id, feat, locus_counter, three=True)
                        elif feat['type'] == bc.FEATURE_5UTR:
                            write_euk_utr_feature(fh, seq_id, feat, locus_counter, three=False)

                if(feat['type'] == bc.FEATURE_T_RNA):

                    if euk:
                        write_euk_trna_feature(fh, seq_id, feat)
                    else:
                        trna_tool = "tRNAscan-SE"
                        if prokka:
                            trna_tool = "Aragorn"
                        if other_genbank:
                            trna_tool = trna_program

                        annotations = {
                            'ID': feat['locus'],
                            'Name': feat['product'],
                            'locus_tag': feat['locus'],
                            'product': feat['product'],
                            'Dbxref': feat.get('db_xrefs', [])
                        }
                        if(feat.get('gene', None)):  # add gene annotation if available
                            annotations['gene'] = feat['gene']
                        if(bc.PSEUDOGENE in feat):
                            annotations[bc.INSDC_FEATURE_PSEUDOGENE] = bc.INSDC_FEATURE_PSEUDOGENE_TYPE_UNKNOWN
                        elif('truncated' in feat):
                            annotations[bc.INSDC_FEATURE_PSEUDO] = True
                        if(feat.get('anti_codon', False)):
                            annotations['anti_codon'] = feat['anti_codon']
                        if(feat.get('amino_acid', False)):
                            annotations['amino_acid'] = feat['amino_acid']
                        if(cfg.compliant):
                            gene_id = f"{feat['locus']}_gene"
                            annotations['Parent'] = gene_id
                            annotations['inference'] = 'profile:tRNAscan:2.0'
                            annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs'])  # remove INSDC invalid DbXrefs
                            gene_annotations = {
                                'ID': gene_id,
                                'locus_tag': feat['locus']
                            }
                            if(feat.get('gene', None)):
                                gene_annotations['gene'] = feat['gene']
                            if(bc.PSEUDOGENE in feat):
                                gene_annotations[bc.INSDC_FEATURE_PSEUDOGENE] = bc.INSDC_FEATURE_PSEUDOGENE_TYPE_UNKNOWN
                            gene_annotations = encode_annotations(gene_annotations)
                            fh.write(f"{seq_id}\t{trna_tool}\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                        annotations = encode_annotations(annotations)
                        fh.write(f"{seq_id}\t{trna_tool}\t{so.SO_TRNA.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                elif(feat['type'] == bc.FEATURE_TM_RNA):
                    # both prokka and bakta use Aragorn

                    tmrna_tool = "Aragorn"
                    if other_genbank:
                        tmrna_tool = tmrna_program

                    annotations = {
                        'ID': feat['locus'],
                        'Name': feat['product'],
                        'locus_tag': feat['locus'],
                        'gene': feat.get('gene', []),
                        'product': feat['product'],
                        'Dbxref': feat.get('db_xrefs', [])
                    }
                    if('tag' in feat):
                        annotations['tag_peptide'] = feat['tag']['aa']
                    if('truncated' in feat):
                        annotations[bc.INSDC_FEATURE_PSEUDO] = True
                    if(cfg.compliant):
                        gene_id = f"{feat['locus']}_gene"
                        annotations['Parent'] = gene_id
                        annotations['inference'] = 'profile:aragorn:1.2'
                        annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs'])  # remove INSDC invalid DbXrefs
                        if('tag' in feat):
                            annotations['tag_peptide'] = f"{feat['tag']['start']}..{feat['tag']['stop']}" if feat['strand'] == bc.STRAND_FORWARD else f"complement({feat['tag']['start']}..{feat['tag']['stop']})"
                        gene_annotations = {
                            'ID': gene_id,
                            'locus_tag': feat['locus'],
                            'gene': feat['gene']
                        }
                        if('truncated' in feat):
                            gene_annotations[bc.INSDC_FEATURE_PSEUDO] = True
                        gene_annotations = encode_annotations(gene_annotations)
                        fh.write(f"{seq_id}\tAragorn\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                    annotations = encode_annotations(annotations)
                    fh.write(f"{seq_id}\tAragorn\t{so.SO_TMRNA.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                elif(feat['type'] == bc.FEATURE_R_RNA):
                    rrna_tool = "Infernal"
                    if prokka:
                        rrna_tool = "barrnap"
                    if other_genbank:
                        rrna_tool = rrna_program

                    annotations = {
                        'ID': feat['locus'],
                        'Name': feat['product'],
                        'locus_tag': feat['locus'],
                        'gene': feat.get('gene', []),
                        'product': feat['product'],
                        'Dbxref': feat.get('db_xrefs', [])
                    }
                    if('truncated' in feat):
                        annotations[bc.INSDC_FEATURE_PSEUDO] = True
                    if(cfg.compliant):
                        gene_id = f"{feat['locus']}_gene"
                        annotations['Parent'] = gene_id
                        annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs'])  # remove INSDC invalid DbXrefs
                        for rfam_id in [dbxref.split(':')[1] for dbxref in feat['db_xrefs'] if dbxref.split(':')[0] == bc.DB_XREF_RFAM]:
                            annotations['inference'] = f'profile:Rfam:{rfam_id}'
                        gene_annotations = {
                            'ID': gene_id,
                            'locus_tag': feat['locus'],
                            'gene': feat['gene']
                        }
                        if('truncated' in feat):
                            gene_annotations[bc.INSDC_FEATURE_PSEUDO] = True
                        gene_annotations = encode_annotations(gene_annotations)
                        fh.write(f"{seq_id}\t{rrna_tool}\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                    annotations = encode_annotations(annotations)
                    if other_genbank or prokka:
                        fh.write(f"{seq_id}\t{rrna_tool}\t{so.SO_RRNA.name}\t{start}\t{stop}\t{feat['strand']}\t.\t{annotations}\n")
                    else:
                        fh.write(f"{seq_id}\t{rrna_tool}\t{so.SO_RRNA.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n")
                elif(feat['type'] == bc.FEATURE_NC_RNA):
                    # both prokka and bakta use infernal for ncrna
                    ncrna_tool = "Infernal"
                    if other_genbank:
                        ncrna_tool = ncrna_program

                    annotations = {
                        'ID': feat['locus'],
                        'Name': feat['product'],
                        'locus_tag': feat['locus'],
                        'gene': feat.get('gene', []),
                        'product': feat['product'],
                        'Dbxref': feat.get('db_xrefs', [])
                    }
                    if('truncated' in feat):
                        annotations[bc.INSDC_FEATURE_PSEUDO] = True
                    if(cfg.compliant):
                        gene_id = f"{feat['locus']}_gene"
                        annotations['Parent'] = gene_id
                        annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs'])  # remove INSDC invalid DbXrefs
                        annotations[bc.INSDC_FEATURE_NC_RNA_CLASS] = insdc.select_ncrna_class(feat)
                        for rfam_id in [dbxref.split(':')[1] for dbxref in feat['db_xrefs'] if dbxref.split(':')[0] == bc.DB_XREF_RFAM]:
                            annotations['inference'] = f'profile:Rfam:{rfam_id}'
                        gene_annotations = {
                            'ID': gene_id,
                            'locus_tag': feat['locus'],
                            'gene': feat['gene']
                        }
                        if(ba.RE_GENE_SYMBOL.fullmatch(feat['gene'])):  # discard non-standard ncRNA gene symbols
                            gene_annotations['gene'] = feat['gene']
                        else:
                            annotations.pop('gene', None)
                        if('truncated' in feat):
                            gene_annotations[bc.INSDC_FEATURE_PSEUDO] = True
                        gene_annotations = encode_annotations(gene_annotations)
                        fh.write(f"{seq_id}\t{ncrna_tool}\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                    annotations = encode_annotations(annotations)
                    if other_genbank or prokka:
                        fh.write(f"{seq_id}\t{ncrna_tool}\t{so.SO_NCRNA_GENE.name}\t{start}\t{stop}\t{feat['strand']}\t.\t{annotations}\n")
                    else:
                        fh.write(f"{seq_id}\t{ncrna_tool}\t{so.SO_NCRNA_GENE.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n")
                elif(feat['type'] == bc.FEATURE_NC_RNA_REGION):
                    annotations = {
                        'ID': feat['id'],
                        'Name': feat['product'],
                        'product': feat['product'],
                        'Dbxref': feat.get('db_xrefs', [])
                    }
                    if('truncated' in feat):
                        annotations[bc.INSDC_FEATURE_PSEUDO] = True
                    if(cfg.compliant):
                        for rfam_id in [dbxref.split(':')[1] for dbxref in feat['db_xrefs'] if dbxref.split(':')[0] == bc.DB_XREF_RFAM]:
                            annotations['inference'] = f'profile:Rfam:{rfam_id}'
                        annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs'])  # remove INSDC invalid DbXrefs
                        annotations[bc.INSDC_FEATURE_REGULATORY_CLASS] = insdc.select_regulatory_class(feat)
                    annotations = encode_annotations(annotations)
                    fh.write(f"{seq_id}\tInfernal\t{so.SO_REGULATORY_REGION.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n")
                elif(feat['type'] == bc.FEATURE_CRISPR):
                    crispr_tool = "PILER-CR"
                    if prokka:
                        crispr_tool = "MinCED"
                    annotations = {
                        'ID': feat['id'],
                        'Name': feat['product'],
                        'product': feat['product']
                    }
                    feat_type = so.SO_CRISPR.name
                    if(cfg.compliant):
                        feat_type = bc.INSDC_FEATURE_REPEAT_REGION
                        annotations['inference'] = 'COORDINATES:alignment:pilercr:1.02'
                        annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs'])  # remove INSDC invalid DbXrefs
                        annotations[bc.INSDC_FEATURE_REPEAT_FAMILY] = 'CRISPR'
                        annotations[bc.INSDC_FEATURE_REPEAT_TYPE] = 'direct'
                        annotations[bc.INSDC_FEATURE_REPEAT_UNIT_SEQ] = feat['repeat_consensus']
                    annotations = encode_annotations(annotations)
                    fh.write(f"{seq_id}\t{crispr_tool}\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                    if(not cfg.compliant):
                        i = 0
                        # spacers and repeats wont exist if Prokka input
                        spacers = feat.get('spacers', [])
                        repeat = feat.get('repeat', [])
                        if len(spacers) > 0 and len(repeat) > 0: 
                            while i < len(feat['spacers']):
                                repeat = feat['repeats'][i]
                                annotations = {
                                    'ID': f"{feat['id']}_repeat_{i+1}",
                                    'Parent': feat['id']
                                }
                                annotations = encode_annotations(annotations)
                                # will always be PILER here as prokka won't have any
                                fh.write(f"{seq_id}\tPILER-CR\t{bc.FEATURE_CRISPR_REPEAT}\t{repeat['start']}\t{repeat['stop']}\t.\t{repeat['strand']}\t.\t{annotations}\n")
                                spacer = feat['spacers'][i]
                                annotations = {
                                    'ID': f"{feat['id']}_spacer_{i+1}",
                                    'Parent': feat['id'],
                                    'sequence': spacer['sequence']
                                }
                                annotations = encode_annotations(annotations)
                                fh.write(f"{seq_id}\tPILER-CR\t{bc.FEATURE_CRISPR_SPACER}\t{spacer['start']}\t{spacer['stop']}\t.\t{spacer['strand']}\t.\t{annotations}\n")
                                i += 1
                            if(len(feat['repeats']) - 1 == i):
                                repeat = feat['repeats'][i]
                                annotations = { 'ID': f"{feat['id']}_repeat_{i+1}" }
                                annotations = encode_annotations(annotations)
                                fh.write(f"{seq_id}\tPILER-CR\t{bc.FEATURE_CRISPR_REPEAT}\t{repeat['start']}\t{repeat['stop']}\t.\t{repeat['strand']}\t.\t{annotations}\n")
                elif feat['type'] == bc.FEATURE_CDS:
                    if euk:
                        write_euk_cds_feature(fh, seq_id, feat)
                    else:
                        annotations = {
                            'ID': feat.get('locus'),
                            'Name': feat.get('product'),
                            'locus_tag': feat.get('locus'),
                            'product': feat.get('product'),
                            'Dbxref': feat.get('db_xrefs', [])  # default to empty list if db_xrefs doesn't exist
                        }
                        if(bc.PSEUDOGENE in feat):
                            annotations[bc.INSDC_FEATURE_PSEUDOGENE] = bc.INSDC_FEATURE_PSEUDOGENE_TYPE_UNPROCESSED if feat[bc.PSEUDOGENE]['paralog'] else bc.INSDC_FEATURE_PSEUDOGENE_TYPE_UNITARY
                        elif('truncated' in feat):
                            annotations[bc.INSDC_FEATURE_PSEUDO] = True
                        if(feat.get('gene', None)):  # add gene annotation if available
                            annotations['gene'] = feat['gene']
                        source = '?' if feat.get('source', None) == bc.CDS_SOURCE_USER else 'Pyrodigal'
                        if prokka: 
                            source = 'Prodigal'

                        if other_genbank:
                            source = cds_tool

                        if(cfg.compliant):
                            gene_id = f"{feat['locus']}_gene"
                            annotations['Parent'] = gene_id
                            annotations['inference'] = 'EXISTENCE:non-experimental evidence, no additional details recorded' if feat.get('source', None) == bc.CDS_SOURCE_USER else 'ab initio prediction:Pyrodigal:3.5'
                            annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs'])  # remove INSDC invalid DbXrefs
                            annotations['Note'], ec_number = insdc.extract_ec_from_notes_insdc(annotations, 'Note')
                            if(ec_number is not None):
                                annotations['ec_number'] = ec_number
                            gene_annotations = {
                                'ID': gene_id,
                                'locus_tag': feat['locus']
                            }
                            if(feat.get('gene', None)):
                                gene_annotations['gene'] = feat['gene']
                            if(bc.PSEUDOGENE in feat):
                                gene_annotations[bc.INSDC_FEATURE_PSEUDOGENE] = bc.INSDC_FEATURE_PSEUDOGENE_TYPE_UNPROCESSED if feat[bc.PSEUDOGENE]['paralog'] else bc.INSDC_FEATURE_PSEUDOGENE_TYPE_UNITARY
                            gene_annotations = encode_annotations(gene_annotations)
                            fh.write(f"{seq_id}\t{source}\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                        if('exception' in feat):
                            ex = feat['exception']
                            pos = f"{ex['start']}..{ex['stop']}"
                            if(feat['strand'] == bc.STRAND_REVERSE):
                                pos = f"complement({pos})"
                            annotations['transl_except']=f"(pos:{pos},aa:{ex['aa']})"
                            notes = annotations.get('Note', [])
                            notes.append(f"codon on position {ex['codon_position']} is a {ex['type']} codon")
                            if('Notes' not in annotations):
                                annotations['Note'] = notes
                        annotations = encode_annotations(annotations)
                        fh.write(f"{seq_id}\t{source}\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n")
                        if(bc.FEATURE_SIGNAL_PEPTIDE in feat):
                            write_signal_peptide(fh, feat)
                elif(feat['type'] == bc.FEATURE_SORF):
                    annotations = {
                        'ID': feat['locus'],
                        'Name': feat['product'],
                        'locus_tag': feat['locus'],
                        'product': feat['product'],
                        'Dbxref': feat.get('db_xrefs', [])
                    }
                    if(feat.get('gene', None)):  # add gene annotation if available
                        annotations['gene'] = feat['gene']
                    if(cfg.compliant):
                        gene_id = f"{feat['locus']}_gene"
                        annotations['Parent'] = gene_id
                        annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs'])  # remove INSDC invalid DbXrefs
                        annotations['Note'], ec_number = insdc.extract_ec_from_notes_insdc(annotations, 'Note')
                        if(ec_number is not None):
                            annotations['ec_number'] = ec_number
                        gene_annotations = {
                            'ID': gene_id,
                            'locus_tag': feat['locus'],
                            'inference': 'ab initio prediction:Bakta'
                        }
                        if(feat.get('gene', None)):
                            gene_annotations['gene'] = feat['gene']
                        gene_annotations = encode_annotations(gene_annotations)
                        fh.write(f"{seq_id}\tBakta\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                    annotations = encode_annotations(annotations)
                    fh.write(f"{seq_id}\tBakta\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n")
                    if(bc.FEATURE_SIGNAL_PEPTIDE in feat):
                        write_signal_peptide(fh, feat)
                elif(feat['type'] == bc.FEATURE_GAP):
                    gap_tool="Bakta"
                    if prokka:
                        gap_tool="Prokka"
                    annotations = {
                        'ID': feat['id'],
                        'Name': f"gap ({feat['length']} bp)",
                        'product': f"gap ({feat['length']} bp)"
                    }
                    annotations = encode_annotations(annotations)
                    fh.write(f"{seq_id}\t{gap_tool}\t{so.SO_GAP.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                elif(feat['type'] == bc.FEATURE_ORIC):
                    annotations = {
                        'ID': feat['id'],
                        'Name': feat['product']
                    }
                    if(cfg.compliant):
                        annotations['Note'] = feat['product']
                    else:
                        annotations['product'] = feat['product']
                        annotations['inference'] = 'similar to DNA sequence'
                    annotations = encode_annotations(annotations)
                    feat_type = bc.INSDC_FEATURE_ORIGIN_REPLICATION if cfg.compliant else so.SO_ORIC.name
                    fh.write(f"{seq_id}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                elif(feat['type'] == bc.FEATURE_ORIV):
                    annotations = {
                        'ID': feat['id'],
                        'Name': feat['product']
                    }
                    if(cfg.compliant):
                        annotations['Note'] = feat['product']
                    else:
                        annotations['product'] = feat['product']
                        annotations['inference'] = 'similar to DNA sequence'
                    annotations = encode_annotations(annotations)
                    feat_type = bc.INSDC_FEATURE_ORIGIN_REPLICATION if cfg.compliant else so.SO_ORIC.name
                    fh.write(f"{seq_id}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                elif(feat['type'] == bc.FEATURE_ORIT):
                    annotations = {
                        'ID': feat['id'],
                        'Name': feat['product']
                    }
                    if(cfg.compliant):
                        annotations['Note'] = feat['product']
                    else:
                        annotations['product'] = feat['product']
                        annotations['inference'] = 'similar to DNA sequence'
                    annotations = encode_annotations(annotations)
                    feat_type = bc.INSDC_FEATURE_ORIGIN_TRANSFER if cfg.compliant else so.SO_ORIT.name
                    fh.write(f"{seq_id}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                elif(feat['type'] == bc.FEATURE_GENE):
                    write_gene_feature(fh, seq_id, feat)
                elif(feat['type'] == bc.FEATURE_MRNA):
                    write_mrna_feature(fh, seq_id, feat)

        if(not cfg.compliant):
            fh.write('##FASTA\n')
            for seq in data['sequences']:  # write sequences
                fh.write(f">{seq['id']}\n")
                seq_nt = seq['nt'] if 'nt' in seq else seq['sequence']  # <1.10.0 compatibility
                fh.write(fasta.wrap_sequence(seq_nt))
    return

`write_gene_feature(fh, seq_id, feat)`

Write a 'gene' feature including fuzzy boundaries.

Source code in src/baktfold/io/gff.py

def write_gene_feature(fh, seq_id, feat):
    """Write a 'gene' feature including fuzzy boundaries."""
    start = int(feat['start'])
    stop  = int(feat['stop'])
    strand = feat['strand']

    # fall back if there is no locus tag
    locus = feat.get('locus') or f"{seq_id}_{start}_{stop}_{strand}"

    attrs = {
        "ID": f"{locus}"
    }

    if feat.get('gene') is not None:
        attrs["Name"] = feat.get('gene')

    attr_str = ";".join(f"{k}={v}" for k, v in attrs.items())

    fh.write(f"{seq_id}\tbaktfold\tgene\t{start}\t{stop}\t.\t{strand}\t.\t{attr_str}\n")

`write_mrna_feature(fh, seq_id, feat)`

Write mRNA + implied exons based on join() structure.

Source code in src/baktfold/io/gff.py

def write_mrna_feature(fh, seq_id, feat):
    """Write mRNA + implied exons based on join() structure."""

    start = int(feat['start'])
    stop  = int(feat['stop'])
    strand = feat['strand']

    # fall back if there is no locus tag
    locus = feat.get('locus') or f"{seq_id}_{start}_{stop}_{strand}"

    mrna_id = f"{locus}-T1"

    # Top-level mRNA line
    attrs = {
        "ID": mrna_id,
        "Parent": f"{locus}",
    }

    product = feat.get("product", [])

    if product:

        key = "product"         
        if isinstance(product, list):
            if len(product) == 1:
                attrs[key] = str(product[0])
            else:
                attrs[key] = ",".join(str(v) for v in product)
        else:
            attrs[key] = str(product)


    # Ensure db_xrefs exists and is a list
    db_xrefs = feat.get("db_xrefs", [])

    # Access note safely
    note = feat.get("note", None)


    if db_xrefs:

        key = "Dbxref"         
        if isinstance(db_xrefs, list):
            if len(db_xrefs) == 1:
                attrs[key] = str(db_xrefs[0])
            else:
                attrs[key] = ",".join(str(v) for v in db_xrefs)
        else:
            # if somehow not a list, just convert to string
            attrs[key] = str(db_xrefs)

    if note:

        key = "note"         # <-- you must define this
        if isinstance(db_xrefs, list):
            if len(db_xrefs) == 1:
                attrs[key] = str(db_xrefs[0])
            else:
                attrs[key] = ",".join(str(v) for v in db_xrefs)
        else:
            # if somehow not a list, just convert to string
            attrs[key] = str(db_xrefs)


    attr_str = ";".join(f"{k}={v}" for k, v in attrs.items())

    fh.write(f"{seq_id}\tbaktfold\tmRNA\t{start}\t{stop}\t.\t{strand}\t.\t{attr_str}\n")

    starts = feat.get("starts")
    stops  = feat.get("stops")
    strand = feat.get("strand")
    seq_id = feat.get("sequence")

    if (
        isinstance(starts, list)
        and isinstance(stops, list)
        and len(starts) == len(stops)
        and len(starts) > 0
    ):
        # For minus strand, exons must be written in reverse order (5'→3')
        if strand == "-":
            exon_parts = list(zip(starts, stops))
        else:
            exon_parts = list(zip(starts, stops))

        # Exons must be numbered in biological order (5' to 3')
        if strand == "-":
            exon_parts = exon_parts[::-1]   # reverse order

        # Write each exon to GFF
        for idx, (ex_start, ex_stop) in enumerate(exon_parts, start=1):
            exon_id = f"{mrna_id}.exon{idx}"
            exon_attrs = f"ID={exon_id};Parent={mrna_id}"
            fh.write(
                f"{seq_id}\tbaktfold\texon\t{ex_start}\t{ex_stop}\t.\t{strand}\t.\t{exon_attrs}\n"
            )
    else:
        # Single exon (no starts/stops provided)
        exon_start = feat["start"]
        exon_stop = feat["stop"]
        exon_id = f"{mrna_id}.exon1"
        exon_attrs = f"ID={exon_id};Parent={mrna_id}"

        fh.write(
            f"{seq_id}\tbaktfold\texon\t{exon_start}\t{exon_stop}"
            f"\t.\t{feat['strand']}\t.\t{exon_attrs}\n"
        )

`write_signal_peptide(fh, feat)`

Writes a signal peptide feature to a file.

Parameters:

Name	Type	Description	Default
`fh`	`file`	The file handle to write to.	required
`feat`	`dict`	A dictionary containing the feature information.	required

Returns:

Type	Description
	None

Examples:

>>> write_signal_peptide(fh, {
    'locus': 'EHICP_3230',
    'sequence': 'DS571531.1',
    'strand': '+',
    'signal_peptide': {
        'start': 1,
        'stop': 20,
        'score': 0.5
    }
})

Source code in src/baktfold/io/gff.py

def write_signal_peptide(fh, feat: dict):  # <1.10.0 compatibility
    """
    Writes a signal peptide feature to a file.

    Args:
      fh (file): The file handle to write to.
      feat (dict): A dictionary containing the feature information.

    Returns:
      None

    Examples:
      >>> write_signal_peptide(fh, {
          'locus': 'EHICP_3230',
          'sequence': 'DS571531.1',
          'strand': '+',
          'signal_peptide': {
              'start': 1,
              'stop': 20,
              'score': 0.5
          }
      })
    """
    sig_peptide = feat[bc.FEATURE_SIGNAL_PEPTIDE]
    annotations = {
        'ID': f"{feat['locus']}_sigpep",
        'Name': 'signal peptide',
        'product': 'signal peptide',
        'score': sig_peptide['score'],
        'Parent': feat['locus']
    }
    annotations = encode_annotations(annotations)
    seq_id = feat['sequence'] if 'sequence' in feat else feat['contig']  # <1.10.0 compatibility
    fh.write(f"{seq_id}\tDeepSig\t{so.SO_SIGNAL_PEPTIDE.name}\t{sig_peptide['start']}\t{sig_peptide['stop']}\t{sig_peptide['score']:.2f}\t{feat['strand']}\t.\t{annotations}\n")

`export_sequences(sequences, fasta_path, description=False, wrap=False)`

Write sequences to Fasta file.

Source code in src/baktfold/io/fasta.py

def export_sequences(sequences: Sequence[dict], fasta_path: Path, description: bool=False, wrap: bool=False):
    """Write sequences to Fasta file."""
    logger.info(f'write genome sequences: path={fasta_path}, description={description}, wrap={wrap}')

    with fasta_path.open('wt') as fh:
        for seq in sequences:
            if(description):
                fh.write(f">{seq['id']} {seq['description']}\n")
            else:
                fh.write(f">{seq['id']}\n")
            if(wrap):
                fh.write(wrap_sequence(seq['nt'] if 'nt' in seq else seq['sequence']))  # <1.10.0 compatibility
            else:
                fh.write(seq['nt'])
                fh.write('\n')

`import_sequences(sequences_path, is_genomic=True, is_dna=True)`

Import raw sequences from Fasta file.

Source code in src/baktfold/io/fasta.py

def import_sequences(sequences_path: Path, is_genomic: bool=True, is_dna: bool=True) -> Sequence[dict]:
    """Import raw sequences from Fasta file."""
    sequences = []
    with xopen(str(sequences_path), threads=0) as fh:
        for record in SeqIO.parse(fh, 'fasta'):

            rid = record.id

            if "~PIPE~" in rid:
                logger.error(
                    f"Your proteins FASTA header has ~PIPE~ in the header"
                    "Please remove all instances of ~PIPE~ before running Baktfold as this creates downstream issues with Foldseek"
                )
            else:
                rid = rid.replace("|", "~PIPE~")

            sequence = {
                'id': rid,
                'description': record.description.split(' ', maxsplit=1)[1] if ' ' in record.description else ''
            }

            raw_sequence = str(record.seq).upper()
            if('-' in raw_sequence):
                dash_count = raw_sequence.count('-')
                raw_sequence = raw_sequence.replace('-', '')
                logger.info('import: Discarded alignment gaps (dashes): id=%s, occurences=%i', record.id, dash_count)
            if(is_dna):
                if(FASTA_DNA_SEQUENCE_PATTERN.fullmatch(raw_sequence) is None):
                    logger.error('import: Fasta sequence contains invalid DNA characters! id=%s', record.id)
                    raise ValueError(f'Fasta sequence contains invalid DNA characters! id={record.id}')
                sequence['nt'] = raw_sequence
            else:
                if(raw_sequence[-1] == '*'):  # remove trailing stop asterik
                    raw_sequence = raw_sequence[:-1]
                    logger.warning('import: Removed trailing asterik! id=%s, seq=%s', record.id, raw_sequence)
                if(FASTA_AA_SEQUENCE_PATTERN.fullmatch(raw_sequence) is None):
                    logger.error('import: Fasta sequence contains invalid AA characters! id=%s, seq=%s', record.id, raw_sequence)
                    raise ValueError(f'Fasta sequence contains invalid AA characters! id={record.id}')
                sequence['aa'] = raw_sequence
            sequence['length'] = len(raw_sequence)
            if(is_genomic):
                sequence['complete'] = False
                sequence['type'] = bc.REPLICON_CONTIG
                sequence['topology'] = bc.TOPOLOGY_LINEAR
            logger.info(
                f"imported: id={record.id}, length={sequence['length']}, description={sequence['description']}, genomic={is_genomic}, dna={is_dna}"
            )   
            sequences.append(sequence)
    return sequences

`wrap_sequence(sequence)`

Wraps a sequence into lines of 60 characters.

Parameters:

Name	Type	Description	Default
`sequence`	`str`	The sequence to wrap.	required

Returns:

Name	Type	Description
`str`		The wrapped sequence.

Examples:

>>> wrap_sequence('ARNDCQEGHILKMFPOSUTWYVBZXJ')
'ARNDCQEGHILKMFPOSUTWYVBZXJ\n'

Notes

This function is used to format sequences in FASTA files.

Source code in src/baktfold/io/fasta.py

def wrap_sequence(sequence: str):
    """
    Wraps a sequence into lines of 60 characters.

    Args:
      sequence (str): The sequence to wrap.

    Returns:
      str: The wrapped sequence.

    Examples:
      >>> wrap_sequence('ARNDCQEGHILKMFPOSUTWYVBZXJ')
      'ARNDCQEGHILKMFPOSUTWYVBZXJ\\n'

    Notes:
      This function is used to format sequences in FASTA files.
    """
    lines = []
    for i in range(0, len(sequence), FASTA_LINE_WRAPPING):
        lines.append(sequence[i:i + FASTA_LINE_WRAPPING])
    return '\n'.join(lines) + '\n'

`write_faa(features, faa_path)`

Write translated CDS sequences to Fasta file.

Source code in src/baktfold/io/fasta.py

def write_faa(features: Sequence[dict], faa_path: Path):
    """Write translated CDS sequences to Fasta file."""
    logger.info(f'write translated CDS/sORF: path={faa_path}')
    with faa_path.open('wt') as fh:
        for feat in features:
            if(feat['type'] == bc.FEATURE_CDS or feat['type'] == bc.FEATURE_SORF):
                fh.write(f">{feat['locus']} {feat['product']}\n{feat['aa']}\n")

`write_ffn(features, ffn_path)`

Write annotated nucleotide sequences to Fasta file.

Source code in src/baktfold/io/fasta.py

def write_ffn(features: Sequence[dict], ffn_path: Path):
    """Write annotated nucleotide sequences to Fasta file."""
    logger.info(f'write feature nucleotide sequences: path={ffn_path}')
    with ffn_path.open('wt') as fh:
        for feat in features:
            if(feat['type'] in [bc.FEATURE_T_RNA, bc.FEATURE_TM_RNA, bc.FEATURE_R_RNA, bc.FEATURE_NC_RNA, bc.FEATURE_NC_RNA_REGION, bc.FEATURE_CRISPR, bc.FEATURE_CDS, bc.FEATURE_SORF, bc.FEATURE_ORIC, bc.FEATURE_ORIV, bc.FEATURE_ORIT]):
                identifier = feat['locus'] if 'locus' in feat else feat['id']
                if(feat.get('product', '') != ''):
                    fh.write(f">{identifier} {feat['product']}\n{feat['nt']}\n")
                else:
                    fh.write(f">{identifier}\n{feat['nt']}\n")