Skip to content

Io

move_product_to_note_if_exists(qualifiers)

If a 'product' qualifier exists, append it to 'note' and remove 'product'.

Designed for the eukaryotic entries

Parameters

dict

Feature qualifiers dictionary (values are usually lists).

Returns

None Modifies qualifiers in place.

Source code in src/baktfold/io/insdc.py
def move_product_to_note_if_exists(qualifiers):
    """
    If a 'product' qualifier exists, append it to 'note' and remove 'product'.

    Designed for the eukaryotic entries

    Parameters
    ----------
    qualifiers : dict
        Feature qualifiers dictionary (values are usually lists).

    Returns
    -------
    None
        Modifies qualifiers in place.
    """
    product = qualifiers.get("product")
    if not product:
        return

    # Ensure note exists and is a list
    if "note" not in qualifiers:
        qualifiers["note"] = []

    if isinstance(product, list):
        qualifiers["note"].extend(product)
    else:
        qualifiers["note"].append(product)

    qualifiers.pop("product", None)

revise_dbxref_insdc(dbxrefs)

Remove INSDC non-compliant DbXrefs.

Source code in src/baktfold/io/insdc.py
def revise_dbxref_insdc(dbxrefs: Sequence[str]) -> Tuple[Sequence[str], Sequence[str]]:
    """Remove INSDC non-compliant DbXrefs."""
    insdc_valid_dbxrefs = [bc.DB_XREF_UNIPROTKB, bc.DB_XREF_GO, bc.DB_XREF_PFAM, bc.DB_XREF_RFAM]
    valid_dbxrefs = []
    invalid_dbxrefs = []
    for dbxref in dbxrefs:
        if(dbxref.split(':')[0] in insdc_valid_dbxrefs):
            valid_dbxrefs.append(dbxref)
        else:
            invalid_dbxrefs.append(dbxref)
    return valid_dbxrefs, invalid_dbxrefs

revise_product_insdc(product)

Revise product name for INSDC compliant submissions

Source code in src/baktfold/io/insdc.py
def revise_product_insdc(product: str):
    """Revise product name for INSDC compliant submissions"""

    old_product = product
    if(re.search(r'(uncharacteri[sz]ed)', product, flags=re.IGNORECASE)):  # replace putative synonyms)
        product = re.sub(r'(uncharacteri[sz]ed)', 'putative', product, flags=re.IGNORECASE)
        logger.info('fix product: replace putative synonyms. new=%s, old=%s', product, old_product)

    old_product = product
    if(product.count('(') != product.count(')')):  # remove unbalanced parentheses
        product = product.replace('(', '').replace(')', '')  # ToDo: find and replace only legend parentheses
        logger.info('fix product: remove unbalanced parantheses. new=%s, old=%s', product, old_product)

    old_product = product
    if(product.count('[') != product.count(']')):  # remove unbalanced brackets
        product = product.replace('[', '').replace(']', '')  # ToDo: find and replace only legend bracket
        logger.info('fix product: remove unbalanced brackets. new=%s, old=%s', product, old_product)

    return product

parse_json_input(input_path, faa_path, all_proteins, protein_json_flag)

Parses genome annotations from input JSON file.

Parameters:

Name Type Description Default
input_path str

Path to input JSON file.

required
faa_path str

Path to output file for hypothetical proteins.

required
all_proteins bool

Whether to keep all proteins or only hypothetical ones.

required
protein_json_flag bool

Whether input is protein JSON

required

Returns:

Name Type Description
tuple

A tuple containing the data, features, and whether there are duplicate locus tags.

Examples:

>>> parse_json_input('input.json', 'hypotheticals.faa', False, False)
(data, features, False, False)
Source code in src/baktfold/io/json_in.py
def parse_json_input(input_path, faa_path, all_proteins, protein_json_flag):
    """
    Parses genome annotations from input JSON file.

    Args:
      input_path (str): Path to input JSON file.
      faa_path (str): Path to output file for hypothetical proteins.
      all_proteins (bool): Whether to keep all proteins or only hypothetical ones.
      protein_json_flag (bool): Whether input is protein JSON

    Returns:
      tuple: A tuple containing the data, features, and whether there are duplicate locus tags.

    Examples:
      >>> parse_json_input('input.json', 'hypotheticals.faa', False, False)
      (data, features, False, False)
    """



    ############################################################################
    # Checks and configurations
    # - check parameters and setup global configuration
    # - test database
    # - test binary dependencies
    ############################################################################

    try:
        if input_path == '':
            raise ValueError('File path argument must be non-empty')
        annotation_path = Path(input_path).resolve()
        cfg.check_readability('annotation', annotation_path)
        cfg.check_content_size('annotation', annotation_path)
    except:
        logger.error(f'ERROR: annotation file {annotation_path} not valid!')

    #print(f'baktfold v{cfg.version}')

    logger.info(f'Parsing annotations from input: {annotation_path}')
    with xopen(str(annotation_path), threads=0) as fh:
        data = json.load(fh)


    features = data['features']

    # features_by_sequence = {seq['id']: [] for seq in data['sequences']}
    # for feature in data['features']:
    #     seq_id = feature['sequence'] if 'sequence' in feature else feature['contig']  # <1.10.0 compatibility
    #     sequence_features = features_by_sequence.get(seq_id)
    #     sequence_features.append(feature)

    # keep all proteins
    if all_proteins:
        hypotheticals = [feat for feat in features if feat['type'] == bc.FEATURE_CDS ]
    else:
        hypotheticals = [feat for feat in features if feat['type'] == bc.FEATURE_CDS and 'hypothetical' in feat]


    if protein_json_flag: # this will also be only hypotheticals if protein mode (or else why not just run with the FASTA)
        version = data.get("version", {})
        return features, hypotheticals, version


    # check if dupe locus tags (euks can have multiple CDS same locus tag e.g. Cladocopium goreaui CAMXCT020000001.1)
    seen_loci = set()
    has_duplicate_locus = False

    for feat in hypotheticals:
        locus = feat['locus']
        if locus in seen_loci:
            has_duplicate_locus = True
            logger.warning("Multiple CDS per locus tag were detected in your input JSON.")
            logger.warning("CDS id (which is unique) rather than locus tag will be used for ProstT5+Foldseek searches.")
            break
        seen_loci.add(locus)

    # this is done after getting all the sequences into the dict for baktfold proteins

    if has_duplicate_locus:
        # write hypothetical proteins to file with id (not locus) as guaranteed exists and unique
        with faa_path.open('wt') as fh:
            for feat in hypotheticals:
                fh.write(f">{feat['id']}\n{feat['aa']}\n")

    else:
        # write hypothetical proteins to file - almost always
        with faa_path.open('wt') as fh:
            for feat in hypotheticals:
                fh.write(f">{feat['locus']}\n{feat['aa']}\n")

    # none of this is relevant for proteins
    try:
        genome_block = data.get("genome")

        if genome_block is None:
            logger.error("No 'genome' block found in input JSON. Please check.")
            translation_table = None
        else:
            if "translation_table" not in genome_block:
                logger.error("No translation table found in input JSON. Please check your input.")
            else:
                raw_value = genome_block["translation_table"]

                try:
                    translation_table = int(raw_value)
                    logger.info(
                        f"Translation table {translation_table} detected from input JSON"
                    )

                except (ValueError, TypeError):
                    translation_table = str(raw_value)
                    logger.warning(
                        f"Translation table '{raw_value}' is not an integer. "
                        f"Parsing it as a string."
                    )

    except Exception as e:
        logger.exception(
            f"Unexpected error while parsing translation table: {e}"
        )
        translation_table = None

    # input detection

    version = data.get("version", {})

    prokka = False
    other_genbank = False

    if "prokka" in version:
        prokka = True
        logger.info("Prokka input detected")
    if  "prokka"  not in version and "bakta" not in version:
        other_genbank = True

    logger.info('Parsing complete')

    return data, features, has_duplicate_locus, translation_table, prokka, other_genbank, version

build_bakta_sequence_entry(rec)

Convert a SeqRecord into a Bakta-style sequence entry. Missing fields are filled with None.

Source code in src/baktfold/io/prokka_gbk_to_json.py
def build_bakta_sequence_entry(rec):
    """
    Convert a  SeqRecord into a Bakta-style sequence entry.
    Missing fields are filled with None.
    """

    seq = str(rec.seq)

    # -----------------------------------------
    # Extract source feature qualifiers - genbank always has source field
    # -----------------------------------------
    source_feat = next((f for f in rec.features if f.type == "source"), None)

    source_qualifiers = {}

    # Defaults (None) for all fields
    mol_type = None
    organism = None
    strain = None
    db_xref = None
    note = None

    plasmid = None
    chromosome = None
    completeness_hint = None

    if source_feat:
        q = source_feat.qualifiers

        mol_type = q.get("mol_type", [None])[0]
        organism = q.get("organism", [None])[0]
        strain = q.get("strain", [None])[0]
        note = q.get("note", [None])[0]

        if "db_xref" in q:
            val = q["db_xref"]
            db_xref = val[0] if len(val) == 1 else val

        plasmid = q.get("plasmid", [None])[0]
        chromosome = q.get("chromosome", [None])[0]
        completeness_hint = q.get("completeness", [None])[0]

    # -----------------------------------------
    # Infer topology
    # -----------------------------------------
    topology = rec.annotations.get("topology")
    if topology not in {"linear", "circular"}:
        topology = "linear"

    # -----------------------------------------
    # Infer type
    # -----------------------------------------
    if plasmid is not None or "plasmid" in rec.annotations:
        seq_type = "plasmid"
    elif chromosome is not None or "chromosome" in rec.annotations:
        seq_type = "chromosome"
    else:
        seq_type = "contig"

    # -----------------------------------------
    # Infer completeness (conservative)
    # -----------------------------------------
    complete = False

    if topology == "circular":
        complete = True
    elif completeness_hint is not None and completeness_hint.lower() == "complete":
        complete = True
    elif note and "complete genome" in note.lower():
        complete = True

    # -----------------------------------------
    # Infer genetic codefor description
    # -----------------------------------------
    gcode = None

    if "genetic_code" in rec.annotations:
        gcode = rec.annotations["genetic_code"]
    elif "gcode" in rec.annotations:
        gcode = rec.annotations["gcode"]
    elif source_feat and "transl_table" in source_feat.qualifiers:
        gcode = source_feat.qualifiers["transl_table"][0]

    # Conservative fallback to 11 for prokka
    if gcode is None:
        gcode = 11 

    description_parts = [
        f"[gcode={gcode}]",
        f"[topology={topology}]",
    ]

    description = " ".join(description_parts)

    # -----------------------------------------
    # Build entry
    # -----------------------------------------
    entry = {
        "id": rec.id,
        "description": description,
        "nt": seq,
        "length": len(seq),
        "complete": complete,
        "type": seq_type,
        "topology": topology,
        "simple_id": rec.id,
        "orig_id": rec.id,
        "orig_description": None,
    }

    # -----------------------------------------
    # Add source qualifiers if present
    # -----------------------------------------
    if organism is not None:
        entry["organism"] = organism
    if mol_type is not None:
        entry["mol_type"] = mol_type
    if strain is not None:
        entry["strain"] = strain
    if db_xref is not None:
        entry["db_xref"] = db_xref
    if note is not None:
        entry["note"] = note


    # this is from bakta
    # "id": "contig_1",
    # "description": "[gcode=11] [topology=linear]",
    # "nt": "AT"
    # "length": 5165988,
    # "complete": false,
    # "type": "contig",
    # "topology": "linear",
    # "simple_id": "contig_1",
    # "orig_id": "GCF_002368115_000000000001",
    # "orig_description": ""

    # Add source qualifiers only if they exist
    if organism is not None:
        entry["organism"] = organism

    if mol_type is not None:
        entry["mol_type"] = mol_type

    if strain is not None:
        entry["strain"] = strain

    if db_xref is not None:
        entry["db_xref"] = db_xref

    if note is not None:
        entry["note"] = note

    return entry

calc_genome_stats(records)

Compute correct genome stats (size, GC, N-ratio, N50, N90) for records from a multi-contig Prokka GenBank file.

Source code in src/baktfold/io/prokka_gbk_to_json.py
def calc_genome_stats(records):
    """
    Compute correct genome stats (size, GC, N-ratio, N50, N90) for records from a multi-contig
    Prokka GenBank file.
    """

    if not records:
        raise ValueError("No GenBank records found.")

    # lengths of all contigs
    contig_lengths = [len(r.seq) for r in records]
    total_length = sum(contig_lengths)

    # concatenate sequences for global GC + N calculation
    full_seq = "".join(str(r.seq) for r in records)

    # GC as fraction (Bakta wants 0–1)
    gc_perc = gc_fraction(full_seq)

    # N-ratio
    n_ratio = full_seq.count("N") / total_length

    # ---------- N50 / N90 ----------
    sorted_lengths = sorted(contig_lengths, reverse=True)

    def nx_metric(sorted_lens, total, threshold):
        """
        Generic N{threshold} function.
        threshold: 0.5 for N50, 0.9 for N90
        """
        cutoff = total * threshold
        running = 0
        for l in sorted_lens:
            running += l
            if running >= cutoff:
                return l
        return sorted_lens[-1]  # fallback (should not happen)

    n50 = nx_metric(sorted_lengths, total_length, 0.5)
    n90 = nx_metric(sorted_lengths, total_length, 0.9)

    return {
        "size": total_length,
        "gc": gc_perc,
        "n_ratio": n_ratio,
        "n50": n50,
        "n90": n90,
        "coding_ratio": None  
    }

convert_assembly_gap_feature(feature, rec, id)

Convert a Prokka GenBank assembly_gap feature to a simplified Bakta-style 'gap' feature.

Parameters:

Name Type Description Default
feature

Bio.SeqFeature The assembly_gap feature from the Prokka GBK.

required
rec

Bio.SeqRecord The full GenBank record containing the sequence.

required

Returns:

Name Type Description
dict

Simplified Bakta-style gap feature.

Source code in src/baktfold/io/prokka_gbk_to_json.py
def convert_assembly_gap_feature(feature, rec, id):
    """
    Convert a Prokka GenBank assembly_gap feature to a simplified Bakta-style 'gap' feature.

    Parameters:
        feature: Bio.SeqFeature
            The assembly_gap feature from the Prokka GBK.
        rec: Bio.SeqRecord
            The full GenBank record containing the sequence.

    Returns:
        dict: Simplified Bakta-style gap feature.
    """

    # Coordinates (1-based)
    start = int(feature.location.start) + 1
    stop = int(feature.location.end)

    qualifiers = feature.qualifiers

    # Prokka may provide estimated_length but coordinates already give an exact span
    est_len = qualifiers.get("estimated_length", [None])[0]
    if est_len is not None:
        length = int(est_len)
    else:
        length = stop - start + 1  # fallback from coordinates

    # Bakta always uses "." for strand on gaps
    strand = "."

    gap_entry = {
        "type": "gap",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "length": length,
        "id": id
    }

    return gap_entry

convert_cds_feature(feature, seq_record, translation_table, id)

Convert a Prokka CDS Biopython SeqFeature to a Bakta CDS JSON entry.

Source code in src/baktfold/io/prokka_gbk_to_json.py
def convert_cds_feature(feature, seq_record, translation_table, id):
    """
    Convert a Prokka CDS Biopython SeqFeature to a Bakta CDS JSON entry.
    """

    # ----------- Location info -----------
    start = int(feature.location.start) + 1     # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)           # already one past in BioPython
    strand = "+" if feature.location.strand == 1 else "-"

    # frame: Bakta uses 1/2/3; Prokka codon_start is ["1","2","3"]
    codon_start = int(feature.qualifiers.get("codon_start", ["1"])[0])
    frame = codon_start

    # ----------- Basic qualifiers -----------
    gene = feature.qualifiers.get("gene", [None])[0]
    product = feature.qualifiers.get("product", [None])[0]

    locus_tag = feature.qualifiers.get("locus_tag", [None])[0]
    locus = locus_tag

    # ----------- Extract nucleotides -----------
    nt_seq = feature.extract(seq_record.seq)
    nt = str(nt_seq)

    # ----------- Extract amino acids -----------
    aa = feature.qualifiers.get("translation", [""])[0]

    # Compute translation if Prokka didn't provide it
    if not aa:
        try:
            aa = str(nt_seq.translate(table=translation_table, cds=True))
        except Exception:
            aa = ""

    # ----------- aa MD5 hexdigest -----------
    aa_hexdigest = hashlib.md5(aa.encode()).hexdigest()

    # ----------- Hypothetical? -----------
    hypothetical = product is None or "hypothetical protein" in product.lower()

    # ----------- Compute protein stats -----------
    seq_stats = None
    if aa:
        try:
            analysed = ProteinAnalysis(aa)
            seq_stats = {
                "molecular_weight": analysed.molecular_weight(),
                "isoelectric_point": analysed.isoelectric_point()
            }
        except Exception:
            seq_stats = None

    # ----------- Make Bakta-format dict -----------
    bakta_cds = {
        "type": "cds",
        "sequence": seq_record.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "frame": frame,
        "gene": gene,
        "product": product,
        "db_xrefs": feature.qualifiers.get("db_xref", [so.SO_CDS.id]),  # there will be no other db_xref 
        "nt": nt,
        "aa": aa,
        "aa_hexdigest": aa_hexdigest,
        "start_type": None,
        "rbs_motif": None,
        "genes": [],
        "seq_stats": seq_stats,
        "id": id,
        "locus": locus,
    }

    if hypothetical:
        bakta_cds["hypothetical"] = True

    return bakta_cds

convert_misc_rna_feature(feature, rec, id)

Convert a Prokka GenBank misc_RNA (nc_rna) feature to a Bakta-style feature.

Parameters:

Name Type Description Default
feature

Bio.SeqFeature The misc_RNA feature from the Prokka GBK.

required
rec

Bio.SeqRecord The record containing the sequence.

required

Returns:

Name Type Description
dict

Bakta-style misc_RNA feature.

Source code in src/baktfold/io/prokka_gbk_to_json.py
def convert_misc_rna_feature(feature, rec, id):
    """
    Convert a Prokka GenBank misc_RNA (nc_rna) feature to a Bakta-style feature.

    Parameters:
        feature: Bio.SeqFeature
            The misc_RNA feature from the Prokka GBK.
        rec: Bio.SeqRecord
            The record containing the sequence.

    Returns:
        dict: Bakta-style misc_RNA feature.
    """

    seq = str(rec.seq)

    # Coordinates (GBK is 0-based, Bakta is 1-based)
    start = int(feature.location.start) + 1
    stop = int(feature.location.end)
    strand = "+" if feature.location.strand == 1 else "-"

    qualifiers = feature.qualifiers

    # Fields that Prokka may or may not include
    gene = qualifiers.get("gene", [None])[0]
    product = qualifiers.get("product", [None])[0]
    locus_tag = qualifiers.get("locus_tag", [None])[0]

    # If gene missing, use product (Bakta often fills 'gene' for sRNA/tmRNA/etc.)
    if gene is None:
        gene = product

    # Extract nucleotide sequence
    nt_seq = seq[start-1:stop]
    if strand == "-":
        comp = str.maketrans("ACGTacgt", "TGCAtgca")
        nt_seq = nt_seq.translate(comp)[::-1]

    misc_entry = {
        "type": "ncRNA", # bakta uses ncRNA -> will be misc_rna in Prokka
        "class": None,             # bakta's classes are not in Prokka
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "gene": gene,
        "product": product,
        "score": None,
        "evalue": None,
        "db_xrefs": [so.SO_NCRNA_GENE.id],        
        "nt": nt_seq,
        "id": id,
        "locus": locus_tag
    }

    return misc_entry

convert_repeat_region_feature(feature, rec, id)

Convert a Prokka GenBank repeat_region (CRISPR) feature to a simplified Bakta-style feature.

Parameters:

Name Type Description Default
feature

Bio.SeqFeature The repeat_region feature (crispr) from the Prokka GBK.

required
rec

Bio.SeqRecord The full GenBank record containing the sequence.

required

Returns:

Name Type Description
dict

Simplified Bakta-style CRISPR feature.

Source code in src/baktfold/io/prokka_gbk_to_json.py
def convert_repeat_region_feature(feature, rec, id):
    """
    Convert a Prokka GenBank repeat_region (CRISPR) feature to a simplified Bakta-style feature.

    Parameters:
        feature: Bio.SeqFeature
            The repeat_region feature (crispr) from the Prokka GBK.
        rec: Bio.SeqRecord
            The full GenBank record containing the sequence.

    Returns:
        dict: Simplified Bakta-style CRISPR feature.
    """

    # Coordinates (Bakta uses 1-based)
    start = int(feature.location.start) + 1
    stop = int(feature.location.end)

    qualifiers = feature.qualifiers
    note = qualifiers.get("note", [None])[0]
    rpt_family = qualifiers.get("rpt_family", [None])[0]
    rpt_type = qualifiers.get("rpt_type", [None])[0]
    rpt_unit_seq = qualifiers.get("rpt_unit_seq", [None])[0]

    strand = "?"

    # always just take the positive strand to get the NT seq (crispr repeat region)
    seq =  str(rec.seq)
    nt_seq = seq[start-1:stop]

    # Minimal Bakta-like CRISPR structure
    crispr_entry = {
        "type": "crispr",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand, # matches Bakta and is required
        "family": rpt_family,       # e.g., "CRISPR"
        "rpt_type": rpt_type,       # e.g., "direct"
        "repeat_unit": rpt_unit_seq, # the actual consensus repeat
        "product": note, # won't be the same as Bakta as different lookup method used - but needed for the gff writing
        "nt": nt_seq, # needed for batka .ffn writeout
        "id": id, # bakta_id needed 
        # "locus": None, # no locus tag like Bakta
        "db_xrefs": [so.SO_CRISPR.id]
    }

    return crispr_entry

convert_rrna_feature(feature, rec, id)

Convert a Prokka GenBank rRNA feature to Bakta-style JSON.

Parameters:

Name Type Description Default
feature

Bio.SeqFeature The rRNA feature from the Prokka GBK.

required
rec

str The record from the GBK.

required

Returns:

Name Type Description
dict

Bakta-style rRNA feature

Source code in src/baktfold/io/prokka_gbk_to_json.py
def convert_rrna_feature(feature, rec, id):
    """
    Convert a Prokka GenBank rRNA feature to Bakta-style JSON.

    Parameters:
        feature: Bio.SeqFeature
            The rRNA feature from the Prokka GBK.
        rec: str
            The record from the GBK.
    Returns:
        dict: Bakta-style rRNA feature
    """
    start = int(feature.location.start) + 1  # GBK is 0-based
    stop = int(feature.location.end)
    strand = "+" if feature.location.strand == 1 else "-"

    qualifiers = feature.qualifiers
    product = qualifiers.get("product", [None])[0]
    locus_tag = qualifiers.get("locus_tag", [None])[0]

    # Infer gene type from product if possible
    gene_map = {
        "16S ribosomal RNA": "rrs",
        "23S ribosomal RNA": "rrl",
        "5S ribosomal RNA": "rrf"
    }
    gene = gene_map.get(product, None)

    so_map = {
        "16S ribosomal RNA": so.SO_RRNA_16S.id,
        "23S ribosomal RNA": so.SO_RRNA_23S.id,
        "5S ribosomal RNA": so.SO_RRNA_5S.id
    }

    specific_so = so_map.get(product, None)

    contig_seq = str(rec.seq)

    nt_seq = contig_seq[start-1:stop]
    if strand == "-":
        comp = str.maketrans("ACGTacgt", "TGCAtgca")
        nt_seq = nt_seq.translate(comp)[::-1]

    rrna_entry = {
        "type": "rRNA",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "gene": gene,
        "product": product,
        "coverage": None,  # Prokka does not provide
        "score": None,     # Prokka does not provide
        "evalue": None,    # Prokka does not provide
        "db_xrefs": [so.SO_RRNA.id, specific_so], 
        "nt": nt_seq,
        "id": id,
        "locus": locus_tag
    }

    return rrna_entry

convert_tmrna_feature(feature, rec, id)

Convert a Prokka GenBank tmRNA feature to Bakta-style feature.

Parameters:

Name Type Description Default
feature

Bio.SeqFeature The tmRNA feature from the Prokka GBK.

required
rec

str The record from the GBK

required

Returns:

Name Type Description
dict

Bakta-style tmRNA feature

Source code in src/baktfold/io/prokka_gbk_to_json.py
def convert_tmrna_feature(feature, rec, id):
    """
    Convert a Prokka GenBank tmRNA feature to Bakta-style feature.

    Parameters:
        feature: Bio.SeqFeature
            The tmRNA feature from the Prokka GBK.
        rec: str
            The record from the GBK

    Returns:
        dict: Bakta-style tmRNA feature
    """

    seq =  str(rec.seq)

    start = int(feature.location.start) + 1  # GBK is 0-based
    stop = int(feature.location.end)
    strand = "+" if feature.location.strand == 1 else "-"

    qualifiers = feature.qualifiers
    gene = qualifiers.get("gene", [None])[0]
    locus_tag = qualifiers.get("locus_tag", [None])[0]
    product = qualifiers.get("product", [None])[0]

    # Extract the nucleotide sequence of the tmRNA
    nt_seq = seq[start-1:stop]
    if strand == "-":
        comp = str.maketrans("ACGTacgt", "TGCAtgca")
        nt_seq = nt_seq.translate(comp)[::-1]



    tmrna_entry = {
        "type": "tmRNA",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "gene": gene,
        "product": product,
        "db_xrefs": [so.SO_TMRNA.id], 
        # "tag": tag_info,  no tag in tmrna for prokka - no information on it in the output
        "nt": nt_seq,
        "id": id,
        "locus": locus_tag
    }

    return tmrna_entry

convert_trna_feature(feature, seq_record, id)

Convert a Prokka tRNA SeqFeature to a Bakta tRNA JSON entry.

Source code in src/baktfold/io/prokka_gbk_to_json.py
def convert_trna_feature(feature, seq_record, id):
    """
    Convert a Prokka tRNA SeqFeature to a Bakta tRNA JSON entry.
    """

    # ------------ Location ------------
    start = int(feature.location.start) + 1
    stop  = int(feature.location.end)
    strand = "+" if feature.location.strand == 1 else "-"

    # ------------ Extract nt sequence ------------
    nt_seq = feature.extract(seq_record.seq)
    nt = str(nt_seq)

    # ------------ Basic qualifiers ------------
    product = feature.qualifiers.get("product", [None])[0]
    locus = feature.qualifiers.get("locus_tag", [None])[0]

    # ------------ amino acid ------------
    # Prokka product examples:
    #   "tRNA-Trp"
    #   "tRNA-Leu"
    amino_acid = None
    if product and product.startswith("tRNA-"):
        amino_acid = product.split("-")[1]


    # ------------ anticodon ------------
    anti_codon = None

    # anticodons are in notes

    notes = feature.qualifiers.get("note", [])

    # Expect a note like: "tRNA-Ser(gga)"
    for note in notes:
        # Remove spaces for safety
        n = note.replace(" ", "")

        # Extract part inside parentheses (anticodon)
        if "(" in n and ")" in n:
            anti_codon = n.split("(")[1].split(")")[0].lower()

        # Extract amino acid:
        # tRNA-Ser(gga) → "Ser"
        if "tRNA-" in n:
            try:
                # tRNA-Ser(gga) → "Ser(gga)" → split('(')[0] → "Ser"
                aa_section = n.split("tRNA-")[1]
                aa_clean = aa_section.split("(")[0]
                amino_acid = aa_clean
            except Exception:
                pass

    # ------------ Anti-codon position detection ------------
    # Prokka doesnt have it - dont include
    # anti_codon_pos = None

    # ------------ score ------------
    # nothing in prokka
    score = None

    # ------------ db_xrefs ------------
    # doesnt exist for prokka
    db_xrefs = feature.qualifiers.get("db_xref", [])
    # add so_term
    so_term = AMINO_ACID_DICT.get(amino_acid.lower(), ('', None))[1]

    if (so_term):
        db_xrefs.append(so_term.id)

    # ------------ final Bakta-form dict ------------
    bakta_trna = {
        "type": "tRNA",
        "sequence": seq_record.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "gene": "trn" + (amino_acid[0].lower() if amino_acid else "?"),
        "product": product,
        "amino_acid": amino_acid,
        "anti_codon": anti_codon,
        "score": score,
        "nt": nt,
        "db_xrefs": db_xrefs,
       #  "anti_codon_pos": anti_codon_pos,  dont include, not in prokka output
        "id": id,
        "locus": locus
    }

    return bakta_trna

get_bakta_style_id_from_locus_tag(records)

Gets 10 char bakta-style ID tag based off the 8 char locus tag in first CDS on the first Prokka record + 2 random chars

Assumes all records will have the same locus tag prefix

Will always add 2 chars to make ID unique vs locus tag

Source code in src/baktfold/io/prokka_gbk_to_json.py
def get_bakta_style_id_from_locus_tag(records):
    """
    Gets 10 char bakta-style ID tag based off the 8 char locus tag in first CDS on the first Prokka record + 2 random chars

    Assumes all records will have the same locus tag prefix

    Will always add 2 chars to make ID unique vs locus tag
    """

    if not records:
        raise ValueError("No GenBank records found.")

    for record in records:

        for feat in record.features:
            if feat.type == "CDS":
                locus_tag_list = feat.qualifiers.get("locus_tag") # returns None if doesn't exist

                if locus_tag_list:
                    locus_tag = locus_tag_list[0]

                    if len(locus_tag) > 6:

                        locus_tag_prefix = locus_tag[:-6] # trims off _00001 from CDS

                        rand_two_chars = random_n_letter_id(2)

                        # by default prokka locus tag is 8 chars. So this returns a 10 char string (same as bakta defaults)

                        id_tag = f"{locus_tag_prefix}{rand_two_chars}"

                        return id_tag


                    else:
                        return random_n_letter_id(10)

                # fallback if locus_tag missing or too short
                return random_n_letter_id(10)

    # No CDS feature found at all (shouldn't happen)
    return random_n_letter_id(10)

get_transl_table(records)

Gets translation table based off the first CDS on the first record

Source code in src/baktfold/io/prokka_gbk_to_json.py
def get_transl_table(records):
    """
    Gets translation table based off the first CDS on the first record
    """

    if not records:
        raise ValueError("No GenBank records found.")

    record_1 = records[0]

    for feat in record_1.features:
        if feat.type == "CDS":
            # Translation table may be string → convert to int
            transl = feat.qualifiers.get("transl_table", ["11"])[0]
            try:
                return int(transl)
            except ValueError:
                return 11

        # If no CDS found, default to 11
        return 11

parse_prokka_version(record)

Extract Prokka version from COMMENT field: Example COMMENT: 'Annotated using prokka 1.14.6 ...'

Source code in src/baktfold/io/prokka_gbk_to_json.py
def parse_prokka_version(record):
    """
    Extract Prokka version from COMMENT field:
    Example COMMENT:
    'Annotated using prokka 1.14.6 ...'
    """

    comments = record.annotations.get("comment", "") or record.annotations.get("comments", "")
    if not comments:
        return "unknown"

    m = re.search(r"[Pp]rokka[\s_]?v?(\d+\.\d+\.\d+)", comments)
    if m:
        return m.group(1)

    # fallback pattern
    m = re.search(r"prokka[^0-9]*(\d+\.\d+\.\d+)", comments)
    if m:
        return m.group(1)

    return "unknown"

prokka_gbk_to_json(records, output_json)

Convert Prokka-generated GenBank SeqRecord objects into a Bakta-style JSON annotation file.

This function takes one or more Biopython SeqRecord objects (typically parsed from a Prokka GenBank file) and reconstructs a JSON structure following the Bakta output schema. It extracts genome metadata, statistics, annotated features, nucleotide sequences, and Prokka version information. Features are converted to Bakta-compatible dictionaries and sorted in the same order Bakta expects.

The JSON file contains

• genome block – high-level organism metadata (genus, species, strain, etc.) • stats block – genome statistics derived from all records • features block – all features converted to Bakta-style objects, sorted by feature type and genomic position • sequences block – contig/sequence entries in Bakta format • run block – timestamps and duration placeholder • version block – Prokka version and database metadata

Parameters

list of SeqRecord

A list of Biopython SeqRecord objects already parsed from a Prokka GenBank file. Must contain at least one record. The COMMENT field is expected to contain Prokka metadata.

str

Path to the output JSON file to be written.

Returns

bool True if the JSON was successfully written.

Raises

ValueError If records is empty.

Notes

• Features are processed in a fixed Bakta-like order: ["tRNA", "tmRNA", "rRNA", "misc_RNA", "repeat_region", "CDS", "assembly_gap"] • Feature IDs are generated in Bakta-style using the locus tag prefix. • Per-contig sorting is performed by genomic start coordinate. • Runtime values in the run block are placeholders (duration = "0.00 min"). • The function does not validate that the GenBank file truly originates from Prokka; that should be checked beforehand.

Source code in src/baktfold/io/prokka_gbk_to_json.py
def prokka_gbk_to_json(records, output_json):
    """
    Convert Prokka-generated GenBank SeqRecord objects into a Bakta-style JSON
    annotation file.

    This function takes one or more Biopython SeqRecord objects (typically parsed
    from a Prokka GenBank file) and reconstructs a JSON structure following the
    Bakta output schema. It extracts genome metadata, statistics, annotated features,
    nucleotide sequences, and Prokka version information. Features are converted
    to Bakta-compatible dictionaries and sorted in the same order Bakta expects.

    The JSON file contains:
      • `genome` block – high-level organism metadata (genus, species, strain, etc.)
      • `stats` block – genome statistics derived from all records
      • `features` block – all features converted to Bakta-style objects, sorted
        by feature type and genomic position
      • `sequences` block – contig/sequence entries in Bakta format
      • `run` block – timestamps and duration placeholder
      • `version` block – Prokka version and database metadata

    Parameters
    ----------
    records : list of SeqRecord
        A list of Biopython SeqRecord objects already parsed from a Prokka GenBank
        file. Must contain at least one record. The COMMENT field is expected to
        contain Prokka metadata.

    output_json : str
        Path to the output JSON file to be written.

    Returns
    -------
    bool
        True if the JSON was successfully written.

    Raises
    ------
    ValueError
        If `records` is empty.

    Notes
    -----
    • Features are processed in a fixed Bakta-like order:
        ["tRNA", "tmRNA", "rRNA", "misc_RNA", "repeat_region", "CDS", "assembly_gap"]
    • Feature IDs are generated in Bakta-style using the locus tag prefix.
    • Per-contig sorting is performed by genomic start coordinate.
    • Runtime values in the `run` block are placeholders (duration = "0.00 min").
    • The function does not validate that the GenBank file truly originates from
      Prokka; that should be checked beforehand.
    """

    complete = False

    # records = list(SeqIO.parse(genbank_path, "genbank"))

    if len(records) == 0:
        raise ValueError("No GenBank records found.")
    elif len(records) >= 1:
        prokka_version = parse_prokka_version(records[0])

    translation_table = get_transl_table(records)

    bakta_id_prefix = get_bakta_style_id_from_locus_tag(records)

    # ----------------------------
    # Genome block 
    # ----------------------------

    genome_block = {
        "genus": None,
        "species": None,
        "strain": None,
        "taxon": None,
        "complete": True,
        "gram": "?",
        "translation_table": translation_table
    }

    # ----------------------------
    # Stats - on whole GBK
    # ----------------------------
    stats_block = calc_genome_stats(records)

    # ----------------------------
    # Features block
    # ----------------------------

    # order by id creation block to match how ids are generated bakta
    ORDER = ["tRNA", "tmRNA", "rRNA", "misc_RNA", "repeat_region", "CDS", "assembly_gap"]
    # bakta has oriC detection too - prokka doesn't I think so leaving it out 

    features = []
    i = 1
    for rec in records:
        for ftype in ORDER:
            for feat in rec.features:
                if feat.type != ftype:
                    continue

                id = f"{bakta_id_prefix}_{i}"

                if ftype == "CDS":
                    features.append(convert_cds_feature(feat, rec, translation_table, id))
                elif ftype == "tRNA":
                    features.append(convert_trna_feature(feat, rec, id))
                elif ftype == "tmRNA":
                    features.append(convert_tmrna_feature(feat, rec, id))
                elif ftype == "rRNA":
                    features.append(convert_rrna_feature(feat, rec, id))
                elif ftype == "misc_RNA":
                    features.append(convert_misc_rna_feature(feat, rec, id))
                elif ftype == "repeat_region":
                    features.append(convert_repeat_region_feature(feat, rec, id))
                elif ftype == "assembly_gap":
                    features.append(convert_assembly_gap_feature(feat, rec, id))

                i +=1


    # ----------------------------
    # Sort features within each contig like Bakta
    # ----------------------------

    features_by_contig = defaultdict(list)
    for f in features:
        features_by_contig[f["sequence"]].append(f)

    # Sort each contig's features by start and flatten back
    sorted_features = []
    for contig_id in features_by_contig:
        sorted_features.extend(sorted(features_by_contig[contig_id], key=lambda x: x["start"]))

    # Replace the original features list
    features = sorted_features

    # ----------------------------
    # Sequences block
    # ----------------------------

    sequences = []
    for rec in records:
        sequences.append(build_bakta_sequence_entry(rec))

    # just to put in a time
    start_time = datetime.now()

    bakta_json = {
        "genome": genome_block,
        "stats": stats_block,
        "features": features,
        "sequences": sequences,
        "run": {
            "start": start_time.strftime("%Y-%m-%d %H:%M:%S"),
            "end": start_time.strftime("%Y-%m-%d %H:%M:%S"),       
            "duration": "0.00 min"   
        },
        "version": {
            "prokka": prokka_version,  
            "db": {
                "version": prokka_version,
                "type": "prokka_dbs"
            }
        }
    }


    Path(output_json).parent.mkdir(parents=True, exist_ok=True)
    with open(output_json, "w") as fh:
        json.dump(bakta_json, fh, indent=4)
        complete = True

    return complete

random_n_letter_id(n=4)

generates a n letter id prefix

n=2 to append to Prokka locus tag for bakta id to make it different n=10 if the locus tag is somehow missing (should never happen)

Source code in src/baktfold/io/prokka_gbk_to_json.py
def random_n_letter_id(n=4):
    """
    generates a n letter id prefix 

    n=2 to append to  Prokka locus tag  for bakta id to make it different
    n=10 if the locus tag is somehow missing (should never happen) 
    """
    return ''.join(random.choices(string.ascii_uppercase, k=n))

Module for manipulating genbank files some taken from phynteny https://github.com/susiegriggo/Phynteny

get_genbank(genbank)

Convert a GenBank file to a dictionary.

This function reads a GenBank file and converts it into a dictionary.

Parameters:

Name Type Description Default
genbank Path

Path to the GenBank file.

required

Returns:

Name Type Description
dict dict

A dictionary representation of the GenBank file.

Raises:

Type Description
ValueError

If the provided file is not a GenBank file.

Source code in src/baktfold/io/handle_genbank.py
def get_genbank(genbank: Path) -> dict:
    """
    Convert a GenBank file to a dictionary.

    This function reads a GenBank file and converts it into a dictionary.

    Args:
        genbank (Path): Path to the GenBank file.

    Returns:
        dict: A dictionary representation of the GenBank file.

    Raises:
        ValueError: If the provided file is not a GenBank file.
    """

    logger.info(f"Checking if input {genbank} is a Genbank format file")
    logger.info(f"If so, also detecting the likely input style out of Pharokka, Bakta and NCBI Refseq style.")
    def parse_records(handle):
        """
    Parses a genbank file and returns a list of SeqRecords.

    Args:
      file_path (str): The path to the genbank file to parse.
      file_format (str): The format of the genbank file. Defaults to 'genbank'.

    Returns:
      list: A list of SeqRecords parsed from the genbank file.

    Examples:
      >>> parse_records('example.gb')
      [SeqRecord(seq=Seq('ATGC'), id='example', name='example', description='example', dbxrefs=[]), ...]
    """
        try:
            records = list(SeqIO.parse(handle, "gb"))
            if not records:
                return {}, None
            gb_dict = {record.id: record for record in records}
            record = records[0]

            comment = record.annotations.get("comment", "")
            cds_feature = next((f for f in record.features if f.type == "CDS"), None)

            if cds_feature is None:
                logger.error(f"{genbank} appears to be a Genbank formatted file but no CDS was found. Please check your input.")
                return gb_dict, None

            # Check if 'Bakta' appears in the Comment - will appear there
            if "Bakta" in comment and "locus_tag" in cds_feature.qualifiers:
                logger.info(f"Detected Bakta style input Genbank. Using locus_tag qualifier from Bakta as the CDS IDs for Phold.")
                method = "Bakta"
            else:
                if "phrog" not in cds_feature.qualifiers and "protein_id" in cds_feature.qualifiers:
                    logger.info(f"Detected NCBI Refseq style input Genbank. Using protein_id qualifier as the CDS IDs for Phold.")
                    method = "NCBI"
                elif "phrog" in cds_feature.qualifiers and "ID" in cds_feature.qualifiers:
                    logger.info(f"Detected Pharokka style input Genbank. Using ID qualifier from Pharokka as the CDS IDs for Phold.")
                    method = "Pharokka"
                else:
                    logger.error(
                                f"Feature {cds_feature} could not be parsed. Therefore, the input style format for {genbank} could not be detected. Please check your input."
                            )
            return identify_long_ids(gb_dict), method
        except Exception as e:
            logger.warning(f"{genbank} is not a genbank file")
            return {}, None

    try:
        if is_gzip_file(genbank.strip()):
            with gzip.open(genbank.strip(), "rt") as handle:
                return parse_records(handle)
        else:
            with open(genbank.strip(), "rt") as handle:
                return parse_records(handle)
    except Exception as e:
        logger.warning(f"{genbank} is not a genbank file")
        return {}, None

get_proteins(fasta)

Convert an Amino Acid FASTA file to a dictionary.

This function reads a AA FASTA file and converts it into a dictionary.

Parameters:

Name Type Description Default
fasta Path

Path to the FASTA file.

required

Returns:

Name Type Description
dict dict

A dictionary representation of the FASTA file.

Raises:

Type Description
ValueError

If the provided file is not a FASTA file.

Source code in src/baktfold/io/handle_genbank.py
def get_proteins(fasta: Path) -> dict:
    """
    Convert an Amino Acid FASTA file to a dictionary.

    This function reads a AA FASTA file and converts it into a dictionary.

    Args:
        fasta (Path): Path to the FASTA file.

    Returns:
        dict: A dictionary representation of the FASTA file.

    Raises:
        ValueError: If the provided file is not a FASTA file.
    """

    if is_gzip_file(fasta.strip()):
        try:
            fasta_dict = {}
            with gzip.open(fasta.strip(), "rt") as handle:
                sequence_id = ""
                sequence = ""
                for line in handle:
                    line = line.strip()
                    if line.startswith(">"):
                        if sequence_id:
                            fasta_dict[sequence_id] = sequence
                        sequence_id = line[1:]
                        sequence = ""
                    else:
                        sequence += line
                if sequence_id:
                    fasta_dict[sequence_id] = sequence
            handle.close()
        except ValueError:
            logger.error(f"{fasta.strip()} is not a FASTA file!")
            raise

    else:
        try:
            fasta_dict = {}
            with open(fasta.strip(), "rt", errors="ignore") as handle:
                sequence_id = ""
                sequence = ""
                for line in handle:
                    line = line.strip()
                    if line.startswith(">"):
                        if sequence_id:
                            fasta_dict[sequence_id] = sequence
                        sequence_id = line[1:]
                        sequence = ""
                    else:
                        sequence += line
                if sequence_id:
                    fasta_dict[sequence_id] = sequence
            handle.close()
        except ValueError:
            logger.error(f"{fasta.strip()} is not a FASTA file!")
            raise

    return fasta_dict

identify_long_ids(gb_dict)

Checks all feature IDs in gb_dict. If longer than 54 chars (line break from Pharokka/biopython reading GBK files), removes the space

Parameters:

Name Type Description Default
dict

A dictionary representation of the GenBank file.

required

Returns:

Name Type Description
dict dict

A dictionary representation of the GenBank file.

Source code in src/baktfold/io/handle_genbank.py
def identify_long_ids(gb_dict: dict) -> dict:
    """

    Checks all feature IDs in gb_dict. If longer than 54 chars (line break from Pharokka/biopython reading GBK files), removes the space

    Args:
        dict: A dictionary representation of the GenBank file.

    Returns:
        dict: A dictionary representation of the GenBank file.
    """

    # remove spaces in ID/locus tag
    for record_id, record in gb_dict.items():
        for cds_feature in record.features:
            try:
                # if pharokka > 54 char IDs/locus tage, phold/biopython will parse with a space
                # no spaces in
                # for really long CDS IDs (over 54 chars), a space will be introduced
                # this is because the ID will go over a second line
                # weird bug noticed it on the Mgnify contigs annotated with Pharokka
                cds_id = cds_feature.qualifiers["ID"][0]
                if len(cds_id) >= 54:
                    logger.warning(
                        f"The CDS ID is {cds_id} is longer than 54 characters. It is recommended that you use short contig headers (which will therefore lead to shorter CDS ids)."
                    )
                    cds_feature.qualifiers["ID"][0] = cds_feature.qualifiers["ID"][
                        0
                    ].replace(" ", "")
            except:
                # will be GenBank/NCBI formatted
                # ID isn't a field and should be properly formatted - famous last words probably
                continue

    return gb_dict

is_gzip_file(f)

Method copied from Phispy see https://github.com/linsalrob/PhiSpy/blob/master/PhiSpyModules/helper_functions.py

This is an elegant solution to test whether a file is gzipped by reading the first two characters. I also use a version of this in fastq_pair if you want a C version :) See https://stackoverflow.com/questions/3703276/how-to-tell-if-a-file-is-gzip-compressed for inspiration

Parameters:

Name Type Description Default
f Path

The file to test.

required

Returns:

Name Type Description
bool bool

True if the file is gzip compressed, otherwise False.

Source code in src/baktfold/io/handle_genbank.py
def is_gzip_file(f: Path) -> bool:
    """
    Method copied from Phispy see https://github.com/linsalrob/PhiSpy/blob/master/PhiSpyModules/helper_functions.py

    This is an elegant solution to test whether a file is gzipped by reading the first two characters.
    I also use a version of this in fastq_pair if you want a C version :)
    See https://stackoverflow.com/questions/3703276/how-to-tell-if-a-file-is-gzip-compressed for inspiration
    Args:
        f (Path): The file to test.

    Returns:
        bool: True if the file is gzip compressed, otherwise False.
    """
    with open(f, "rb") as i:
        return binascii.hexlify(i.read(2)) == b"1f8b"

open_protein_fasta_file(input_file)

Open a fasta file, whether it is gzipped or plain text.

input_file (str): The path to the fasta file, either gzipped or plain.

Union[IO[str], gzip.GzipFile]: A file handle to the opened fasta file.

Source code in src/baktfold/io/handle_genbank.py
def open_protein_fasta_file(input_file: str) -> Union[IO[str], gzip.GzipFile]:
    """
    Open a fasta file, whether it is gzipped or plain text.

    Parameters:
    input_file (str): The path to the fasta file, either gzipped or plain.

    Returns:
    Union[IO[str], gzip.GzipFile]: A file handle to the opened fasta file.
    """
    input_file = Path(input_file)

    if input_file.suffix == ".gz":
        return gzip.open(input_file, "rt")
    else:
        return open(input_file, "r")

add_optional_qualifiers(entry, qualifiers, single_valued=None, multi_valued=None)

Add optional INSDC qualifiers to a feature entry dict in Bakta style.

Parameters

dict

The feature dictionary being built.

dict

The qualifiers dictionary from Bio.SeqFeature.

set or list

Qualifiers expected to be single-valued (take the first if multiple).

set or list

Qualifiers that can have multiple values (keep as list if >1, else single value).

Source code in src/baktfold/io/eukaryotic_to_json.py
def add_optional_qualifiers(entry, qualifiers, single_valued=None, multi_valued=None):
    """
    Add optional INSDC qualifiers to a feature entry dict in Bakta style.

    Parameters
    ----------
    entry : dict
        The feature dictionary being built.
    qualifiers : dict
        The qualifiers dictionary from Bio.SeqFeature.
    single_valued : set or list
        Qualifiers expected to be single-valued (take the first if multiple).
    multi_valued : set or list
        Qualifiers that can have multiple values (keep as list if >1, else single value).
    """

    single_valued = single_valued or set()
    multi_valued = multi_valued or set()

    # Multi-valued qualifiers
    for key in multi_valued:
        vals = qualifiers.get(key)
        if vals:
            entry[key] = vals if len(vals) > 1 else vals[0]

    # Single-valued qualifiers
    for key in single_valued:
        vals = qualifiers.get(key)
        if vals:
            if key == "locus_tag":
                entry["locus"] = vals[0] # this is what bakta needs
            else:
                entry[key] = vals[0]

build_bakta_sequence_entry(rec)

Convert a SeqRecord into a Bakta-style sequence entry. Missing fields are filled with None.

Source code in src/baktfold/io/eukaryotic_to_json.py
def build_bakta_sequence_entry(rec):
    """
    Convert a  SeqRecord into a Bakta-style sequence entry.
    Missing fields are filled with None.
    """

    seq = str(rec.seq)

    # -----------------------------------------
    # Extract source feature qualifiers - genbank always has source field
    # -----------------------------------------
    source_feat = next((f for f in rec.features if f.type == "source"), None)

    source_qualifiers = {}

    # Defaults (None) for all fields
    mol_type = None
    organism = None
    strain = None
    db_xref = None
    note = None

    plasmid = None
    chromosome = None
    completeness_hint = None

    if source_feat:
        q = source_feat.qualifiers

        mol_type = q.get("mol_type", [None])[0]
        organism = q.get("organism", [None])[0]
        strain = q.get("strain", [None])[0]
        note = q.get("note", [None])[0]

        if "db_xref" in q:
            val = q["db_xref"]
            db_xref = val[0] if len(val) == 1 else val

        plasmid = q.get("plasmid", [None])[0]
        chromosome = q.get("chromosome", [None])[0]
        completeness_hint = q.get("completeness", [None])[0]

    # -----------------------------------------
    # Infer topology
    # -----------------------------------------
    topology = rec.annotations.get("topology")
    if topology not in {"linear", "circular"}:
        topology = "linear"

    # -----------------------------------------
    # Infer type
    # -----------------------------------------
    if plasmid is not None or "plasmid" in rec.annotations:
        seq_type = "plasmid"
    elif chromosome is not None or "chromosome" in rec.annotations:
        seq_type = "chromosome"
    else:
        seq_type = "contig"

    # -----------------------------------------
    # Infer completeness (conservative)
    # -----------------------------------------
    complete = False

    if topology == "circular":
        complete = True
    elif completeness_hint is not None and completeness_hint.lower() == "complete":
        complete = True
    elif note and "complete genome" in note.lower():
        complete = True

    # -----------------------------------------
    # Infer genetic codefor description
    # -----------------------------------------
    gcode = None

    if "genetic_code" in rec.annotations:
        gcode = rec.annotations["genetic_code"]
    elif "gcode" in rec.annotations:
        gcode = rec.annotations["gcode"]
    elif source_feat and "transl_table" in source_feat.qualifiers:
        gcode = source_feat.qualifiers["transl_table"][0]

    # Conservative fallback to 1 for euks
    if gcode is None:
        gcode = 1 

    description_parts = [
        f"[gcode={gcode}]",
        f"[topology={topology}]",
    ]

    description = " ".join(description_parts)

    # -----------------------------------------
    # Build entry
    # -----------------------------------------
    entry = {
        "id": rec.id,
        "description": description,
        "nt": seq,
        "length": len(seq),
        "complete": complete,
        "type": seq_type,
        "topology": topology,
        "simple_id": rec.id,
        "orig_id": rec.id,
        "orig_description": None,
    }

    # -----------------------------------------
    # Add source qualifiers if present
    # -----------------------------------------
    if organism is not None:
        entry["organism"] = organism
    if mol_type is not None:
        entry["mol_type"] = mol_type
    if strain is not None:
        entry["strain"] = strain
    if db_xref is not None:
        entry["db_xref"] = db_xref
    if note is not None:
        entry["note"] = note


    # this is from bakta
    # "id": "contig_1",
    # "description": "[gcode=11] [topology=linear]",
    # "nt": "AT"
    # "length": 5165988,
    # "complete": false,
    # "type": "contig",
    # "topology": "linear",
    # "simple_id": "contig_1",
    # "orig_id": "GCF_002368115_000000000001",
    # "orig_description": ""

    # Add source qualifiers only if they exist
    if organism is not None:
        entry["organism"] = organism

    if mol_type is not None:
        entry["mol_type"] = mol_type

    if strain is not None:
        entry["strain"] = strain

    if db_xref is not None:
        entry["db_xref"] = db_xref

    if note is not None:
        entry["note"] = note

    return entry

calc_genome_stats(records)

Compute correct genome stats (size, GC, N-ratio, N50, N90) for records from a multi-contig GenBank file.

Source code in src/baktfold/io/eukaryotic_to_json.py
def calc_genome_stats(records):
    """
    Compute correct genome stats (size, GC, N-ratio, N50, N90) for records from a multi-contig
     GenBank file.
    """

    if not records:
        raise ValueError("No GenBank records found.")

    # lengths of all contigs
    contig_lengths = [len(r.seq) for r in records]
    total_length = sum(contig_lengths)

    # concatenate sequences for global GC + N calculation
    full_seq = "".join(str(r.seq) for r in records)

    # GC as fraction (Bakta wants 0–1)
    gc_perc = gc_fraction(full_seq)

    # N-ratio
    n_ratio = full_seq.count("N") / total_length

    # ---------- N50 / N90 ----------
    sorted_lengths = sorted(contig_lengths, reverse=True)

    def nx_metric(sorted_lens, total, threshold):
        """
        Generic N{threshold} function.
        threshold: 0.5 for N50, 0.9 for N90
        """
        cutoff = total * threshold
        running = 0
        for l in sorted_lens:
            running += l
            if running >= cutoff:
                return l
        return sorted_lens[-1]  # fallback (should not happen)

    n50 = nx_metric(sorted_lengths, total_length, 0.5)
    n90 = nx_metric(sorted_lengths, total_length, 0.9)

    return {
        "size": total_length,
        "gc": gc_perc,
        "n_ratio": n_ratio,
        "n50": n50,
        "n90": n90,
        "coding_ratio": None  
    }

convert_assembly_gap_feature(feature, rec, id)

Convert a GenBank assembly_gap feature to a simplified Bakta-style 'gap' feature.

Parameters:

Name Type Description Default
feature

Bio.SeqFeature The assembly_gap feature from the GBK.

required
rec

Bio.SeqRecord The full GenBank record containing the sequence.

required

Returns:

Name Type Description
dict

Simplified Bakta-style gap feature.

Source code in src/baktfold/io/eukaryotic_to_json.py
def convert_assembly_gap_feature(feature, rec, id):
    """
    Convert a GenBank assembly_gap feature to a simplified Bakta-style 'gap' feature.

    Parameters:
        feature: Bio.SeqFeature
            The assembly_gap feature from the GBK.
        rec: Bio.SeqRecord
            The full GenBank record containing the sequence.

    Returns:
        dict: Simplified Bakta-style gap feature.
    """

    # Coordinates (1-based)
    strand = "." # bakta uses "." for strand on gaps
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)         # already inclusive after conversion

    qualifiers = feature.qualifiers

    #  may provide estimated_length but coordinates already give an exact span
    est_len = qualifiers.get("estimated_length", [None])[0]
    if est_len is not None:
        length = int(est_len)
    else:
        length = stop - start + 1  # fallback from coordinates


    gap_entry = {
        "type": "gap",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "length": length,
        "id": id,
    }

    # no need to add estimated length separately - it is covered by length in the json 

    # if est_len:
    #     gap_entry["estimated_length"] = est_len

    return gap_entry

convert_cds_feature(feature, seq_record, translation_table, id)

Convert a Prokka CDS Biopython SeqFeature to a Bakta CDS JSON entry.

Source code in src/baktfold/io/eukaryotic_to_json.py
def convert_cds_feature(feature, seq_record, translation_table, id):
    """
    Convert a Prokka CDS Biopython SeqFeature to a Bakta CDS JSON entry.
    """

    # ----------- Location info -----------

    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)         # already inclusive after conversion

    # Handle CompoundLocation (join)
    starts = None
    stops = None

    if feature.location.__class__.__name__ == "CompoundLocation":
        starts = []
        stops = []
        for part in feature.location.parts:
            starts.append(int(part.start) + 1)
            stops.append(int(part.end))


    # frame: Bakta uses 1/2/3; Prokka codon_start is ["1","2","3"]
    codon_start = int(feature.qualifiers.get("codon_start", ["1"])[0])
    frame = codon_start

    qualifiers = feature.qualifiers

    # ----------- Basic qualifiers -----------
    gene = qualifiers.get("gene", [None])[0]
    product = qualifiers.get("product", [None])[0]


    # fall back to start_stop_strand if there is no locus tag
    if 'locus_tag' in qualifiers and qualifiers['locus_tag']:
        locus_tag = qualifiers['locus_tag'][0]
    else:
        logger.warning(f"No locus_tag found for feature {id}")
        locus_tag = f"{GENOME_RANDOM_BACKUP_LOCUSTAG_STR}_{start}_{stop}"
        logger.warning(f"Generating a locus_tag: {locus_tag}")

    note = qualifiers.get("note", [None])[0]
    locus = locus_tag

    # pseudo

    protein_id = qualifiers.get("protein_id", [None])[0]

    # ----------- Extract nucleotides -----------
    nt_seq = feature.extract(seq_record.seq)
    nt = str(nt_seq)

    # ----------- Extract amino acids -----------
    aa = feature.qualifiers.get("translation", [""])[0]

    # Compute translation if Prokka didn't provide it
    if not aa:
        try:
            aa = str(nt_seq.translate(table=translation_table, cds=True))
        except Exception:
            aa = ""

    # ----------- aa MD5 hexdigest -----------
    aa_hexdigest = hashlib.md5(aa.encode()).hexdigest()

    # ----------- Hypothetical? -----------
    hypothetical = product is None or "hypothetical protein" in product.lower()

    # ----------- Compute protein stats -----------
    seq_stats = None
    if aa:
        try:
            analysed = ProteinAnalysis(aa)
            seq_stats = {
                "molecular_weight": analysed.molecular_weight(),
                "isoelectric_point": analysed.isoelectric_point()
            }
        except Exception:
            seq_stats = None

    # Get existing db_xref list or default to [so.SO_CDS.id]
    db_xref = qualifiers.get("db_xref", [so.SO_CDS.id])

    # Append so.SO_CDS.id only if it’s not already present
    if so.SO_CDS.id not in db_xref:
        db_xref.append(so.SO_CDS.id)

    # ----------- Make Bakta-format dict -----------
    bakta_cds = {
        "type": "cds",
        "sequence": seq_record.id,
        "start": start,
        "stop": stop,
        "starts": starts,
        "stops": stops,
        "strand": strand,
        "frame": frame,
        "gene": gene,
        "product": product,
        "db_xrefs": db_xref,  
        "nt": nt,
        "aa": aa,
        "aa_hexdigest": aa_hexdigest,
        "start_type": None,
        "rbs_motif": None,
        "genes": [],
        "note": note,
        "seq_stats": seq_stats,
        "id": id,
        "locus": locus,
        "protein_id": protein_id
    }

# Feature Key           CDS

# Definition            coding sequence; sequence of nucleotides that
#                       corresponds with the sequence of amino acids in a
#                       protein (location includes stop codon); 
#                       feature includes amino acid conceptual translation.

# Optional qualifiers   /allele="text"
#                       /artificial_location="[artificial_location_value]"
#                       /circular_RNA
#                       /codon_start=<1 or 2 or 3>
#                       /db_xref="<database>:<identifier>"
#                       /EC_number="text"
#                       /exception="[exception_value]"
#                       /experiment="[CATEGORY:]text"
#                       /function="text"
#                       /gene="text"
#                       /gene_synonym="text"
#                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
#                       /locus_tag="text" (single token)
#                       /map="text"
#                       /note="text"
#                       /number=unquoted text (single token)
#                       /old_locus_tag="text" (single token)
#                       /operon="text"
#                       /product="text"
#                       /protein_id="<identifier>"
#                       /pseudo
#                       /pseudogene="TYPE"
#                       /ribosomal_slippage
#                       /standard_name="text"
#                       /translation="text"
#                       /transl_except=(pos:<location>,aa:<amino_acid>)
#                       /transl_table =<integer>
#                       /trans_splicing

    multi_valued = {"EC_number", "exception", "experiment", "function",  "gene_synonym",  "inference", }
    single_valued = {"allele", "artificial_location",  "map", "number",  "old_locus_tag", "operon", "phenotype", "pseudogene", "standard_name", "transl_except", "transl_table"}

    add_optional_qualifiers(bakta_cds, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers)
    for flag in ["circular_RNA", "pseudo", "ribosomal_slippage", "trans_splicing"]:
        if flag in qualifiers:
            bakta_cds[flag] = flag in qualifiers

    if hypothetical:
        bakta_cds["hypothetical"] = True

    return bakta_cds

convert_exon_feature(feature, rec, id)

Convert a GenBank exon feature to a simplified Bakta-style 'exon' feature.

Parameters

Bio.SeqFeature

The exon feature from the GenBank record.

Bio.SeqRecord

The full GenBank record.

str

Unique feature ID.

Returns

dict Bakta-style exon feature.

Source code in src/baktfold/io/eukaryotic_to_json.py
def convert_exon_feature(feature, rec, id):
    """
    Convert a GenBank exon feature to a simplified Bakta-style 'exon' feature.

    Parameters
    ----------
    feature : Bio.SeqFeature
        The exon feature from the GenBank record.
    rec : Bio.SeqRecord
        The full GenBank record.
    id : str
        Unique feature ID.

    Returns
    -------
    dict
        Bakta-style exon feature.
    """

    # Extract location
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)         # already inclusive after conversion

    qualifiers = feature.qualifiers

    db_xrefs = qualifiers.get("db_xref", [])

# Optional qualifiers   /allele="text"
#                       /db_xref="<database>:<identifier>"
#                       /EC_number="text"
#                       /experiment="[CATEGORY:]text"
#                       /function="text"
#                       /gene="text"
#                       /gene_synonym="text"
#                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
#                       /locus_tag="text" (single token)
#                       /map="text"
#                       /note="text"
#                       /number=unquoted text (single token)
#                       /old_locus_tag="text" (single token)
#                       /product="text"
#                       /pseudo
#                       /pseudogene="TYPE"
#                       /standard_name="text"
#                       /trans_splicing


    # Extract commonly used INSDC qualifiers
    exon_entry = {
            "type": "exon",
            "sequence": rec.id,
            "start": start,
            "stop": stop,
            "strand": strand,
            "id": id,
            "db_xrefs": db_xrefs
        }

    multi_valued = {"EC_number","experiment","function",  "gene_synonym",  "inference","note" }
    single_valued = {"allele", "gene", "locus_tag", "map", "number",   "old_locus_tag", "operon", "pseudogene", "standard_name"   }

    add_optional_qualifiers(exon_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers)
    for flag in ["pseudo", "trans_splicing"]:
        if flag in qualifiers:
            exon_entry[flag] = True

    return exon_entry

convert_gene_feature(feature, rec, id)

Convert a Funannotate GenBank gene feature to Bakta-style JSON.

Parameters:

Name Type Description Default
feature

Bio.SeqFeature The rRNA feature from the GBK.

required
rec

str The record from the GBK.

required

Returns:

Name Type Description
dict

Bakta-style rRNA feature

Source code in src/baktfold/io/eukaryotic_to_json.py
def convert_gene_feature(feature, rec, id):
    """
    Convert a Funannotate GenBank gene feature to Bakta-style JSON.

    Parameters:
        feature: Bio.SeqFeature
            The rRNA feature from the GBK.
        rec: str
            The record from the GBK.
    Returns:
        dict: Bakta-style rRNA feature
    """

    # Extract location
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)         # already inclusive after conversion


    qualifiers = feature.qualifiers

    # fall back to start_stop_strand if there is no locus tag
    if 'locus_tag' in qualifiers and qualifiers['locus_tag']:
        locus_tag = qualifiers['locus_tag'][0]
    else:
        logger.warning(f"No locus_tag found for feature {id}")
        locus_tag = f"{GENOME_RANDOM_BACKUP_LOCUSTAG_STR}_{start}_{stop}"
        logger.warning(f"Generating a locus_tag: {locus_tag}")



    gene_entry = {
        "type": "gene",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "db_xrefs": [so.SO_GENE.id], 
        "id": id,
        "locus": locus_tag
    }


# Feature Key           gene 


# Definition            region of biological interest identified as a gene 
#                       and for which a name has been assigned;

# Optional qualifiers   /allele="text"
#                       /db_xref="<database>:<identifier>"
#                       /experiment="[CATEGORY:]text"
#                       /function="text"
#                       /gene="text"
#                       /gene_synonym="text"
#                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
#                       /locus_tag="text" (single token)
#                       /map="text"
#                       /note="text"
#                       /old_locus_tag="text" (single token)
#                       /operon="text"
#                       /product="text"
#                       /pseudo
#                       /pseudogene="TYPE"
#                       /phenotype="text"
#                       /standard_name="text"
#                       /trans_splicing


# Comment               the gene feature describes the interval of DNA that 
#                       corresponds to a genetic trait or phenotype; the feature is,
#                       by definition, not strictly bound to it's positions at the 
#                       ends;  it is meant to represent a region where the gene is 
#                       located.


    multi_valued = {"experiment", "function",  "gene_synonym",  "inference", "note" }
    single_valued = {"allele",  "map",  "old_locus_tag", "operon", "phenotype", "standard_name"}

    qualifiers = feature.qualifiers

    add_optional_qualifiers(gene_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers)
    for flag in ["pseudo", "trans_splicing"]:
        if flag in qualifiers:
            gene_entry[flag] = flag in qualifiers

    return gene_entry

convert_mat_peptide_feature(feature, rec, id)

Convert a mat_peptide feature to a Bakta-style feature.

mus musculus chrom 1 NC_000067

Parameters:

Name Type Description Default
feature

Bio.SeqFeature The mRNA feature from the GBK.

required
rec

Bio.SeqRecord The record containing the sequence.

required

Returns:

Name Type Description
dict

Bakta-style misc_RNA feature.

Source code in src/baktfold/io/eukaryotic_to_json.py
def convert_mat_peptide_feature(feature, rec, id):
    """
    Convert a mat_peptide feature to a Bakta-style feature.

    mus musculus chrom 1 NC_000067

    Parameters:
        feature: Bio.SeqFeature
            The mRNA feature from the GBK.
        rec: Bio.SeqRecord
            The record containing the sequence.

    Returns:
        dict: Bakta-style misc_RNA feature.
    """

    seq = str(rec.seq)


    # Extract location
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)         # already inclusive after conversion

    # Handle CompoundLocation (join)
    starts = None
    stops = None

    if feature.location.__class__.__name__ == "CompoundLocation":
        starts = []
        stops = []
        for part in feature.location.parts:
            starts.append(int(part.start) + 1)
            stops.append(int(part.end))

    so_code =  so.SO_MAT_PEPTIDE.id

    # Get existing db_xref list or default to [so.SO_CDS.id]
    db_xrefs = feature.qualifiers.get("db_xref", [so_code])

    # Append only if it’s not already present
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)

    qualifiers = feature.qualifiers


    # Extract commonly used INSDC qualifiers
    mat_peptide_entry = {
            "type": "mat_peptide",
            "sequence": rec.id,
            "start": start,
            "stop": stop,
            # Join support
            "starts": starts,
            "stops": stops,
            "strand": strand,
            "id": id,
            "db_xrefs": db_xrefs
        }


# Feature Key           mat_peptide


# Definition            mature peptide or protein coding sequence; coding
#                       sequence for the mature or final peptide or protein
#                       product following post-translational modification; the
#                       location does not include the stop codon (unlike the
#                       corresponding CDS);

# Optional qualifiers   /allele="text"
#                       /db_xref="<database>:<identifier>"
#                       /EC_number="text"
#                       /experiment="[CATEGORY:]text"
#                       /function="text"
#                       /gene="text"
#                       /gene_synonym="text"
#                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
#                       /locus_tag="text" (single token)
#                       /map="text"
#                       /note="text"
#                       /old_locus_tag="text" (single token)
#                       /product="text"
#                       /pseudo
#                       /pseudogene="TYPE"
#                       /standard_name="text"

    multi_valued = {"EC_number","experiment", "function",  "gene_synonym",  "inference","note" }
    single_valued = {"allele", "gene", "locus_tag", "map", "number",   "old_locus_tag", "operon", "pseudogene", "standard_name"}

    add_optional_qualifiers(mat_peptide_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers) - no flags
    # for flag in ["pseudo"]:
    #     if flag in qualifiers:
    #         mat_peptide_entry[flag] = True


    #  mat_peptide     complement(join(194724303..194724321,194744661..194744721,
    #                  194746996..194747031,194750435..194750476,
    #                  194757818..194757865,194759962..194760144,
    #                  194764890..194765087,194765856..194765944,
    #                  194767641..194767743,194768400..194768583))
    #                  /gene="Cd46"
    #                  /gene_synonym="Mcp"
    #                  /product="Membrane cofactor protein. /id=PRO_0000238971"
    #                  /note="propagated from UniProtKB/Swiss-Prot (O88174.1)"

    return mat_peptide_entry

convert_misc_feature(feature, rec, id)

Convert a misc feature to a Bakta-style feature.

Parameters:

Name Type Description Default
feature

Bio.SeqFeature The mRNA feature from the GBK.

required
rec

Bio.SeqRecord The record containing the sequence.

required

Returns:

Name Type Description
dict

Bakta-style misc_feature feature.

Source code in src/baktfold/io/eukaryotic_to_json.py
def convert_misc_feature(feature, rec, id):
    """
    Convert a misc feature to a Bakta-style feature.

    Parameters:
        feature: Bio.SeqFeature
            The mRNA feature from the GBK.
        rec: Bio.SeqRecord
            The record containing the sequence.

    Returns:
        dict: Bakta-style misc_feature feature.
    """

    seq = str(rec.seq)

    # Extract location
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)         # already inclusive after conversion

    # Handle CompoundLocation (join)
    starts = None
    stops = None

    if feature.location.__class__.__name__ == "CompoundLocation":
        starts = []
        stops = []
        for part in feature.location.parts:
            starts.append(int(part.start) + 1)
            stops.append(int(part.end))

    qualifiers = feature.qualifiers

    so_code =  so.SO_MISC_REGION.id

    # Get existing db_xref list or default to [so.SO_CDS.id]
    db_xrefs = feature.qualifiers.get("db_xref", [so_code])

    # Append so.SO_CDS.id only if it’s not already present
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)


    misc_feature_entry = {
            "type": "misc_feature",
            "sequence": rec.id,
            "start": start,
            "stop": stop,
            "strand": strand,
            "id": id,

            # Join support
            "starts": starts,
            "stops": stops,

            # Multi-valued
            "db_xrefs": db_xrefs,


        }

    multi_valued = {"experiment", "function",  "gene_synonym",  "inference", "note", "phenotype"}
    single_valued = {"allele", "gene", "locus_tag", "map",    "old_locus_tag", "operon", "product", "standard_name",  "pseudogene"}

    add_optional_qualifiers(misc_feature_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers)
    for flag in ["pseudo",]:
        if flag in qualifiers:
            misc_feature_entry[flag] = True

# Feature Key           misc_feature


# Definition            region of biological interest which cannot be described
#                       by any other feature key; a new or rare feature;

# Optional qualifiers   /allele="text"
#                       /db_xref="<database>:<identifier>"
#                       /experiment="[CATEGORY:]text"
#                       /function="text"
#                       /gene="text"
#                       /gene_synonym="text"
#                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
#                       /locus_tag="text" (single token)
#                       /map="text"
#                       /note="text"
#                       /number=unquoted text (single token)
#                       /old_locus_tag="text" (single token)
#                       /phenotype="text"
#                       /product="text"
#                       /pseudo
#                       /pseudogene="TYPE"
#                       /standard_name="text"

# Comment               this key should not be used when the need is merely to 
#                       mark a region in order to comment on it or to use it in 
#                       another feature's location

    #  misc_feature    join(78488668..78488692,78499322..78499359)
    #                  /gene="Mogat1"
    #                  /gene_synonym="0610030A14Rik; 1110064N14Rik; Dgat2l;
    #                  Dgat2l1; mDC2; MGAT1; WI1-2612I11.1"
    #                  /note="propagated from UniProtKB/Swiss-Prot (Q91ZV4.2);
    #                  transmembrane region"

    #  misc_feature    78179419..78180585
    #                  /standard_name="Pax3 upstream hypaxial enhancer"
    #                  /note="Region: biological region; Derived by automated
    #                  computational analysis using gene prediction method:
    #                  RefSeqFE."
    #                  /function="regulatory_interactions: LOC107980439 | Pax3"
    #                  /db_xref="GeneID:107980442"    

    return misc_feature_entry

convert_misc_rna_feature(feature, rec, id)

Convert a GenBank misc_rna feature to a simplified Bakta-style 'misc_rna' feature.

Parameters:

Name Type Description Default
feature

Bio.SeqFeature The assembly_gap feature from the GBK.

required
rec

Bio.SeqRecord The full GenBank record containing the sequence.

required

Returns:

Name Type Description
dict

Simplified Bakta-style gap feature.

Source code in src/baktfold/io/eukaryotic_to_json.py
def convert_misc_rna_feature(feature, rec, id):
    """
    Convert a GenBank misc_rna feature to a simplified Bakta-style 'misc_rna' feature.

    Parameters:
        feature: Bio.SeqFeature
            The assembly_gap feature from the GBK.
        rec: Bio.SeqRecord
            The full GenBank record containing the sequence.

    Returns:
        dict: Simplified Bakta-style gap feature.

    """

        # from ensemble genomes
        # misc_RNA        complement(437333..442742)
        #             /gene="YPL060C-A"
        #             /note="transposable_element"
        #             /standard_name="YPL060C-A"

    # Coordinates (1-based)

    # Extract location
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)         # already inclusive after conversion


    qualifiers = feature.qualifiers
    gene = qualifiers.get("gene", [None])[0]

# Feature Key           misc_RNA


# Definition            any transcript or RNA product that cannot be defined by
#                       other RNA keys (prim_transcript, precursor_RNA, mRNA,
#                       5'UTR, 3'UTR, exon, CDS, sig_peptide, transit_peptide,
#                       mat_peptide, intron, polyA_site, ncRNA, rRNA and tRNA);

# Optional qualifiers   /allele="text"
#                       /db_xref="<database>:<identifier>"
#                       /experiment="[CATEGORY:]text"
#                       /function="text"
#                       /gene="text"
#                       /gene_synonym="text"
#                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
#                       /locus_tag="text" (single token)
#                       /map="text"
#                       /note="text"
#                       /old_locus_tag="text" (single token)
#                       /operon="text"
#                       /product="text"
#                       /pseudo
#                       /pseudogene="TYPE"
#                       /standard_name="text"
#                       /trans_splicing

    misc_rna_entry = {
        "type": "misc_RNA", # expects lowercase 
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand, # matches Bakta and is required
        "gene": gene,
        "id": id
    }


    multi_valued = {"experiment", "function",  "gene_synonym",  "inference", "note" }
    single_valued = {"allele", "gene",  "locus_tag", "map",  "old_locus_tag", "operon", "product", "phenotype", "standard_name"}

    qualifiers = feature.qualifiers

    add_optional_qualifiers(misc_rna_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers)
    for flag in ["pseudo", "trans_splicing"]:
        if flag in qualifiers:
            misc_rna_entry[flag] = flag in qualifiers

    return misc_rna_entry

convert_mobile_element_feature(feature, rec, id)

Convert a GenBank mobile_element feature to a Bakta-style feature.

Source code in src/baktfold/io/eukaryotic_to_json.py
def convert_mobile_element_feature(feature, rec, id):
    """
    Convert a GenBank mobile_element feature to a Bakta-style feature.
    """

    # Extract location
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)         # already inclusive after conversion

    qualifiers = feature.qualifiers

    # Mandatory qualifier check (INSDC requirement)
    mobile_element_type = qualifiers.get("mobile_element_type", [None])[0]
    if mobile_element_type is None:
        raise ValueError(
            f"mobile_element feature {id} is missing mandatory "
            "/mobile_element_type qualifier"
        )

    so_code =  so.SO_MOBILE_ELEMENT.id

    # Get existing db_xref list or default to [so.SO_CDS.id]
    db_xrefs = feature.qualifiers.get("db_xref", [so_code])

    # Append only if it’s not already present
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)


# Feature Key           mobile_element


# Definition            region of genome containing mobile elements;

# Mandatory qualifiers  /mobile_element_type="<mobile_element_type>
#                       [:<mobile_element_name>]"

# Optional qualifiers   /allele="text"
#                       /db_xref="<database>:<identifier>" 
#                       /experiment="[CATEGORY:]text"
#                       /function="text"
#                       /gene="text"
#                       /gene_synonym="text"
#                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
#                       /locus_tag="text" (single token)
#                       /map="text"
#                       /note="text"
#                       /old_locus_tag="text" (single token)
#                       /rpt_family="text"
#                       /rpt_type=<repeat_type>
#                       /standard_name="text"


    # Extract commonly used INSDC qualifiers
    mobile_element_entry = {
            "type": "mobile_element",
            "sequence": rec.id,
            "start": start,
            "stop": stop,
            "strand": strand,
            "id": id,
            "db_xrefs": db_xrefs,
                    # Mandatory
            "mobile_element_type": mobile_element_type,
        }

    multi_valued = {"experiment", "function",  "gene_synonym",  "inference", "note" }
    single_valued = {"allele", "gene", "locus_tag", "map",    "old_locus_tag", "standard_name", "rpt_family", "rpt_type"}

    add_optional_qualifiers(mobile_element_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers)
    # for flag in ["pseudo"]:
    #   if flag in qualifiers:
    #     mobile_element_entry[flag] = True


    #  mobile_element  57369551..57369723
    #                  /note="Derived by automated computational analysis using
    #                  gene prediction method: RefSeqFE."
    #                  /mobile_element_type="SINE:AmnSINE1"
    #                  /db_xref="GeneID:106707176"

    return mobile_element_entry

convert_mrna_feature(feature, rec, id)

Convert a funannotate mrna feature to a Bakta-style feature.

Parameters:

Name Type Description Default
feature

Bio.SeqFeature The mRNA feature from the GBK.

required
rec

Bio.SeqRecord The record containing the sequence.

required

Returns:

Name Type Description
dict

Bakta-style mRNA feature.

Source code in src/baktfold/io/eukaryotic_to_json.py
def convert_mrna_feature(feature, rec, id):
    """
    Convert a funannotate mrna feature to a Bakta-style feature.

    Parameters:
        feature: Bio.SeqFeature
            The mRNA feature from the GBK.
        rec: Bio.SeqRecord
            The record containing the sequence.

    Returns:
        dict: Bakta-style mRNA feature.
    """

    # seq = str(rec.seq)

    # Extract location
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)         # already inclusive after conversion

    # Handle CompoundLocation (join)
    starts = None
    stops = None

    if feature.location.__class__.__name__ == "CompoundLocation":
        starts = []
        stops = []
        for part in feature.location.parts:
            starts.append(int(part.start) + 1)
            stops.append(int(part.end))

    else:
        starts = None
        stops = None


    qualifiers = feature.qualifiers


    # fall back to start_stop_strand if there is no locus tag
    if 'locus_tag' in qualifiers and qualifiers['locus_tag']:
        locus_tag = qualifiers['locus_tag'][0]
    else:
        logger.warning(f"No locus_tag found for feature {id}")
        locus_tag = f"{GENOME_RANDOM_BACKUP_LOCUSTAG_STR}_{start}_{stop}"
        logger.warning(f"Generating a locus_tag: {locus_tag}")


    mrna_entry = {
        "type": "mRNA", 
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "starts": starts,
        "stops": stops,
        "strand": strand,
        "db_xrefs": [so.SO_MRNA.id],        
        "id": id,
        "locus": locus_tag
    }



# Feature Key           mRNA


# Definition            messenger RNA; includes 5'untranslated region (5'UTR),
#                       coding sequences (CDS, exon) and 3'untranslated region
#                       (3'UTR);

# Optional qualifiers   /allele="text"
#                       /artificial_location="[artificial_location_value]"
#                       /circular_RNA
#                       /db_xref="<database>:<identifier>"
#                       /experiment="[CATEGORY:]text"
#                       /function="text"
#                       /gene="text"
#                       /gene_synonym="text"
#                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
#                       /locus_tag="text" (single token)
#                       /map="text"
#                       /note="text"
#                       /old_locus_tag="text" (single token)
#                       /operon="text"
#                       /product="text"
#                       /pseudo
#                       /pseudogene="TYPE"
#                       /standard_name="text"
#                       /trans_splicing


    multi_valued = {"experiment", "function",  "gene_synonym",  "inference", "note" }
    single_valued = {"allele", "artificial_location", "gene",  "locus_tag", "map",  "old_locus_tag", "operon", "phenotype", "product", "standard_name"}

    qualifiers = feature.qualifiers

    add_optional_qualifiers(mrna_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers)
    for flag in ["circular_RNA", "pseudo", "trans_splicing"]:
        if flag in qualifiers:
            mrna_entry[flag] = flag in qualifiers

    return mrna_entry

convert_ncrna_feature(feature, rec, id)

Convert a ncrna feature to a Bakta-style feature.

Parameters:

Name Type Description Default
feature

Bio.SeqFeature The mRNA feature from the GBK.

required
rec

Bio.SeqRecord The record containing the sequence.

required

Returns:

Name Type Description
dict

Bakta-style misc_RNA feature.

Source code in src/baktfold/io/eukaryotic_to_json.py
def convert_ncrna_feature(feature, rec, id):
    """
    Convert a ncrna feature to a Bakta-style feature.

    Parameters:
        feature: Bio.SeqFeature
            The mRNA feature from the GBK.
        rec: Bio.SeqRecord
            The record containing the sequence.

    Returns:
        dict: Bakta-style misc_RNA feature.
    """

    # seq = str(rec.seq)

    # Extract location
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)         # already inclusive after conversion

    # Handle CompoundLocation (join)
    starts = None
    stops = None

    if feature.location.__class__.__name__ == "CompoundLocation":
        starts = []
        stops = []
        for part in feature.location.parts:
            starts.append(int(part.start) + 1)
            stops.append(int(part.end))

    qualifiers = feature.qualifiers

    so_code =  so.SO_NCRNA_GENE.id

    # Get existing db_xref list or default to [so.SO_CDS.id]
    db_xrefs = feature.qualifiers.get("db_xref", [so_code])

    # Append so only if it’s not already present
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)

    # Mandatory qualifier (INSDC requirement)
    ncrna_class = qualifiers.get("ncRNA_class", [None])[0]
    if ncrna_class is None:
        raise ValueError(
            f"ncRNA feature {id} is missing mandatory /ncRNA_class qualifier"
        )

    ncrna_entry = {
        "type": "ncRNA",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "id": id,

        # Join support
        "starts": starts,
        "stops": stops,

        # Mandatory
        "ncRNA_class": ncrna_class,

        # Multi-valued qualifiers
        "db_xrefs": db_xrefs,


    }

# Feature Key           ncRNA

# Definition            a non-protein-coding gene, other than ribosomal RNA and
#                       transfer RNA, the functional molecule of which is the RNA
#                       transcript;

# Mandatory qualifiers  /ncRNA_class="TYPE"

# Optional qualifiers   /allele="text"
#                       /db_xref="<database>:<identifier>"
#                       /experiment="[CATEGORY:]text"
#                       /function="text"
#                       /gene="text"
#                       /gene_synonym="text"
#                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
#                       /locus_tag="text" (single token)
#                       /map="text"
#                       /note="text"
#                       /old_locus_tag="text" (single token)
#                       /operon="text"
#                       /product="text"
#                       /pseudo
#                       /pseudogene="TYPE"
#                       /standard_name="text"
#                       /trans_splicing

# Example               /ncRNA_class="miRNA"
#                       /ncRNA_class="siRNA"
#                       /ncRNA_class="scRNA"       

# Comment               the ncRNA feature is not used for ribosomal and transfer
#                       RNA annotation, for which the rRNA and tRNA feature keys
#                       should be used, respectively;

    multi_valued = {"experiment", "function",  "gene_synonym",  "inference", "note" }
    single_valued = {"allele", "gene", "locus_tag", "map",    "old_locus_tag", "operon", "product", "standard_name", "pseudogene"}

    add_optional_qualifiers(ncrna_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers)
    for flag in ["pseudo", "trans_splicing"]:
        if flag in qualifiers:
            ncrna_entry[flag] = flag in qualifiers

    #  ncRNA           join(189791085..189791793,189798997..189799081,
    #                  189819873..189820364,189821703..189822337)
    #                  /ncRNA_class="lncRNA"
    #                  /gene="Gm30446"
    #                  /product="predicted gene, 30446, transcript variant X6"
    #                  /note="Derived by automated computational analysis using
    #                  gene prediction method: Gnomon. Supporting evidence
    #                  includes similarity to: 100% coverage of the annotated
    #                  genomic feature by RNAseq alignments, including 2 samples
    #                  with support for all annotated introns"
    #                  /transcript_id="XR_001779629.1"
    #                  /db_xref="GeneID:102632350"
    #                  /db_xref="MGI:MGI:5589605"

    return ncrna_entry

convert_precursor_rna_feature(feature, rec, id)

Convert a GenBank precursor_RNA feature to a Bakta-style feature.

Source code in src/baktfold/io/eukaryotic_to_json.py
def convert_precursor_rna_feature(feature, rec, id):
    """
    Convert a GenBank precursor_RNA feature to a Bakta-style feature.
    """

    # Extract location
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)         # already inclusive after conversion

    qualifiers = feature.qualifiers

    so_code =  so.SO_PRECURSOR_RNA.id

    # Get existing db_xref list or default to [so.SO_CDS.id]
    db_xrefs = feature.qualifiers.get("db_xref", [so_code])

    # Append only if it’s not already present
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)

    precursor_rna_entry = {
            "type": "precursor_RNA",
            "sequence": rec.id,
            "start": start,
            "stop": stop,
            "strand": strand,
            "db_xrefs": db_xrefs,
            "id": id,
        }


    multi_valued = {"experiment", "function",  "gene_synonym",  "inference", "note"}
    single_valued = {"allele", "gene", "locus_tag", "map",  "operon",  "old_locus_tag", "product", "standard_name"}

    add_optional_qualifiers(precursor_rna_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers)
    for flag in ["trans_splicing"]:
        if flag in qualifiers:
            precursor_rna_entry[flag] = True

#     Feature Key           precursor_RNA


# Definition            any RNA species that is not yet the mature RNA product;
#                       may include ncRNA, rRNA, tRNA, 5' untranslated region
#                       (5'UTR), coding sequences (CDS, exon), intervening
#                       sequences (intron) and 3' untranslated region (3'UTR);

# Optional qualifiers   /allele="text"
#                       /db_xref="<database>:<identifier>"  
#                       /experiment="[CATEGORY:]text"
#                       /function="text"
#                       /gene="text"
#                       /gene_synonym="text"
#                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
#                       /locus_tag="text" (single token)
#                       /map="text"
#                       /note="text"
#                       /old_locus_tag="text" (single token)
#                       /operon="text"
#                       /product="text"
#                       /standard_name="text"
#                       /trans_splicing


    #  precursor_RNA   194719348..194719428
    #                  /gene="Mir29b-2"
    #                  /gene_synonym="mir-29b-2; Mirn29b-2"
    #                  /product="microRNA 29b-2"
    #                  /note="Derived by automated computational analysis using
    #                  gene prediction method: BestRefSeq."
    #                  /transcript_id="NR_029809.1"
    #                  /db_xref="GeneID:723963"
    #                  /db_xref="MGI:MGI:3619047"
    #                  /db_xref="miRBase:MI0000712"

    return precursor_rna_entry

convert_proprotein_propeptide_feature(feature, rec, id)

Convert a proprotein or propeptide feature to a Bakta-style feature.

Parameters:

Name Type Description Default
feature

Bio.SeqFeature The mRNA feature from the GBK.

required
rec

Bio.SeqRecord The record containing the sequence.

required

Returns:

Name Type Description
dict

Bakta-style proprotein feature.

Source code in src/baktfold/io/eukaryotic_to_json.py
def convert_proprotein_propeptide_feature(feature, rec, id):
    """
    Convert a proprotein or propeptide feature to a Bakta-style feature.

    Parameters:
        feature: Bio.SeqFeature
            The mRNA feature from the GBK.
        rec: Bio.SeqRecord
            The record containing the sequence.

    Returns:
        dict: Bakta-style proprotein feature.
    """

    # seq = str(rec.seq)

    # Extract location
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)         # already inclusive after conversion

    # Handle CompoundLocation (join)
    starts = None
    stops = None

    if feature.location.__class__.__name__ == "CompoundLocation":
        starts = []
        stops = []
        for part in feature.location.parts:
            starts.append(int(part.start) + 1)
            stops.append(int(part.end))


    qualifiers = feature.qualifiers

    so_code =  so.SO_PROPEPTIDE.id

    # Get existing db_xref list or default to [so.SO_CDS.id]
    db_xrefs = feature.qualifiers.get("db_xref", [so_code])

    # Append so.SO_CDS.id only if it’s not already present
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)


    propeptide_entry = {
        "type": "propeptide",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "id": id,

        # Join support
        "starts": starts,
        "stops": stops,

        # Multi-valued
        "db_xrefs": qualifiers.get("db_xref", []),

    }


# Feature Key           propeptide


# Definition            propeptide coding sequence; coding sequence for the domain of a 
#                       proprotein that is cleaved to form the mature protein product.

# Optional qualifiers   /allele="text"
#                       /db_xref="<database>:<identifier>"
#                       /experiment="[CATEGORY:]text"
#                       /function="text"
#                       /gene="text"
#                       /gene_synonym="text"
#                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
#                       /locus_tag="text" (single token)
#                       /map="text"
#                       /note="text"
#                       /old_locus_tag="text" (single token)
#                       /product="text"
#                       /pseudo
#                       /pseudogene="TYPE"
#                       /standard_name="text"


    multi_valued = {"experiment", "function",  "gene_synonym",  "inference", "note"}
    single_valued = {"allele", "gene", "locus_tag", "map",    "old_locus_tag", "product", "standard_name", "pseudogene"}

    add_optional_qualifiers(propeptide_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers)
    for flag in ["pseudo",]:
        if flag in qualifiers:
            propeptide_entry[flag] = True

    #  proprotein      join(171053237..171053367,171053712..171053832)
    #                  /gene="Apoa2"
    #                  /gene_synonym="Alp-2; Apo-AII; Apoa-2; ApoA-II; ApoAII;
    #                  Hdl-1"
    #                  /product="apolipoprotein A-II proprotein"  

    return propeptide_entry

convert_protein_bind_feature(feature, rec, id)

Convert a GenBank protein_bind feature to a Bakta-style feature.

Source code in src/baktfold/io/eukaryotic_to_json.py
def convert_protein_bind_feature(feature, rec, id):
    """
    Convert a GenBank protein_bind feature to a Bakta-style feature.
    """

    # Extract location
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)         # already inclusive after conversion

    qualifiers = feature.qualifiers

    # Mandatory qualifier
    bound_moiety = qualifiers.get("bound_moiety", [None])[0]
    if bound_moiety is None:
        raise ValueError(
            f"protein_bind feature {id} is missing mandatory /bound_moiety qualifier"
        )

    so_code =  so.SO_PROTEINBIND.id

    # Get existing db_xref list or default to [so.SO_CDS.id]
    db_xrefs = feature.qualifiers.get("db_xref", [so_code])

    # Append only if it’s not already present
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)

    protein_bind_entry = {
        "type": "protein_bind",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "bound_moiety": bound_moiety,
        "db_xrefs": db_xrefs,
        "id": id,
    }


# Feature Key           protein_bind


# Definition            non-covalent protein binding site on nucleic acid;

# Mandatory qualifiers  /bound_moiety="text"

# Optional qualifiers   /allele="text"
#                       /db_xref="<database>:<identifier>"
#                       /experiment="[CATEGORY:]text"
#                       /function="text"
#                       /gene="text"
#                       /gene_synonym="text"
#                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
#                       /locus_tag="text" (single token)
#                       /map="text"
#                       /note="text"
#                       /old_locus_tag="text" (single token)
#                       /operon="text"
#                       /standard_name="text"

# Comment               note that feature key regulatory with /regulatory_class="ribosome_binding_site"
#                       should be used for ribosome binding sites.


    multi_valued = {"experiment", "function",  "gene_synonym",  "inference", "note"}
    single_valued = {"allele", "gene", "locus_tag", "map",  "operon",  "old_locus_tag", "product", "standard_name"}

    add_optional_qualifiers(protein_bind_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers)
    # for flag in ["trans_splicing"]:
    #     if flag in qualifiers:
    #         protein_bind_entry[flag] = True

    return protein_bind_entry

convert_regulatory_feature(feature, rec, id)

Convert a GenBank regulatory feature to a Bakta-style feature.

Source code in src/baktfold/io/eukaryotic_to_json.py
def convert_regulatory_feature(feature, rec, id):
    """
    Convert a GenBank regulatory feature to a Bakta-style feature.
    """

    # Extract location
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)         # already inclusive after conversion

    qualifiers = feature.qualifiers

    # Mandatory qualifier
    regulatory_class = qualifiers.get("regulatory_class", [None])[0]
    if regulatory_class is None:
        raise ValueError(
            f"regulatory feature {id} is missing mandatory /regulatory_class qualifier"
        )

    so_code =  so.SO_REGULATORY_REGION.id

    # Get existing db_xref list or default to [so.SO_CDS.id]
    db_xrefs = feature.qualifiers.get("db_xref", [so_code])

    # Append only if it’s not already present
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)


    regulatory_entry = {
            "type": "regulatory",
            "sequence": rec.id,
            "start": start,
            "stop": stop,
            "strand": strand,
            "regulatory_class": regulatory_class,
            "db_xrefs": db_xrefs,
            "id": id,
        }


# Feature Key           regulatory


# Definition            any region of sequence that functions in the regulation of
#                       transcription, translation, replication, recombination, or chromatin structure;

# Mandatory qualifiers  /regulatory_class="TYPE"

# Optional qualifiers   /allele="text"
#                       /bound_moiety="text"
#                       /db_xref="<database>:<identifier>"
#                       /experiment="[CATEGORY:]text"
#                       /function="text"
#                       /gene="text"
#                       /gene_synonym="text"
#                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
#                       /locus_tag="text" (single token)
#                       /map="text"
#                       /note="text"
#                       /old_locus_tag="text" (single token)
#                       /operon="text"
#                       /phenotype="text"
#                       /pseudo
#                       /pseudogene="TYPE"
#                       /standard_name="text"

# Comment	              This feature has replaced the following Feature Keys on 15-DEC-2014:
#                       enhancer, promoter, CAAT_signal, TATA_signal, -35_signal, -10_signal,
#                       RBS, GC_signal, polyA_signal, attenuator, terminator, misc_signal.

    multi_valued = {"experiment", "function",  "gene_synonym",  "inference", "note"}
    single_valued = {"allele", "bound_moiety", "gene", "locus_tag", "map",  "operon",  "old_locus_tag", "phenotype", "product", "pseudogene", "standard_name"}

    add_optional_qualifiers(regulatory_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers)
    for flag in ["pseudo"]:
        if flag in qualifiers:
            regulatory_entry[flag] = True

    #  regulatory      195030925..195032349
    #                  /regulatory_class="enhancer"
    #                  /experiment="EXISTENCE:reporter gene assay evidence
    #                  [ECO:0000049][PMID:32912294]"
    #                  /note="C2 STARR-seq-only enhancer starr_03508"
    #                  /function="activates a minimal SCP1 promoter by STARR-seq
    #                  in ground-state (2iL) and metastable (SL) mouse embryonic
    #                  stem cells {active_cell/tissue: mESC(E14 +2i+LIF or
    #                  +serum+LIF)}"
    #                  /db_xref="GeneID:131296982"

    return regulatory_entry

convert_repeat_region_feature(feature, rec, id)

Convert a Prokka GenBank repeat_region (CRISPR) feature to a simplified Bakta-style feature.

Parameters:

Name Type Description Default
feature

Bio.SeqFeature The repeat_region feature (crispr) from the Prokka GBK.

required
rec

Bio.SeqRecord The full GenBank record containing the sequence.

required

Returns:

Name Type Description
dict

Simplified Bakta-style CRISPR feature.

Source code in src/baktfold/io/eukaryotic_to_json.py
def convert_repeat_region_feature(feature, rec, id):
    """
    Convert a Prokka GenBank repeat_region (CRISPR) feature to a simplified Bakta-style feature.

    Parameters:
        feature: Bio.SeqFeature
            The repeat_region feature (crispr) from the Prokka GBK.
        rec: Bio.SeqRecord
            The full GenBank record containing the sequence.

    Returns:
        dict: Simplified Bakta-style CRISPR feature.
    """

    # Coordinates (Bakta uses 1-based)
    strand = "."
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)         # already inclusive after conversion


    qualifiers = feature.qualifiers
    note = qualifiers.get("note", [None])[0]
    rpt_family = qualifiers.get("rpt_family", [None])[0]
    rpt_type = qualifiers.get("rpt_type", [None])[0]
    rpt_unit_seq = qualifiers.get("rpt_unit_seq", [None])[0]

    # always just take the positive strand to get the NT seq (crispr repeat region)
    seq =  str(rec.seq)
    nt_seq = seq[start-1:stop]


# Feature Key           repeat_region


# Definition            region of genome containing repeating units;

# Optional qualifiers   /allele="text"
#                       /db_xref="<database>:<identifier>" 
#                       /experiment="[CATEGORY:]text"
#                       /function="text"
#                       /gene="text"
#                       /gene_synonym="text"
#                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
#                       /locus_tag="text" (single token)
#                       /map="text"
#                       /note="text"
#                       /old_locus_tag="text" (single token)
#                       /rpt_family="text"
#                       /rpt_type=<repeat_type>
#                       /rpt_unit_range=<base_range>
#                       /rpt_unit_seq="text"
#                       /satellite="<satellite_type>[:<class>][ <identifier>]"
#                       /standard_name="text"

    so_code =  so.SO_REPEAT.id

    # Get existing db_xref list or default to [so.SO_CDS.id]
    db_xrefs = feature.qualifiers.get("db_xref", [so_code])

    # Append only if it’s not already present
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)

    # Minimal Bakta-like CRISPR structure
    repeat_region_entry = {
        "type": "repeat_region",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand, # matches Bakta and is required
        "family": rpt_family,       # e.g., "LINE1" - should always be there
        "rpt_type": rpt_type,   
        "repeat_unit": rpt_unit_seq, # the actual consensus repeat if crispr
        "product": note, # won't be the same as Bakta as different lookup method used - but needed for the gff writing
        "nt": nt_seq, # needed for batka .ffn writeout
        "id": id, # bakta_id needed 
        # "locus": None, # no locus tag like Bakta
        "db_xrefs": db_xrefs
    }

    multi_valued = {"experiment", "function",  "gene_synonym",  "inference", "note" }
    single_valued = {"satellite", "gene",  "locus_tag", "map",  "old_locus_tag", "operon", "phenotype", "product", "standard_name"}

    qualifiers = feature.qualifiers

    add_optional_qualifiers(repeat_region_entry, qualifiers, single_valued, multi_valued)


    return repeat_region_entry

convert_rrna_feature(feature, rec, id)

Convert a GenBank rRNA feature to a Bakta-style feature.

Source code in src/baktfold/io/eukaryotic_to_json.py
def convert_rrna_feature(feature, rec, id):
    """
    Convert a GenBank rRNA feature to a Bakta-style feature.
    """

    # Extract location
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)         # already inclusive after conversion

    qualifiers = feature.qualifiers

    so_code =  so.SO_RRNA.id

    # Get existing db_xref list or default to [so.SO_CDS.id]
    db_xrefs = feature.qualifiers.get("db_xref", [so_code])

    # Append only if it’s not already present
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)

# Feature Key           rRNA


# Definition            mature ribosomal RNA; RNA component of the
#                       ribonucleoprotein particle (ribosome) which assembles
#                       amino acids into proteins.

# Optional qualifiers   /allele="text"
#                       /db_xref="<database>:<identifier>"
#                       /experiment="[CATEGORY:]text"
#                       /function="text"
#                       /gene="text"
#                       /gene_synonym="text"
#                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
#                       /locus_tag="text" (single token)
#                       /map="text"
#                       /note="text"
#                       /old_locus_tag="text" (single token)
#                       /operon="text"
#                       /product="text"
#                       /pseudo
#                       /pseudogene="TYPE"
#                       /standard_name="text"

# Comment               rRNA sizes should be annotated with the /product
#                       qualifier.  


    rrna_entry = {
            "type": "rRNA",
            "sequence": rec.id,
            "start": start,
            "stop": stop,
            "strand": strand,
            "db_xrefs": db_xrefs,
            "id": id,
        }

    multi_valued = {"experiment", "function",  "gene_synonym",  "inference", "note"}
    single_valued = {"allele", "gene", "locus_tag", "map",  "operon",  "old_locus_tag", "product", "pseudogene", "standard_name"}

    add_optional_qualifiers(rrna_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers)
    for flag in ["pseudo"]:
        if flag in qualifiers:
            rrna_entry[flag] = True


    #  rRNA            46413357..46413475
    #                  /gene="n-R5s211"
    #                  /product="5S ribosomal RNA"
    #                  /inference="COORDINATES: nucleotide
    #                  motif:Rfam:12.0:RF00001"
    #                  /inference="COORDINATES: profile:INFERNAL:1.1.1"
    #                  /note="Derived by automated computational analysis using
    #                  gene prediction method: cmsearch."
    #                  /transcript_id="XR_004936691.1"
    #                  /db_xref="GeneID:115487577"
    #                  /db_xref="RFAM:RF00001"
    #                  /db_xref="MGI:MGI:4422076"

    return rrna_entry

convert_sig_peptide_feature(feature, rec, id)

Convert a sig_peptide feature to a Bakta-style feature.

mus musculus chrom 1 NC_000067

Parameters:

Name Type Description Default
feature

Bio.SeqFeature The mRNA feature from the GBK.

required
rec

Bio.SeqRecord The record containing the sequence.

required

Returns:

Name Type Description
dict

Bakta-style sig_peptide feature.

Source code in src/baktfold/io/eukaryotic_to_json.py
def convert_sig_peptide_feature(feature, rec, id):
    """
    Convert a sig_peptide feature to a Bakta-style feature.

    mus musculus chrom 1 NC_000067

    Parameters:
        feature: Bio.SeqFeature
            The mRNA feature from the GBK.
        rec: Bio.SeqRecord
            The record containing the sequence.

    Returns:
        dict: Bakta-style sig_peptide feature.
    """

    # Extract location
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)         # already inclusive after conversion

    # Handle CompoundLocation (join)
    starts = None
    stops = None

    if feature.location.__class__.__name__ == "CompoundLocation":
        starts = []
        stops = []
        for part in feature.location.parts:
            starts.append(int(part.start) + 1)
            stops.append(int(part.end))


    qualifiers = feature.qualifiers

    so_code =  so.SO_SIGNAL_PEPTIDE.id

    # Get existing db_xref list or default to [so.SO_CDS.id]
    db_xrefs = feature.qualifiers.get("db_xref", [so_code])

    # Append only if it’s not already present
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)


    sig_peptide_entry = {
        "type": "sig_peptide",
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "id": id,

        # Join support
        "starts": starts,
        "stops": stops,

        # Multi-valued
        "db_xrefs": qualifiers.get("db_xref", []),

    }




# Feature Key           sig_peptide


# Definition            signal peptide coding sequence; coding sequence for an
#                       N-terminal domain of a secreted protein; this domain is
#                       involved in attaching nascent polypeptide to the
#                       membrane leader sequence;

# Optional qualifiers   /allele="text"
#                       /db_xref="<database>:<identifier>"
#                       /experiment="[CATEGORY:]text"
#                       /function="text"
#                       /gene="text"
#                       /gene_synonym="text"
#                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
#                       /locus_tag="text" (single token)
#                       /map="text"
#                       /note="text"
#                       /old_locus_tag="text" (single token)
#                       /product="text"
#                       /pseudo
#                       /pseudogene="TYPE"
#                       /standard_name="text"


    multi_valued = {"experiment", "function",  "gene_synonym",  "inference", "note"}
    single_valued = {"allele", "gene", "locus_tag", "map",  "operon",  "old_locus_tag", "phenotype", "product", "pseudogene", "standard_name"}

    add_optional_qualifiers(sig_peptide_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers)
    for flag in ["pseudo"]:
        if flag in qualifiers:
            sig_peptide_entry[flag] = True


    #  sig_peptide     complement(join(194768584..194768588,
    #                  194774407..194774533))
    #                  /gene="Cd46"
    #                  /gene_synonym="Mcp"
    #                  /inference="COORDINATES: ab initio prediction:SignalP:6.0"

    return sig_peptide_entry

convert_transit_peptide_feature(feature, rec, id)

Convert a transit_peptide feature to a Bakta-style feature.

mus musculus chrom 1 NC_000067

Parameters:

Name Type Description Default
feature

Bio.SeqFeature The mRNA feature from the GBK.

required
rec

Bio.SeqRecord The record containing the sequence.

required

Returns:

Name Type Description
dict

Bakta-style transit_peptide feature.

Source code in src/baktfold/io/eukaryotic_to_json.py
def convert_transit_peptide_feature(feature, rec, id):
    """
    Convert a transit_peptide feature to a Bakta-style feature.

    mus musculus chrom 1 NC_000067

    Parameters:
        feature: Bio.SeqFeature
            The mRNA feature from the GBK.
        rec: Bio.SeqRecord
            The record containing the sequence.

    Returns:
        dict: Bakta-style transit_peptide feature.
    """

    # seq = str(rec.seq)

    # Extract location
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)         # already inclusive after conversion

    # Handle CompoundLocation (join)
    starts = None
    stops = None

    if feature.location.__class__.__name__ == "CompoundLocation":
        starts = []
        stops = []
        for part in feature.location.parts:
            starts.append(int(part.start) + 1)
            stops.append(int(part.end))


    qualifiers = feature.qualifiers

    so_code =  so.SO_TRANSIT_PEPTIDE.id

    # Get existing db_xref list or default to [so.SO_CDS.id]
    db_xrefs = feature.qualifiers.get("db_xref", [so_code])

    # Append only if it’s not already present
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)


    transit_peptide_entry = {
            "type": "transit_peptide",
            "sequence": rec.id,
            "start": start,
            "stop": stop,
            "strand": strand,
            "id": id,

            # Join support
            "starts": starts,
            "stops": stops,

            # Multi-valued
            "db_xrefs": qualifiers.get("db_xref", []),

        }



# Feature Key           transit_peptide


# Definition            transit peptide coding sequence; coding sequence for an
#                       N-terminal domain of a nuclear-encoded organellar
#                       protein; this domain is involved in post-translational
#                       import of the protein into the organelle;

# Optional qualifiers   /allele="text"
#                       /db_xref="<database>:<identifier>"
#                       /experiment="[CATEGORY:]text"
#                       /function="text"
#                       /gene="text"
#                       /gene_synonym="text"
#                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
#                       /locus_tag="text" (single token)
#                       /map="text"
#                       /note="text"
#                       /old_locus_tag="text" (single token)
#                       /product="text"
#                       /pseudo
#                       /pseudogene="TYPE"
#                       /standard_name="text"

    multi_valued = {"experiment", "function",  "gene_synonym",  "inference", "note"}
    single_valued = {"allele", "gene", "locus_tag", "map",  "operon",  "old_locus_tag", "phenotype", "product", "pseudogene", "standard_name"}

    add_optional_qualifiers(transit_peptide_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers)
    for flag in ["pseudo"]:
        if flag in qualifiers:
            transit_peptide_entry[flag] = True

    #  transit_peptide complement(join(180006550..180006849,
    #                  180009627..180009803))
    #                  /gene="Coq8a"
    #                  /gene_synonym="4632432J16Rik; Adck3; Cabc1; mKIAA0451"
    #                  /note="Mitochondrion.
    #                  /evidence=ECO:0000250|UniProtKB:Q8NI60; propagated from
    #                  UniProtKB/Swiss-Prot (Q60936.2)"

    return transit_peptide_entry

convert_trna_feature(feature, seq_record, id)

Convert a funannotate tRNA SeqFeature to a Bakta tRNA JSON entry.

Source code in src/baktfold/io/eukaryotic_to_json.py
def convert_trna_feature(feature, seq_record, id):
    """
    Convert a funannotate tRNA SeqFeature to a Bakta tRNA JSON entry.
    """

    # ------------ Location ------------

    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)         # already inclusive after conversion

    # Handle CompoundLocation (join)
    starts = None
    stops = None

    if feature.location.__class__.__name__ == "CompoundLocation":
        starts = []
        stops = []
        for part in feature.location.parts:
            starts.append(int(part.start) + 1)
            stops.append(int(part.end))



    # ------------ Extract nt sequence ------------
    nt_seq = feature.extract(seq_record.seq)
    nt = str(nt_seq)

    # ------------ Basic qualifiers ------------
    product = feature.qualifiers.get("product", [None])[0]

    qualifiers = feature.qualifiers

    # fall back to start_stop_strand if there is no locus tag
    if 'locus_tag' in qualifiers and qualifiers['locus_tag']:
        locus_tag = qualifiers['locus_tag'][0]
    else:
        logger.warning(f"No locus_tag found for feature {id}")
        locus_tag = f"{GENOME_RANDOM_BACKUP_LOCUSTAG_STR}_{start}_{stop}"
        logger.warning(f"Generating a locus_tag: {locus_tag}")

    # ------------ amino acid ------------
    # Prokka product examples:
    #   "tRNA-Trp"
    #   "tRNA-Leu"
    amino_acid = None
    if product and product.startswith("tRNA-"):
        amino_acid = product.split("-")[1]


    # ------------ anticodon ------------
    anti_codon = None

    # anticodons are in notes

    notes = feature.qualifiers.get("note", [])

    # Expect a note like: "tRNA-Ser(gga)"
    for note in notes:
        # Remove spaces for safety
        n = note.replace(" ", "")

        # Extract part inside parentheses (anticodon)
        if "(" in n and ")" in n:
            anti_codon = n.split("(")[1].split(")")[0].lower()

        # Extract amino acid:
        # tRNA-Ser(gga) → "Ser"
        if "tRNA-" in n:
            try:
                # tRNA-Ser(gga) → "Ser(gga)" → split('(')[0] → "Ser"
                aa_section = n.split("tRNA-")[1]
                aa_clean = aa_section.split("(")[0]
                amino_acid = aa_clean
            except Exception:
                pass

    # ------------ Anti-codon position detection ------------
    # Prokka doesnt have it - dont include
    # anti_codon_pos = None

    # ------------ score ------------
    # nothing in prokka
    score = None

    # ------------ db_xrefs ------------
    # doesnt exist for prokka
    db_xrefs = feature.qualifiers.get("db_xref", [])
    # add so_term
    so_term = AMINO_ACID_DICT.get(amino_acid.lower(), ('', None))[1]

    if (so_term):
        db_xrefs.append(so_term.id)

    # ------------ final Bakta-form dict ------------
    bakta_trna_entry = {
        "type": "tRNA",
        "sequence": seq_record.id,
        "start": start,
        "stop": stop,
        "strand": strand,
        "gene": "trn" + (amino_acid[0].lower() if amino_acid else "?"),
        "product": product,
        "amino_acid": amino_acid,
        "anti_codon": anti_codon,
        "score": score,
        "nt": nt,
        "db_xrefs": db_xrefs,
       #  "anti_codon_pos": anti_codon_pos,  dont include, not in output
        "locus": locus_tag,
        "id": id,
    }

# Feature Key           tRNA


# Definition            mature transfer RNA, a small RNA molecule (75-85 bases
#                       long) that mediates the translation of a nucleic acid
#                       sequence into an amino acid sequence;

# Optional qualifiers   /allele="text"
#                       /anticodon=(pos:<location>,aa:<amino_acid>,seq:<text>)
#                       /circular_RNA
#                       /db_xref="<database>:<identifier>"
#                       /experiment="[CATEGORY:]text"
#                       /function="text"
#                       /gene="text"
#                       /gene_synonym="text"
#                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
#                       /locus_tag="text" (single token)
#                       /map="text"
#                       /note="text"
#                       /old_locus_tag="text" (single token)
#                       /operon="text"
#                       /product="text"
#                       /pseudo
#                       /pseudogene="TYPE"
#                       /standard_name="text"
#                       /trans_splicing

    multi_valued = {"experiment", "function",  "gene_synonym",  "inference", "note" }
    single_valued = {"allele",  "map",    "old_locus_tag", "standard_name"}

    qualifiers = feature.qualifiers

    add_optional_qualifiers(bakta_trna_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers)
    for flag in ["circular_RNA", "pseudo", "trans_splicing"]:
        if flag in qualifiers:
            bakta_trna_entry[flag] = flag in qualifiers

    return bakta_trna_entry

convert_utr_region_feature(feature, rec, id, three)

Convert a UTR GenBank feature to a simplified Bakta-style feature.

Parameters:

Name Type Description Default
feature

Bio.SeqFeature The UTR feature from the GBK.

required
rec

Bio.SeqRecord The full GenBank record containing the sequence.

required

Returns:

Name Type Description
dict

Simplified Bakta-style feature.

Source code in src/baktfold/io/eukaryotic_to_json.py
def convert_utr_region_feature(feature, rec, id, three):
    """
    Convert a UTR GenBank feature to a simplified Bakta-style feature.

    Parameters:
        feature: Bio.SeqFeature
            The UTR feature from the GBK.
        rec: Bio.SeqRecord
            The full GenBank record containing the sequence.

    Returns:
        dict: Simplified Bakta-style feature.
    """

    if three:
        type = "3'UTR"
        so_code =  so.SO_3UTR.id
    else:
        type = "5'UTR"
        so_code =  so.SO_5UTR.id

    # Extract location
    strand = "+" if feature.location.strand == 1 else "-"
    start = int(feature.location.start) + 1   # Bakta uses 1-based inclusive
    stop  = int(feature.location.end)         # already inclusive after conversion


    qualifiers = feature.qualifiers
    note = qualifiers.get("note", [None])[0]


    # fall back to start_stop_strand if there is no locus tag
    if 'locus_tag' in qualifiers and qualifiers['locus_tag']:
        locus_tag = qualifiers['locus_tag'][0]
    else:
        logger.warning(f"No locus_tag found for feature {id}")
        locus_tag = f"{GENOME_RANDOM_BACKUP_LOCUSTAG_STR}_{start}_{stop}"
        logger.warning(f"Generating a locus_tag: {locus_tag}")

    # always just take the positive strand to get the NT seq (UTR region)
    seq =  str(rec.seq)
    nt_seq = seq[start-1:stop]


# Feature Key           3'UTR


# Definition            1) region at the 3' end of a mature transcript (following 
#                       the stop codon) that is not translated into a protein;
#                       2) region at the 3' end of an RNA virus (following the last stop
#                       codon) that is not translated into a protein;

# Optional qualifiers   /allele="text"
#                       /db_xref="<database>:<identifier>"
#                       /experiment="[CATEGORY:]text"
#                       /function="text"
#                       /gene="text"
#                       /gene_synonym="text"
#                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
#                       /locus_tag="text" (single token)
#                       /map="text"
#                       /note="text"
#                       /old_locus_tag="text" (single token)
#                       /standard_name="text"
#                       /trans_splicing



# Feature Key           5'UTR


# Definition            1) region at the 5' end of a mature transcript (preceding 
#                       the initiation codon) that is not translated into a protein;
#                       2) region at the 5' end of an RNA virus genome (preceding the first 
#                       initiation codon) that is not translated into a protein;

# Optional qualifiers   /allele="text"
#                       /db_xref="<database>:<identifier>"
#                       /experiment="[CATEGORY:]text"
#                       /function="text"
#                       /gene="text"
#                       /gene_synonym="text"
#                       /inference="[CATEGORY:]TYPE[ (same species)][:EVIDENCE_BASIS]"
#                       /locus_tag="text" (single token)
#                       /map="text"
#                       /note="text"
#                       /old_locus_tag="text" (single token)
#                       /standard_name="text"
#                       /trans_splicing

    so_code =  so.SO_REPEAT.id

    # Get existing db_xref list or default to [so.SO_CDS.id]
    db_xrefs = feature.qualifiers.get("db_xref", [so_code])

    # Append only if it’s not already present
    if so_code not in db_xrefs:
        db_xrefs.append(so_code)


    # Minimal Bakta-like structure
    utr_entry = {
        "type": type,
        "sequence": rec.id,
        "start": start,
        "stop": stop,
        "strand": strand, # matches Bakta and is required
        "product": note, 
        "nt": nt_seq, # needed for batka .ffn writeout
        "id": id, # bakta_id needed 
        "db_xrefs": db_xrefs,
        "locus": locus_tag
    }

    multi_valued = {"experiment", "function",  "gene_synonym",  "inference", "note" }
    single_valued = {"allele", "gene",   "map",  "old_locus_tag", "operon", "phenotype", "standard_name"}

    qualifiers = feature.qualifiers

    add_optional_qualifiers(utr_entry, qualifiers, single_valued, multi_valued)

    # Flags (boolean-like qualifiers)
    for flag in ["pseudo", "trans_splicing"]:
        if flag in qualifiers:
            utr_entry[flag] = flag in qualifiers

    return utr_entry

get_bakta_style_id_from_locus_tag(records)

Gets 10 char bakta-style ID tag based off the 8 char locus tag in first CDS on the first record + 2 random chars

Assumes all records will have the same locus tag prefix

Will always add 2 chars to make ID unique vs locus tag

Source code in src/baktfold/io/eukaryotic_to_json.py
def get_bakta_style_id_from_locus_tag(records):
    """
    Gets 10 char bakta-style ID tag based off the 8 char locus tag in first CDS on the first  record + 2 random chars

    Assumes all records will have the same locus tag prefix

    Will always add 2 chars to make ID unique vs locus tag
    """

    if not records:
        raise ValueError("No GenBank records found.")

    for record in records:

        for feat in record.features:
            if feat.type == "CDS":
                locus_tag_list = feat.qualifiers.get("locus_tag") # returns None if doesn't exist

                if locus_tag_list:
                    locus_tag = locus_tag_list[0]

                    if len(locus_tag) > 7:

                        locus_tag_prefix = locus_tag[:-7] # trims off _000001 from CDS

                        rand_two_chars = random_n_letter_id(2)

                        # by default  locus tag is 8 chars. So this returns a 10 char string (same as bakta defaults)

                        id_tag = f"{locus_tag_prefix}{rand_two_chars}"

                        return id_tag


                    else:
                        return random_n_letter_id(10)

                # fallback if locus_tag missing or too short
                return random_n_letter_id(10)

    # No CDS feature found at all (shouldn't happen)
    return random_n_letter_id(10)

random_n_letter_id(n=4)

generates a n letter id prefix

n=2 to append to locus tag for bakta id to make it different n=10 if the locus tag is somehow missing (should never happen)

Source code in src/baktfold/io/eukaryotic_to_json.py
def random_n_letter_id(n=4):
    """
    generates a n letter id prefix 

    n=2 to append to   locus tag  for bakta id to make it different
    n=10 if the locus tag is somehow missing (should never happen) 
    """
    return ''.join(random.choices(string.ascii_uppercase, k=n))

write_bakta_outputs(data, features, features_by_sequence, output, prefix, custom_db, euk, has_duplicate_locus, fast, translation_table, prokka, other_genbank, cds_program, trna_program, rrna_program, tmrna_program, ncrna_program, bakta_version)

Writes the bakta outputs to a given path.

Parameters:

Name Type Description Default
data dict

The dictionary containing the bakta outputs.

required
features Sequence[dict]

The sequence of dictionaries containing the features.

required
features_by_sequence Sequence[dict]

The sequence of dictionaries containing the features by sequence.

required
output Path

The path to save the bakta outputs to.

required
prefix str

The prefix to use for the bakta outputs.

required
custom_db bool

A boolean indicating whether a custom database is used.

required
euk bool

A boolean indicating whether the sequences are eukaryotic.

required
has_duplicate_locus bool

A boolean indicating whether there are duplicate loci.

required
fast bool

If True, skips AFDB step

required
translation_table str

Translation table inferred from input JSON

required
prokka bool

boolean indicating if prokka was used to do initial annotation

required
other_genbank bool

boolean indicating if other genbank (prokaryotic, genbank_to) was used to do initial annotation

required
bakta_version dict

Dictionary of Bakta (or whatever other program) was used for the initial annotation

required

Returns:

Type Description

None.

Examples:

>>> write_bakta_outputs(data, features, features_by_sequence, output, prefix, custom_db, euk, has_duplicate_locus, fast, translation_table, prokka, other_genbank, bakta_version)
Source code in src/baktfold/io/io.py
def write_bakta_outputs(data: dict, features: Sequence[dict], features_by_sequence: Sequence[dict] , 
                        output: Path, prefix: str, custom_db: bool, euk: bool, has_duplicate_locus: bool,
                        fast: bool, translation_table: int, prokka: bool, other_genbank: bool,
                        cds_program: str ,trna_program: str, rrna_program: str, tmrna_program: str, ncrna_program: str, bakta_version: dict):
    """
    Writes the bakta outputs to a given path.

    Args:
      data (dict): The dictionary containing the bakta outputs.
      features (Sequence[dict]): The sequence of dictionaries containing the features.
      features_by_sequence (Sequence[dict]): The sequence of dictionaries containing the features by sequence.
      output (Path): The path to save the bakta outputs to.
      prefix (str): The prefix to use for the bakta outputs.
      custom_db (bool): A boolean indicating whether a custom database is used.
      euk (bool): A boolean indicating whether the sequences are eukaryotic.
      has_duplicate_locus (bool): A boolean indicating whether there are duplicate loci.
      fast (bool): If True, skips AFDB step
      translation_table (str): Translation table inferred from input JSON
      prokka (bool): boolean indicating if prokka was used to do initial annotation
      other_genbank (bool): boolean indicating if other genbank (prokaryotic, genbank_to) was used to do initial annotation
      bakta_version (dict): Dictionary of Bakta (or whatever other program) was used for the initial annotation

    Returns:
      None.

    Examples:
      >>> write_bakta_outputs(data, features, features_by_sequence, output, prefix, custom_db, euk, has_duplicate_locus, fast, translation_table, prokka, other_genbank, bakta_version)
    """

    #logger.info(f'selected features={len(features)}')

    logger.info('writing human readable TSV...')
    tsv_path: Path = Path(output) / f"{prefix}.tsv"
    tsv.write_features(data['sequences'], features_by_sequence, tsv_path)

    logger.info('writing GFF3...')
    gff3_path: Path = Path(output) / f"{prefix}.gff3"
    # fix later prokka
    gff.write_features(data, features_by_sequence, gff3_path, prokka, euk, other_genbank, cds_program, trna_program, tmrna_program, rrna_program, ncrna_program)

    logger.info('writing INSDC GenBank & EMBL...')
    genbank_path: Path = Path(output) / f"{prefix}.gbff"
    embl_path: Path = Path(output) / f"{prefix}.embl"
    insdc.write_features(data, features, genbank_path, embl_path, prokka, euk, other_genbank, translation_table, cds_program, trna_program, tmrna_program, rrna_program, ncrna_program)

    logger.info('writing genome sequences...')
    fna_path: Path = Path(output) / f"{prefix}.fna"
    fasta.export_sequences(data['sequences'], fna_path, description=True, wrap=True)

    logger.info('writing feature nucleotide sequences...')
    ffn_path: Path = Path(output) / f"{prefix}.ffn"
    fasta.write_ffn(features, ffn_path)

    logger.info('writing translated CDS sequences...')
    faa_path: Path = Path(output) / f"{prefix}.faa"
    fasta.write_faa(features, faa_path)

    # inference here is the different databases?
    annotations_path: Path = Path(output) / f"{prefix}.inference.tsv"
    if custom_db:
        header_columns = ['Locus', 'Length', 'Product', 'Swissprot', 'AFDBClusters', 'PDB', 'CATH', 'Custom_DB']
        if has_duplicate_locus:
            header_columns = ['Locus', 'ID', 'Product', 'Swissprot', 'AFDBClusters', 'PDB', 'CATH', 'Custom_DB']
    else:
        header_columns = ['Locus', 'Length', 'Product', 'Swissprot', 'AFDBClusters', 'PDB', 'CATH']
        if has_duplicate_locus:
            header_columns = ['Locus', 'ID', 'Product', 'Swissprot', 'AFDBClusters', 'PDB', 'CATH']

    # Remove 'AFDBClusters' if fast is True
    if fast:
        header_columns = [col for col in header_columns if col != 'AFDBClusters']

    # flatten all features across sequences
    all_features = [
        feat
        for features in features_by_sequence.values()
        for feat in features
    ]

    logger.info(f'Exporting annotations (TSV) to: {annotations_path}')

    selected_features = []


    for seq_id, features in features_by_sequence.items():
        for feat in features:
            # get() ensures we don't crash if the key doesn't exist
            if 'hypothetical' in feat or 'baktfold' in feat:
                selected_features.append(feat)

    tsv.write_protein_features(selected_features, header_columns, annotations_path, custom_db, has_duplicate_locus, fast=fast)

    # write summary file

    write_summary_txt_file(output, prefix, all_features)

    logger.info('write machine readable JSON...')
    json_path: Path = Path(output) / f"{prefix}.json"
    json.write_json(data, features, json_path, bakta_version)

write_bakta_proteins_outputs(aas, output, prefix, custom_db, fast, bakta_version)

Writes the bakta protein outputs to a given path.

Parameters:

Name Type Description Default
aas Sequence[dict]

The sequence of dictionaries containing the amino acids.

required
output Path

The path to save the bakta protein outputs to.

required
prefix str

The prefix to use for the bakta protein outputs.

required
custom_db bool

A boolean indicating whether a custom database is used.

required
fast bool

If True, skips AFDB step

required
bakta_version dict

Original Bakta version

required

Returns:

Type Description

None.

Examples:

>>> write_bakta_proteins_outputs(aas, output, prefix, custom_db)
Source code in src/baktfold/io/io.py
def write_bakta_proteins_outputs(aas: Sequence[dict], output: Path, prefix: str, custom_db: bool, fast: bool, bakta_version: dict):
    """
    Writes the bakta protein outputs to a given path.

    Args:
      aas (Sequence[dict]): The sequence of dictionaries containing the amino acids.
      output (Path): The path to save the bakta protein outputs to.
      prefix (str): The prefix to use for the bakta protein outputs.
      custom_db (bool): A boolean indicating whether a custom database is used.
      fast (bool): If True, skips AFDB step
      bakta_version (dict): Original Bakta version

    Returns:
      None.

    Examples:
      >>> write_bakta_proteins_outputs(aas, output, prefix, custom_db)
    """

    # remove fields that were mocked to avoid baktfold crashing but not in the bakta protein JSON outputs
    fields_to_remove = ['sequence', 'start', 'stop', 'strand', 'frame']

    for aa in aas:
        for f in fields_to_remove:
            aa.pop(f, None)

    annotations_path: Path = Path(output) / f"{prefix}.tsv"
    if custom_db:
        header_columns = ['ID', 'Length', 'Product', 'Swissprot', 'AFDBClusters', 'PDB', 'CATH', 'Custom_DB']
    else:
        header_columns = ['ID', 'Length', 'Product', 'Swissprot', 'AFDBClusters', 'PDB', 'CATH']

    if fast:
        header_columns = [col for col in header_columns if col != 'AFDBClusters']


    logger.info(f'Exporting annotations (TSV) to: {annotations_path}')
    tsv.write_protein_features(aas, header_columns, annotations_path, custom_db, has_duplicate_locus=False, fast=fast)


    # do i combine the tophits tsvs, sort by column, add a column for db and put out as one tsv

    full_annotations_path: Path = Path(output) / f"{prefix}.json"
    logger.info(f'Full annotations (JSON): {full_annotations_path}')
    json.write_json({'features': aas}, aas, full_annotations_path, bakta_version)


    #### don't write hyps I think as tsv

    # hypotheticals_path = output_path.joinpath(f'{cfg.prefix}.hypotheticals.tsv')
    # header_columns = ['ID', 'Length', 'Mol Weight [kDa]', 'Iso El. Point', 'Pfam hits']
    # hypotheticals = hypotheticals = [aa for aa in aas if 'hypothetical' in aa]
    # print(f'\tinformation on hypotheticals (TSV): {hypotheticals_path}')
    # tsv.write_protein_features(hypotheticals, header_columns, map_hypothetical_columns, hypotheticals_path)

    aa_output_path: Path = Path(output) / f"{prefix}.faa"
    logger.info(f'Annotated sequences (Fasta): {aa_output_path}')
    fasta.write_faa(aas, aa_output_path)

    write_summary_txt_file(output, prefix, aas)

write_foldseek_tophit(tophit_df, pdb_tophit_path)

Writes the foldseek tophits to a given path.

Parameters:

Name Type Description Default
tophit_df pd.DataFrame

The dataframe containing the foldseek tophits.

required
pdb_tophit_path Path

The path to save the foldseek tophits to.

required

Returns:

Type Description

None.

Examples:

>>> write_foldseek_tophit(tophit_df, pdb_tophit_path)
Source code in src/baktfold/io/io.py
def write_foldseek_tophit(tophit_df: pd.DataFrame, pdb_tophit_path: Path):
    """
    Writes the foldseek tophits to a given path.

    Args:
      tophit_df (pd.DataFrame): The dataframe containing the foldseek tophits.
      pdb_tophit_path (Path): The path to save the foldseek tophits to.

    Returns:
      None.

    Examples:
      >>> write_foldseek_tophit(tophit_df, pdb_tophit_path)
    """
    logger.info(f"Saving foldseek tophits to {pdb_tophit_path}")
    tophit_df.to_csv(pdb_tophit_path, sep="\t", index=False)

map_aa_columns(feat, custom_db, has_duplicate_locus, fast)

Maps amino acid columns.

Parameters:

Name Type Description Default
feat dict

The dictionary containing the features.

required
custom_db bool

A boolean indicating whether a custom database is used.

required
has_duplicate_locus bool

A boolean indicating whether there are duplicate loci.

required
fast bool

A boolean indicating whether AFDBclusters Foldseek search should be skipped

required

Returns:

Type Description
Sequence[str]

Sequence[str]: A sequence of strings containing the mapped amino acid columns.

Examples:

>>> map_aa_columns({'locus': 'ABC', 'length': 100, 'product': 'protein'}, False, False)
['ABC', '100', 'protein', '', '', '', '']
Source code in src/baktfold/io/tsv.py
def map_aa_columns(feat: dict, custom_db: bool, has_duplicate_locus: bool, fast: bool) -> Sequence[str]:
    """
    Maps amino acid columns.

    Args:
      feat (dict): The dictionary containing the features.
      custom_db (bool): A boolean indicating whether a custom database is used.
      has_duplicate_locus (bool): A boolean indicating whether there are duplicate loci.
      fast (bool): A boolean indicating whether AFDBclusters Foldseek search should be skipped

    Returns:
      Sequence[str]: A sequence of strings containing the mapped amino acid columns.

    Examples:
      >>> map_aa_columns({'locus': 'ABC', 'length': 100, 'product': 'protein'}, False, False)
      ['ABC', '100', 'protein', '', '', '', '']
    """
    # Ensure length exists
    if 'length' not in feat:
        feat['length'] = int(len(feat['nt']) / 3)

    xrefs = feat.get('db_xrefs', [])

    # Extract dbxref groups once
    def join_filtered(prefix: str, replacement: str = None):
        """
    Joins filtered database cross-references.

    Args:
      prefix (str): The prefix to filter by.
      replacement (str): The string to replace the prefix with. Defaults to None.

    Returns:
      str: The joined filtered database cross-references.

    Examples:
      >>> join_filtered('swissprot', 'afdb_v6:')
      'afdb_v6:'
    """
        if replacement is None:
            replacement = prefix
        return ','.join(
            db.replace(replacement, '') for db in xrefs
            if prefix in db
        )

    swissprot   = join_filtered('swissprot', 'afdb_v6:')
    afdbclust   = join_filtered('afdbclusters_', 'afdb_v6:')
    pdb         = join_filtered('pdb:')
    cath        = join_filtered('cath:')
    custom_refs = join_filtered('custom:', 'custom:custom_')

    # Build the output row
    row = [feat['locus']]

    # add id if multiple CDS per Locus in that record (euks)
    if has_duplicate_locus:
        row.append(feat['id'])

    row.extend([
        str(feat['length']),
        feat['product'],
        swissprot,
    ])

    # Only add AFDBClusters if not in fast mode
    if not fast:
        row.append(afdbclust)

    # Always add these
    row.extend([
        pdb,
        cath,
    ])

    if custom_db:
        row.append(custom_refs)

    return row

write_feature_inferences(sequences, features_by_sequence, tsv_path)

Export feature inference statistics in TSV format.

Source code in src/baktfold/io/tsv.py
def write_feature_inferences(sequences: Sequence[dict], features_by_sequence: Dict[str, dict], tsv_path: Path):
    """Export feature inference statistics in TSV format."""
    logger.info('write tsv: path=%s', tsv_path)

    with tsv_path.open('wt') as fh:
        fh.write('# Annotated with Baktfold\n')
        fh.write(f'# Software: v{cfg.version}\n')
        fh.write(f"# Database: v{cfg.version}\n") # fix later
        #fh.write(f"# Database: v{cfg.db_info['major']}.{cfg.db_info['minor']}, {cfg.db_info['type']}\n")
        fh.write(f'# DOI: {bc.BAKTFOLD_DOI}\n')
        fh.write(f'# URL: {bc.BAKTFOLD_URL}\n')
        fh.write('#Sequence Id\tType\tStart\tStop\tStrand\tLocus Tag\tScore\tEvalue\tQuery Cov\tSubject Cov\tId\tAccession\n')

        for seq in sequences:
            for feat in features_by_sequence[seq['id']]:
                if(feat['type'] in [bc.FEATURE_CDS, bc.FEATURE_SORF]):
                    score, evalue, query_cov, subject_cov, identity, accession = None, None, None, None, None, '-'
                    if('ups' in feat or 'ips' in feat):
                        query_cov = 1
                        subject_cov = 1
                        identity = 1
                        evalue = 0
                        accession = f"{bc.DB_XREF_UNIREF}:{feat['ips'][DB_IPS_COL_UNIREF100]}" if 'ips' in feat else f"{bc.DB_XREF_UNIPARC}:{feat['ups'][DB_UPS_COL_UNIPARC]}"
                    elif('psc' in feat or 'pscc' in feat):
                        psc_type = 'psc' if 'psc' in feat else 'pscc'
                        query_cov = feat[psc_type]['query_cov']
                        subject_cov = feat[psc_type].get('subject_cov', -1)
                        identity = feat[psc_type]['identity']
                        score = feat[psc_type].get('score', -1)
                        evalue = feat[psc_type].get('evalue', -1)
                        accession = f"{bc.DB_XREF_UNIREF}:{feat['psc'][DB_PSC_COL_UNIREF90]}" if 'psc' in feat else f"{bc.DB_XREF_UNIREF}:{feat['pscc'][DB_PSCC_COL_UNIREF50]}"
                    fh.write('\t'.join(
                        [
                            feat['sequence'] if 'sequence' in feat else feat['contig'],  # <1.10.0 compatibility
                            feat['type'],
                            str(feat['start']),
                            str(feat['stop']),
                            feat['strand'],
                            feat['locus'],
                            f"{score:0.1f}" if score != None else '-',
                            ('0.0' if evalue == 0 else f"{evalue:1.1e}") if evalue != None else '-',
                            ('1.0' if query_cov == 1 else f"{query_cov:0.3f}") if query_cov != None else '-',
                            ('1.0' if subject_cov == 1 else f"{subject_cov:0.3f}") if subject_cov != None else '-',
                            ('1.0' if identity == 1 else f"{identity:0.3f}") if identity != None else '-',
                            accession
                        ])
                    )
                    fh.write('\n')
                elif(feat['type'] in [bc.FEATURE_T_RNA, bc.FEATURE_R_RNA, bc.FEATURE_NC_RNA, bc.FEATURE_NC_RNA_REGION]):
                    accession = '-' if feat['type'] == bc.FEATURE_T_RNA else [xref for xref in feat['db_xrefs'] if bc.DB_XREF_RFAM in xref][0]
                    fh.write('\t'.join(
                        [
                            feat['sequence'] if 'sequence' in feat else feat['contig'],  # <1.10.0 compatibility
                            feat['type'],
                            str(feat['start']),
                            str(feat['stop']),
                            feat['strand'],
                            feat['locus'] if 'locus' in feat else '-',
                            f"{feat['score']:0.1f}",
                            ('0.0' if feat['evalue'] == 0 else f"{feat['evalue']:1.1e}") if 'evalue' in feat else '-',
                            ('1.0' if feat['query_cov'] == 1 else f"{feat['query_cov']:0.3f}") if 'query_cov' in feat else '-',
                            ('1.0' if feat['subject_cov'] == 1 else f"{feat['subject_cov']:0.3f}") if 'subject_cov' in feat else '-',
                            ('1.0' if feat['identity'] == 1 else f"{feat['identity']:0.3f}") if 'identity' in feat else '-',
                            accession
                        ])
                    )
                    fh.write('\n')
    return

write_features(sequences, features_by_sequence, tsv_path)

Export features in TSV format.

Source code in src/baktfold/io/tsv.py
def write_features(sequences: Sequence[dict], features_by_sequence: Dict[str, dict], tsv_path: Path):
    """Export features in TSV format."""
    logger.info(f'write feature tsv: path={tsv_path}')

    with tsv_path.open('wt') as fh:
        fh.write('# Annotated with Baktfold\n')
        fh.write(f'# Software: v{cfg.version}\n')
        fh.write(f"# Database: v{cfg.version}\n") # fix later
        #fh.write(f"# Database: v{cfg.db_info['major']}.{cfg.db_info['minor']}, {cfg.db_info['type']}\n")
        fh.write(f'# DOI: {bc.BAKTFOLD_DOI}\n')
        fh.write(f'# URL: {bc.BAKTFOLD_URL}\n')
        fh.write('#Sequence Id\tType\tStart\tStop\tStrand\tLocus Tag\tGene\tProduct\tDbXrefs\n')

        for seq in sequences:
            for feat in features_by_sequence[seq['id']]:
                seq_id = feat['sequence'] if 'sequence' in feat else feat['contig']  # <1.10.0 compatibility
                feat_type = feat['type']
                if(feat_type == bc.FEATURE_GAP):
                    feat_type = bc.INSDC_FEATURE_ASSEMBLY_GAP if feat['length'] >= 100 else bc.INSDC_FEATURE_GAP

                gene = feat['gene'] if feat.get('gene', None) else ''
                product = feat.get('product', '')
                if(bc.PSEUDOGENE in feat):
                    product = f"(pseudo) {product}"
                elif(feat.get('truncated', '') == bc.FEATURE_END_5_PRIME):
                    product = f"(5' truncated) {product}"
                elif(feat.get('truncated', '') == bc.FEATURE_END_3_PRIME):
                    product = f"(3' truncated) {product}"
                elif(feat.get('truncated', '') == bc.FEATURE_END_BOTH):
                    product = f"(partial) {product}"

                def s(x):
                    return '' if x is None else str(x)

                fh.write('\t'.join(
                    [
                        seq_id,
                        feat_type,
                        str(feat['start']),
                        str(feat['stop']),
                        str(feat['strand']),
                        s(feat.get('locus')), # handles None → ''
                        s(gene),        # handles None → ''
                        s(product),     # handles None → ''
                        ', '.join(sorted(feat.get('db_xrefs', [])))
                    ])
                )
                fh.write('\n')
                if(feat_type == bc.FEATURE_CRISPR):
                    i = 0
                    # spacers and repeats wont exist if Prokka input
                    spacers = feat.get('spacers', [])
                    repeat = feat.get('repeat', [])

                    if len(spacers) > 0 and len(repeat) > 0: 
                    # if not - will just skip
                        while i < len(feat['spacers']):
                            repeat = feat['repeats'][i]
                            fh.write('\t'.join([seq_id, bc.FEATURE_CRISPR_REPEAT, str(repeat['start']), str(repeat['stop']), repeat['strand'], '', '', f"CRISPR repeat", '']))
                            fh.write('\n')
                            spacer = feat['spacers'][i]
                            fh.write('\t'.join([seq_id, bc.FEATURE_CRISPR_SPACER, str(spacer['start']), str(spacer['stop']), spacer['strand'], '', '', f"CRISPR spacer, sequence {spacer['sequence']}", '']))
                            fh.write('\n')
                            i += 1
                        if(len(feat['repeats']) - 1 == i):
                            repeat = feat['repeats'][i]
                            fh.write('\t'.join([seq_id, bc.FEATURE_CRISPR_REPEAT, str(repeat['start']), str(repeat['stop']), repeat['strand'], '', '', f"CRISPR repeat", '']))
                            fh.write('\n')
    return

write_hypotheticals(hypotheticals, tsv_path)

Export hypothetical information in TSV format.

Source code in src/baktfold/io/tsv.py
def write_hypotheticals(hypotheticals: Sequence[dict], tsv_path: Path):
    """Export hypothetical information in TSV format."""
    logger.info('write hypothetical tsv: path=%s', tsv_path)

    with tsv_path.open('wt') as fh:
        fh.write(f'#Annotated with Baktfold v{cfg.version}, https://github.com/oschwengers/bakta\n')
        #fh.write(f"#Database v{cfg.db_info['major']}.{cfg.db_info['minor']}, https://doi.org/10.5281/zenodo.4247252\n")
        fh.write('#Sequence Id\tStart\tStop\tStrand\tLocus Tag\tMol Weight [kDa]\tIso El. Point\tPfam hits\tDbxrefs\n')
        for hypo in hypotheticals:
            pfams = [f"{pfam['id']}|{pfam['name']}" for pfam in hypo.get('pfams', [])]
            seq_stats = hypo['seq_stats']
            mol_weight = f"{(seq_stats['molecular_weight']/1000):.1f}" if seq_stats['molecular_weight'] else 'NA'
            iso_point = f"{seq_stats['isoelectric_point']:.1f}" if seq_stats['isoelectric_point'] else 'NA'
            seq_id = hypo['sequence'] if 'sequence' in hypo else hypo['contig']  # <1.10.0 compatibility
            fh.write(f"{seq_id}\t{hypo['start']}\t{hypo['stop']}\t{hypo['strand']}\t{hypo.get('locus', '')}\t{mol_weight}\t{iso_point}\t{', '.join(sorted(pfams))}\t{', '.join(sorted(hypo.get('db_xrefs', [])))}\n")
    return

write_protein_features(features, header_columns, tsv_path, custom_db, has_duplicate_locus, fast)

Export protein features in TSV format.

Source code in src/baktfold/io/tsv.py
def write_protein_features(features: Sequence[dict], header_columns: Sequence[str], tsv_path: Path, custom_db: bool, has_duplicate_locus: bool, fast: bool):
    """Export protein features in TSV format."""
    logger.info(f'write protein feature tsv: path={tsv_path}')

    with tsv_path.open('wt') as fh:
        fh.write(f'#Annotated with Baktfold (v{cfg.version}): https://github.com/gbouras13/baktfold\n')
        #fh.write(f"#Database (v{cfg.db_info['major']}.{cfg.db_info['minor']}): https://doi.org/10.5281/zenodo.4247252\n")
        fh.write('\t'.join(header_columns))
        fh.write('\n')
        for feat in features:
            columns = map_aa_columns(feat, custom_db, has_duplicate_locus, fast)
            fh.write('\t'.join(columns))
            fh.write('\n')
    return

parse_protein_input(input_path, faa_path)

handles regular FASTA and gzipped returns cds_dict

Source code in src/baktfold/io/fasta_in.py
def parse_protein_input(input_path, faa_path):
    """
    handles regular FASTA and gzipped 
    returns cds_dict
    """

    # handles regular FASTA and gzipped 
    try:
        if input_path == '':
            raise ValueError('File path argument must be non-empty')
        input_path = Path(input_path).resolve()
        fasta_flag = is_fasta(input_path)
        if fasta_flag:
            logger.info('FASTA input format detected.')
        else:
            logger.info('Bakta JSON input format detected. Only hypothetical proteins from the Bakta JSON input file will be annotated.')
    except:
        logger.error(f'ERROR: annotation file {input_path} not valid!')

    try:
        if fasta_flag:
            logger.info('Attempting to parse input protein sequences as .faa format ...')
            aas = fasta.import_sequences(input_path, False, False)
            bakta_version = {}
        else:
            logger.info('Attempting to parse input protein sequences as Bakta JSON format ...')
            aas, hypotheticals, bakta_version = parse_json_input(input_path, False, False, protein_json_flag=True)
        logger.info(f'Imported sequences={len(aas)}')
    except:
        logger.error('ERROR: wrong file format or unallowed characters in amino acid sequences!')

    mock_start = 1
    for aa in aas:  # rename and mock feature attributes to reuse existing functions
        aa['type'] = bc.FEATURE_CDS
        aa['locus'] = aa['id']
        aa['sequence'] = '-'
        aa['start'] = mock_start
        aa['stop'] = mock_start + aa['length'] - 1
        aa['strand'] = bc.STRAND_UNKNOWN
        aa['frame'] = 1
        mock_start += 100


    if fasta_flag:
        with faa_path.open('wt') as fh:
            for aa in aas:
                fh.write(f">{aa['locus']}\n{aa['aa']}\n")
    else: # write hypothetical proteins to file if JSON input
        with faa_path.open('wt') as fh:
            for aa in hypotheticals:
                fh.write(f">{aa['locus']}\n{aa['aa']}\n")

    logger.info('Parsing complete')

    return aas, bakta_version

encode_annotations(annotations)

Encodes annotations into a string.

Parameters:

Name Type Description Default
annotations dict

A dictionary containing the annotations.

required

Returns:

Name Type Description
str str

The encoded annotations.

Examples:

>>> encode_annotations({
    'ID': 'EHICP_3230_sigpep',
    'Name': 'signal peptide',
    'product': 'signal peptide',
    'score': 0.5,
    'Parent': 'EHICP_3230'
})
'ID=EHICP_3230_sigpep;Name=signal peptide;product=signal peptide;score=0.5;Parent=EHICP_3230'
Source code in src/baktfold/io/gff.py
def encode_annotations(annotations: Dict[str, Union[str, Sequence[str]]]) -> str:
    """
    Encodes annotations into a string.

    Args:
      annotations (dict): A dictionary containing the annotations.

    Returns:
      str: The encoded annotations.

    Examples:
      >>> encode_annotations({
          'ID': 'EHICP_3230_sigpep',
          'Name': 'signal peptide',
          'product': 'signal peptide',
          'score': 0.5,
          'Parent': 'EHICP_3230'
      })
      'ID=EHICP_3230_sigpep;Name=signal peptide;product=signal peptide;score=0.5;Parent=EHICP_3230'
    """
    annotation_strings = []
    for key, val in annotations.items():
        if(type(val) is list):
            if(len(val) >= 1):
                val = [encode_attribute(k) for k in val]
                annotation = f"{key}={','.join(val)}"
                annotation_strings.append(annotation)
        else:
            annotation_strings.append(f'{key}={encode_attribute(val)}')
    return ';'.join(annotation_strings)

encode_attribute(product)

Replace special characters forbidden in column 9 of the GFF3 format: https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md

Source code in src/baktfold/io/gff.py
def encode_attribute(product: str) -> str:
    """Replace special characters forbidden in column 9 of the GFF3 format: https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md"""
    product = str(product)
    product = product.replace('%', '%25')
    product = product.replace(';', '%3B')
    product = product.replace('=', '%3D')
    product = product.replace('&', '%26')
    product = product.replace(',', '%2C')
    return product

write_euk_cds_feature(fh, seq_id, feat)

Write a eukaryotic CDS feature to GFF3 with multiple CDS parts.

Parameters

fh : file-handle seq_id : str

dict-like feature with keys:

"start", "stop", "strand", "locus", "starts", "stops"

Source code in src/baktfold/io/gff.py
def write_euk_cds_feature(fh, seq_id, feat):
    """
    Write a eukaryotic CDS feature to GFF3 with multiple CDS parts.

    Parameters
    ----------
    fh : file-handle
    seq_id : str
    feat : dict-like feature with keys:
            "start", "stop", "strand", "locus", "starts", "stops"
    """

    strand = feat.get("strand", "+")
    locus = feat.get("locus", "unknown")

    transcript_id = f"{locus}-T1"
    cds_id = f"{transcript_id}.cds"

    starts = feat.get("starts")
    stops = feat.get("stops")

    # -------------------------------
    # 1. Determine CDS sub-coordinates
    # -------------------------------
    if (
        isinstance(starts, list)
        and isinstance(stops, list)
        and len(starts) == len(stops)
        and len(starts) > 0
    ):
        cds_coords = list(zip(starts, stops))
    else:
        cds_coords = [(feat["start"], feat["stop"])]

    # -------------------------------
    # 2. Reverse order for negative strand
    # -------------------------------
    if strand == "-":
        cds_coords.reverse()

    # -------------------------------
    # 3. Emit CDS lines with correct phase
    # -------------------------------
    offset = 0

    for i, (cds_start, cds_stop) in enumerate(cds_coords, start=1):

        length = cds_stop - cds_start + 1
        phase = offset % 3
        offset += length

        attr = f"ID={cds_id}-{i};Parent={transcript_id}"

        fh.write(
            f"{seq_id}\tbaktfold\tCDS\t{cds_start}\t{cds_stop}"
            f"\t.\t{strand}\t{phase}\t{attr}\n"
        )

write_euk_repeat_region_feature(fh, seq_id, feat)

Writes a repeat region feature to a file.

Parameters:

Name Type Description Default
fh file

The file handle to write to.

required
seq_id str

The sequence ID.

required
feat dict

A dictionary containing the feature information.

required

Returns:

Type Description

None

Examples:

>>> write_euk_repeat_region_feature(fh, 'DS572673.1', {
    "type": "repeat_region",
    "sequence": "DS571531.1",
    "start": 1470,
    "stop": 1716,
    "strand": "?",
    "family": "LINE2",
    "rpt_type": null,
    "repeat_unit": null,
    "product": null,
    "nt": "AATAAAATCATATCAGAAATAAAAAGAATGAAAATAAACAAATTAAAGAAAATAATTATAAAATTAATAAACGATATTTAAATGAAAGAAAATAGAGAATATGTAATAAGTACAAATGGTTCATTCATTAATAAGAAATTAACAATAATAAAATAGAGAATATTGATTATAAAAAGAAATATATTTCTCAAAACAGTAGAGATACAAAAAGAATAGATATGAAATAAATATTAATTCTAAAATACTC",
    "id": "EHICP_3230",
    "db_xrefs": [
        "SO:0000657"
    ]
})
Source code in src/baktfold/io/gff.py
def write_euk_repeat_region_feature(fh, seq_id, feat):
    """
    Writes a repeat region feature to a file.

    Args:
      fh (file): The file handle to write to.
      seq_id (str): The sequence ID.
      feat (dict): A dictionary containing the feature information.

    Returns:
      None

    Examples:
      >>> write_euk_repeat_region_feature(fh, 'DS572673.1', {
          "type": "repeat_region",
          "sequence": "DS571531.1",
          "start": 1470,
          "stop": 1716,
          "strand": "?",
          "family": "LINE2",
          "rpt_type": null,
          "repeat_unit": null,
          "product": null,
          "nt": "AATAAAATCATATCAGAAATAAAAAGAATGAAAATAAACAAATTAAAGAAAATAATTATAAAATTAATAAACGATATTTAAATGAAAGAAAATAGAGAATATGTAATAAGTACAAATGGTTCATTCATTAATAAGAAATTAACAATAATAAAATAGAGAATATTGATTATAAAAAGAAATATATTTCTCAAAACAGTAGAGATACAAAAAGAATAGATATGAAATAAATATTAATTCTAAAATACTC",
          "id": "EHICP_3230",
          "db_xrefs": [
              "SO:0000657"
          ]
      })
    """

    start = int(feat['start'])
    stop  = int(feat['stop'])
    strand = feat['strand']

    id = feat['sequence']

    attrs = {
        "ID": f"{id}:{start}..{stop}",
        "gbkey": "repeat_region"
    }

    if feat.get('family') is not None:
        attrs["rpt_family"] = feat.get('family')

    attr_str = ";".join(f"{k}={v}" for k, v in attrs.items())

    fh.write(f"{seq_id}\tbaktfold\trepeat_region\t{start}\t{stop}\t.\t{strand}\t.\t{attr_str}\n")

write_euk_trna_feature(fh, seq_id, feat)

Write a tRNA feature to GFF3 with a top-level line and single exon.

Parameters

file-like

Open file handle to write GFF lines.

str

Sequence/contig ID.

SeqFeature

Biopython SeqFeature object of type 'tRNA'.

Notes

  • Generates one tRNA line and one exon line.
  • Includes optional 'product' qualifier.
Source code in src/baktfold/io/gff.py
def write_euk_trna_feature(fh, seq_id, feat):
    """
    Write a tRNA feature to GFF3 with a top-level line and single exon.

    Parameters
    ----------
    fh : file-like
        Open file handle to write GFF lines.
    seq_id : str
        Sequence/contig ID.
    feat : SeqFeature
        Biopython SeqFeature object of type 'tRNA'.

    Notes
    -----
    - Generates one tRNA line and one exon line.
    - Includes optional 'product' qualifier.
    """
    start = int(feat['start'])
    stop  = int(feat['stop'])

    strand = feat['strand']

    locus = feat['locus']

    trna_id = f"{locus}-T1"

    # Top-level tRNA attributes
    attrs = {
        "ID": trna_id,
        "Parent": locus
    }

    attrs = {}

    product = feat.get("product", [])

    if product:

        key = "product"         
        if isinstance(product, list):
            if len(product) == 1:
                attrs[key] = str(product[0])
            else:
                attrs[key] = ",".join(str(v) for v in product)
        else:
            attrs[key] = str(product)


    attr_str = ";".join(f"{k}={v}" for k, v in attrs.items())

    # Write top-level tRNA line
    fh.write(f"{seq_id}\tbaktfold\ttRNA\t{start}\t{stop}\t.\t{strand}\t.\t{attr_str}\n")

    # Write exon line (tRNA single-exon)
    exon_id = f"{trna_id}.exßon1"
    exon_attrs = f"ID={exon_id};Parent={trna_id}"
    fh.write(f"{seq_id}\tbaktfold\texon\t{start}\t{stop}\t.\t{strand}\t.\t{exon_attrs}\n")

write_euk_utr_feature(fh, seq_id, feat, locus_counter, three=False)

Write a 'utr' feature.

Source code in src/baktfold/io/gff.py
def write_euk_utr_feature(fh, seq_id, feat, locus_counter, three=False):
    """Write a 'utr' feature."""
    start = int(feat['start'])
    stop  = int(feat['stop'])
    strand = feat['strand']

    locus = feat['locus']

    # Count occurrences for this locus
    count = locus_counter.get(locus, 0) + 1
    locus_counter[locus] = count

    # Construct ID with suffix -2, -3, etc.
    # For first entry we keep ID=locus (no -1)
    if count == 1:
        utr_id = locus
    else:
        utr_id = f"{locus}-{count}"

    # Top-level mRNA line
    attrs = {
        "ID": f"{utr_id}",
        "Parent": f"{locus}",
    }

# CAMXCT020000566.1	EMBL	three_prime_UTR	84568	84617	.	-	.	ID=id-C1SCF055_LOCUS8420;Parent=gene-C1SCF055_LOCUS8420;Note=ID:SCF055_s1507_g28601.utr3p1%3B~source:feature;gbkey=3'UTR;locus_tag=C1SCF055_LOCUS8420
# CAMXCT020000566.1	EMBL	five_prime_UTR	136251	136259	.	-	.	ID=id-C1SCF055_LOCUS8420-2;Parent=gene-C1SCF055_LOCUS8420;Note=ID:SCF055_s1507_g28601.utr5p1%3B~source:feature;gbkey=5'UTR;locus_tag=C1SCF055_LOCUS8420

    if feat.get('Note') is not None:
        attrs["Note"] = feat.get('note')


    attrs["gbkey"] = "3'UTR" if three else "5'UTR"

    if feat.get('Note') is not None:
        attrs["locus_tag"] = feat.get('locus')

    attr_str = ";".join(f"{k}={v}" for k, v in attrs.items())

    if three:
        gene_tag = 'three_prime_UTR'
    else:
        gene_tag = 'five_prime_UTR'

    fh.write(f"{seq_id}\tbaktfold\t{gene_tag}\t{start}\t{stop}\t.\t{strand}\t.\t{attr_str}\n")

write_features(data, features_by_sequence, gff3_path, prokka=False, euk=False, other_genbank=False, cds_tool='Prodigal:2.6', trna_program='tRNAscan-SE:2.0.12', tmrna_program='Aragorn', rrna_program='Infernal', ncrna_program='Infernal')

Export features in GFF3 format.

Source code in src/baktfold/io/gff.py
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
def write_features(data: dict, features_by_sequence: Dict[str, dict], gff3_path: Path, prokka: bool = False, euk: bool = False, other_genbank: bool = False, cds_tool: str = "Prodigal:2.6", trna_program: str = "tRNAscan-SE:2.0.12", tmrna_program: str = "Aragorn", rrna_program: str = "Infernal", ncrna_program: str = "Infernal"):
    """Export features in GFF3 format."""
    logger.info(f'write features: path={gff3_path}')

    with gff3_path.open('wt') as fh:
        fh.write('##gff-version 3\n')  # GFF version
        fh.write('##feature-ontology https://github.com/The-Sequence-Ontology/SO-Ontologies/blob/v3.1/so.obo\n')  # SO feature version

        if(data['genome'].get('taxon', None)):  # write organism info
            fh.write(f"# organism {data['genome']['taxon']}\n")

        fh.write('# Annotated with Baktfold\n')
        fh.write(f'# Software: v{cfg.version}\n')
        fh.write(f"# Database: v{cfg.version}\n") # fix later
        #fh.write(f"# Database: v{cfg.db_info['major']}.{cfg.db_info['minor']}, {cfg.db_info['type']}\n")
        fh.write(f'# DOI: {bc.BAKTFOLD_DOI}\n')
        fh.write(f'# URL: {bc.BAKTFOLD_URL}\n')

        for seq in data['sequences']:  # write features
            if euk:
                locus_counter = {} # for UTRs

            fh.write(f"##sequence-region {seq['id']} 1 {seq['length']}\n")  # sequence region

            # write landmark region
            annotations = {
                'ID': seq['id'],
                'Name': seq['id']
            }
            if(seq['topology'] == bc.TOPOLOGY_CIRCULAR):
                annotations['Is_circular'] = 'true'
            annotations = encode_annotations(annotations)
            fh.write(f"{seq['id']}\tBaktfold\tregion\t1\t{str(seq['length'])}\t.\t+\t.\t{annotations}\n")

            for feat in features_by_sequence[seq['id']]:
                seq_id = feat['sequence'] if 'sequence' in feat else feat['contig']  # <1.10.0 compatibility
                start = feat['start']
                stop = feat['stop']
                if('edge' in feat):
                    stop += seq['length']

                # euks
                if euk:
                    if(feat['type'] == bc.FEATURE_REPEAT):
                        write_euk_repeat_region_feature(fh, seq_id, feat)

                    if(feat['type'] == bc.FEATURE_5UTR or feat['type'] == bc.FEATURE_3UTR):
                        if feat['type'] == bc.FEATURE_3UTR:
                            write_euk_utr_feature(fh, seq_id, feat, locus_counter, three=True)
                        elif feat['type'] == bc.FEATURE_5UTR:
                            write_euk_utr_feature(fh, seq_id, feat, locus_counter, three=False)

                if(feat['type'] == bc.FEATURE_T_RNA):

                    if euk:
                        write_euk_trna_feature(fh, seq_id, feat)
                    else:
                        trna_tool = "tRNAscan-SE"
                        if prokka:
                            trna_tool = "Aragorn"
                        if other_genbank:
                            trna_tool = trna_program

                        annotations = {
                            'ID': feat['locus'],
                            'Name': feat['product'],
                            'locus_tag': feat['locus'],
                            'product': feat['product'],
                            'Dbxref': feat.get('db_xrefs', [])
                        }
                        if(feat.get('gene', None)):  # add gene annotation if available
                            annotations['gene'] = feat['gene']
                        if(bc.PSEUDOGENE in feat):
                            annotations[bc.INSDC_FEATURE_PSEUDOGENE] = bc.INSDC_FEATURE_PSEUDOGENE_TYPE_UNKNOWN
                        elif('truncated' in feat):
                            annotations[bc.INSDC_FEATURE_PSEUDO] = True
                        if(feat.get('anti_codon', False)):
                            annotations['anti_codon'] = feat['anti_codon']
                        if(feat.get('amino_acid', False)):
                            annotations['amino_acid'] = feat['amino_acid']
                        if(cfg.compliant):
                            gene_id = f"{feat['locus']}_gene"
                            annotations['Parent'] = gene_id
                            annotations['inference'] = 'profile:tRNAscan:2.0'
                            annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs'])  # remove INSDC invalid DbXrefs
                            gene_annotations = {
                                'ID': gene_id,
                                'locus_tag': feat['locus']
                            }
                            if(feat.get('gene', None)):
                                gene_annotations['gene'] = feat['gene']
                            if(bc.PSEUDOGENE in feat):
                                gene_annotations[bc.INSDC_FEATURE_PSEUDOGENE] = bc.INSDC_FEATURE_PSEUDOGENE_TYPE_UNKNOWN
                            gene_annotations = encode_annotations(gene_annotations)
                            fh.write(f"{seq_id}\t{trna_tool}\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                        annotations = encode_annotations(annotations)
                        fh.write(f"{seq_id}\t{trna_tool}\t{so.SO_TRNA.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                elif(feat['type'] == bc.FEATURE_TM_RNA):
                    # both prokka and bakta use Aragorn

                    tmrna_tool = "Aragorn"
                    if other_genbank:
                        tmrna_tool = tmrna_program

                    annotations = {
                        'ID': feat['locus'],
                        'Name': feat['product'],
                        'locus_tag': feat['locus'],
                        'gene': feat.get('gene', []),
                        'product': feat['product'],
                        'Dbxref': feat.get('db_xrefs', [])
                    }
                    if('tag' in feat):
                        annotations['tag_peptide'] = feat['tag']['aa']
                    if('truncated' in feat):
                        annotations[bc.INSDC_FEATURE_PSEUDO] = True
                    if(cfg.compliant):
                        gene_id = f"{feat['locus']}_gene"
                        annotations['Parent'] = gene_id
                        annotations['inference'] = 'profile:aragorn:1.2'
                        annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs'])  # remove INSDC invalid DbXrefs
                        if('tag' in feat):
                            annotations['tag_peptide'] = f"{feat['tag']['start']}..{feat['tag']['stop']}" if feat['strand'] == bc.STRAND_FORWARD else f"complement({feat['tag']['start']}..{feat['tag']['stop']})"
                        gene_annotations = {
                            'ID': gene_id,
                            'locus_tag': feat['locus'],
                            'gene': feat['gene']
                        }
                        if('truncated' in feat):
                            gene_annotations[bc.INSDC_FEATURE_PSEUDO] = True
                        gene_annotations = encode_annotations(gene_annotations)
                        fh.write(f"{seq_id}\tAragorn\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                    annotations = encode_annotations(annotations)
                    fh.write(f"{seq_id}\tAragorn\t{so.SO_TMRNA.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                elif(feat['type'] == bc.FEATURE_R_RNA):
                    rrna_tool = "Infernal"
                    if prokka:
                        rrna_tool = "barrnap"
                    if other_genbank:
                        rrna_tool = rrna_program

                    annotations = {
                        'ID': feat['locus'],
                        'Name': feat['product'],
                        'locus_tag': feat['locus'],
                        'gene': feat.get('gene', []),
                        'product': feat['product'],
                        'Dbxref': feat.get('db_xrefs', [])
                    }
                    if('truncated' in feat):
                        annotations[bc.INSDC_FEATURE_PSEUDO] = True
                    if(cfg.compliant):
                        gene_id = f"{feat['locus']}_gene"
                        annotations['Parent'] = gene_id
                        annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs'])  # remove INSDC invalid DbXrefs
                        for rfam_id in [dbxref.split(':')[1] for dbxref in feat['db_xrefs'] if dbxref.split(':')[0] == bc.DB_XREF_RFAM]:
                            annotations['inference'] = f'profile:Rfam:{rfam_id}'
                        gene_annotations = {
                            'ID': gene_id,
                            'locus_tag': feat['locus'],
                            'gene': feat['gene']
                        }
                        if('truncated' in feat):
                            gene_annotations[bc.INSDC_FEATURE_PSEUDO] = True
                        gene_annotations = encode_annotations(gene_annotations)
                        fh.write(f"{seq_id}\t{rrna_tool}\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                    annotations = encode_annotations(annotations)
                    if other_genbank or prokka:
                        fh.write(f"{seq_id}\t{rrna_tool}\t{so.SO_RRNA.name}\t{start}\t{stop}\t{feat['strand']}\t.\t{annotations}\n")
                    else:
                        fh.write(f"{seq_id}\t{rrna_tool}\t{so.SO_RRNA.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n")
                elif(feat['type'] == bc.FEATURE_NC_RNA):
                    # both prokka and bakta use infernal for ncrna
                    ncrna_tool = "Infernal"
                    if other_genbank:
                        ncrna_tool = ncrna_program

                    annotations = {
                        'ID': feat['locus'],
                        'Name': feat['product'],
                        'locus_tag': feat['locus'],
                        'gene': feat.get('gene', []),
                        'product': feat['product'],
                        'Dbxref': feat.get('db_xrefs', [])
                    }
                    if('truncated' in feat):
                        annotations[bc.INSDC_FEATURE_PSEUDO] = True
                    if(cfg.compliant):
                        gene_id = f"{feat['locus']}_gene"
                        annotations['Parent'] = gene_id
                        annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs'])  # remove INSDC invalid DbXrefs
                        annotations[bc.INSDC_FEATURE_NC_RNA_CLASS] = insdc.select_ncrna_class(feat)
                        for rfam_id in [dbxref.split(':')[1] for dbxref in feat['db_xrefs'] if dbxref.split(':')[0] == bc.DB_XREF_RFAM]:
                            annotations['inference'] = f'profile:Rfam:{rfam_id}'
                        gene_annotations = {
                            'ID': gene_id,
                            'locus_tag': feat['locus'],
                            'gene': feat['gene']
                        }
                        if(ba.RE_GENE_SYMBOL.fullmatch(feat['gene'])):  # discard non-standard ncRNA gene symbols
                            gene_annotations['gene'] = feat['gene']
                        else:
                            annotations.pop('gene', None)
                        if('truncated' in feat):
                            gene_annotations[bc.INSDC_FEATURE_PSEUDO] = True
                        gene_annotations = encode_annotations(gene_annotations)
                        fh.write(f"{seq_id}\t{ncrna_tool}\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                    annotations = encode_annotations(annotations)
                    if other_genbank or prokka:
                        fh.write(f"{seq_id}\t{ncrna_tool}\t{so.SO_NCRNA_GENE.name}\t{start}\t{stop}\t{feat['strand']}\t.\t{annotations}\n")
                    else:
                        fh.write(f"{seq_id}\t{ncrna_tool}\t{so.SO_NCRNA_GENE.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n")
                elif(feat['type'] == bc.FEATURE_NC_RNA_REGION):
                    annotations = {
                        'ID': feat['id'],
                        'Name': feat['product'],
                        'product': feat['product'],
                        'Dbxref': feat.get('db_xrefs', [])
                    }
                    if('truncated' in feat):
                        annotations[bc.INSDC_FEATURE_PSEUDO] = True
                    if(cfg.compliant):
                        for rfam_id in [dbxref.split(':')[1] for dbxref in feat['db_xrefs'] if dbxref.split(':')[0] == bc.DB_XREF_RFAM]:
                            annotations['inference'] = f'profile:Rfam:{rfam_id}'
                        annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs'])  # remove INSDC invalid DbXrefs
                        annotations[bc.INSDC_FEATURE_REGULATORY_CLASS] = insdc.select_regulatory_class(feat)
                    annotations = encode_annotations(annotations)
                    fh.write(f"{seq_id}\tInfernal\t{so.SO_REGULATORY_REGION.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n")
                elif(feat['type'] == bc.FEATURE_CRISPR):
                    crispr_tool = "PILER-CR"
                    if prokka:
                        crispr_tool = "MinCED"
                    annotations = {
                        'ID': feat['id'],
                        'Name': feat['product'],
                        'product': feat['product']
                    }
                    feat_type = so.SO_CRISPR.name
                    if(cfg.compliant):
                        feat_type = bc.INSDC_FEATURE_REPEAT_REGION
                        annotations['inference'] = 'COORDINATES:alignment:pilercr:1.02'
                        annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs'])  # remove INSDC invalid DbXrefs
                        annotations[bc.INSDC_FEATURE_REPEAT_FAMILY] = 'CRISPR'
                        annotations[bc.INSDC_FEATURE_REPEAT_TYPE] = 'direct'
                        annotations[bc.INSDC_FEATURE_REPEAT_UNIT_SEQ] = feat['repeat_consensus']
                    annotations = encode_annotations(annotations)
                    fh.write(f"{seq_id}\t{crispr_tool}\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                    if(not cfg.compliant):
                        i = 0
                        # spacers and repeats wont exist if Prokka input
                        spacers = feat.get('spacers', [])
                        repeat = feat.get('repeat', [])
                        if len(spacers) > 0 and len(repeat) > 0: 
                            while i < len(feat['spacers']):
                                repeat = feat['repeats'][i]
                                annotations = {
                                    'ID': f"{feat['id']}_repeat_{i+1}",
                                    'Parent': feat['id']
                                }
                                annotations = encode_annotations(annotations)
                                # will always be PILER here as prokka won't have any
                                fh.write(f"{seq_id}\tPILER-CR\t{bc.FEATURE_CRISPR_REPEAT}\t{repeat['start']}\t{repeat['stop']}\t.\t{repeat['strand']}\t.\t{annotations}\n")
                                spacer = feat['spacers'][i]
                                annotations = {
                                    'ID': f"{feat['id']}_spacer_{i+1}",
                                    'Parent': feat['id'],
                                    'sequence': spacer['sequence']
                                }
                                annotations = encode_annotations(annotations)
                                fh.write(f"{seq_id}\tPILER-CR\t{bc.FEATURE_CRISPR_SPACER}\t{spacer['start']}\t{spacer['stop']}\t.\t{spacer['strand']}\t.\t{annotations}\n")
                                i += 1
                            if(len(feat['repeats']) - 1 == i):
                                repeat = feat['repeats'][i]
                                annotations = { 'ID': f"{feat['id']}_repeat_{i+1}" }
                                annotations = encode_annotations(annotations)
                                fh.write(f"{seq_id}\tPILER-CR\t{bc.FEATURE_CRISPR_REPEAT}\t{repeat['start']}\t{repeat['stop']}\t.\t{repeat['strand']}\t.\t{annotations}\n")
                elif feat['type'] == bc.FEATURE_CDS:
                    if euk:
                        write_euk_cds_feature(fh, seq_id, feat)
                    else:
                        annotations = {
                            'ID': feat.get('locus'),
                            'Name': feat.get('product'),
                            'locus_tag': feat.get('locus'),
                            'product': feat.get('product'),
                            'Dbxref': feat.get('db_xrefs', [])  # default to empty list if db_xrefs doesn't exist
                        }
                        if(bc.PSEUDOGENE in feat):
                            annotations[bc.INSDC_FEATURE_PSEUDOGENE] = bc.INSDC_FEATURE_PSEUDOGENE_TYPE_UNPROCESSED if feat[bc.PSEUDOGENE]['paralog'] else bc.INSDC_FEATURE_PSEUDOGENE_TYPE_UNITARY
                        elif('truncated' in feat):
                            annotations[bc.INSDC_FEATURE_PSEUDO] = True
                        if(feat.get('gene', None)):  # add gene annotation if available
                            annotations['gene'] = feat['gene']
                        source = '?' if feat.get('source', None) == bc.CDS_SOURCE_USER else 'Pyrodigal'
                        if prokka: 
                            source = 'Prodigal'

                        if other_genbank:
                            source = cds_tool

                        if(cfg.compliant):
                            gene_id = f"{feat['locus']}_gene"
                            annotations['Parent'] = gene_id
                            annotations['inference'] = 'EXISTENCE:non-experimental evidence, no additional details recorded' if feat.get('source', None) == bc.CDS_SOURCE_USER else 'ab initio prediction:Pyrodigal:3.5'
                            annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs'])  # remove INSDC invalid DbXrefs
                            annotations['Note'], ec_number = insdc.extract_ec_from_notes_insdc(annotations, 'Note')
                            if(ec_number is not None):
                                annotations['ec_number'] = ec_number
                            gene_annotations = {
                                'ID': gene_id,
                                'locus_tag': feat['locus']
                            }
                            if(feat.get('gene', None)):
                                gene_annotations['gene'] = feat['gene']
                            if(bc.PSEUDOGENE in feat):
                                gene_annotations[bc.INSDC_FEATURE_PSEUDOGENE] = bc.INSDC_FEATURE_PSEUDOGENE_TYPE_UNPROCESSED if feat[bc.PSEUDOGENE]['paralog'] else bc.INSDC_FEATURE_PSEUDOGENE_TYPE_UNITARY
                            gene_annotations = encode_annotations(gene_annotations)
                            fh.write(f"{seq_id}\t{source}\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                        if('exception' in feat):
                            ex = feat['exception']
                            pos = f"{ex['start']}..{ex['stop']}"
                            if(feat['strand'] == bc.STRAND_REVERSE):
                                pos = f"complement({pos})"
                            annotations['transl_except']=f"(pos:{pos},aa:{ex['aa']})"
                            notes = annotations.get('Note', [])
                            notes.append(f"codon on position {ex['codon_position']} is a {ex['type']} codon")
                            if('Notes' not in annotations):
                                annotations['Note'] = notes
                        annotations = encode_annotations(annotations)
                        fh.write(f"{seq_id}\t{source}\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n")
                        if(bc.FEATURE_SIGNAL_PEPTIDE in feat):
                            write_signal_peptide(fh, feat)
                elif(feat['type'] == bc.FEATURE_SORF):
                    annotations = {
                        'ID': feat['locus'],
                        'Name': feat['product'],
                        'locus_tag': feat['locus'],
                        'product': feat['product'],
                        'Dbxref': feat.get('db_xrefs', [])
                    }
                    if(feat.get('gene', None)):  # add gene annotation if available
                        annotations['gene'] = feat['gene']
                    if(cfg.compliant):
                        gene_id = f"{feat['locus']}_gene"
                        annotations['Parent'] = gene_id
                        annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs'])  # remove INSDC invalid DbXrefs
                        annotations['Note'], ec_number = insdc.extract_ec_from_notes_insdc(annotations, 'Note')
                        if(ec_number is not None):
                            annotations['ec_number'] = ec_number
                        gene_annotations = {
                            'ID': gene_id,
                            'locus_tag': feat['locus'],
                            'inference': 'ab initio prediction:Bakta'
                        }
                        if(feat.get('gene', None)):
                            gene_annotations['gene'] = feat['gene']
                        gene_annotations = encode_annotations(gene_annotations)
                        fh.write(f"{seq_id}\tBakta\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                    annotations = encode_annotations(annotations)
                    fh.write(f"{seq_id}\tBakta\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n")
                    if(bc.FEATURE_SIGNAL_PEPTIDE in feat):
                        write_signal_peptide(fh, feat)
                elif(feat['type'] == bc.FEATURE_GAP):
                    gap_tool="Bakta"
                    if prokka:
                        gap_tool="Prokka"
                    annotations = {
                        'ID': feat['id'],
                        'Name': f"gap ({feat['length']} bp)",
                        'product': f"gap ({feat['length']} bp)"
                    }
                    annotations = encode_annotations(annotations)
                    fh.write(f"{seq_id}\t{gap_tool}\t{so.SO_GAP.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                elif(feat['type'] == bc.FEATURE_ORIC):
                    annotations = {
                        'ID': feat['id'],
                        'Name': feat['product']
                    }
                    if(cfg.compliant):
                        annotations['Note'] = feat['product']
                    else:
                        annotations['product'] = feat['product']
                        annotations['inference'] = 'similar to DNA sequence'
                    annotations = encode_annotations(annotations)
                    feat_type = bc.INSDC_FEATURE_ORIGIN_REPLICATION if cfg.compliant else so.SO_ORIC.name
                    fh.write(f"{seq_id}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                elif(feat['type'] == bc.FEATURE_ORIV):
                    annotations = {
                        'ID': feat['id'],
                        'Name': feat['product']
                    }
                    if(cfg.compliant):
                        annotations['Note'] = feat['product']
                    else:
                        annotations['product'] = feat['product']
                        annotations['inference'] = 'similar to DNA sequence'
                    annotations = encode_annotations(annotations)
                    feat_type = bc.INSDC_FEATURE_ORIGIN_REPLICATION if cfg.compliant else so.SO_ORIC.name
                    fh.write(f"{seq_id}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                elif(feat['type'] == bc.FEATURE_ORIT):
                    annotations = {
                        'ID': feat['id'],
                        'Name': feat['product']
                    }
                    if(cfg.compliant):
                        annotations['Note'] = feat['product']
                    else:
                        annotations['product'] = feat['product']
                        annotations['inference'] = 'similar to DNA sequence'
                    annotations = encode_annotations(annotations)
                    feat_type = bc.INSDC_FEATURE_ORIGIN_TRANSFER if cfg.compliant else so.SO_ORIT.name
                    fh.write(f"{seq_id}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                elif(feat['type'] == bc.FEATURE_GENE):
                    write_gene_feature(fh, seq_id, feat)
                elif(feat['type'] == bc.FEATURE_MRNA):
                    write_mrna_feature(fh, seq_id, feat)

        if(not cfg.compliant):
            fh.write('##FASTA\n')
            for seq in data['sequences']:  # write sequences
                fh.write(f">{seq['id']}\n")
                seq_nt = seq['nt'] if 'nt' in seq else seq['sequence']  # <1.10.0 compatibility
                fh.write(fasta.wrap_sequence(seq_nt))
    return

write_gene_feature(fh, seq_id, feat)

Write a 'gene' feature including fuzzy boundaries.

Source code in src/baktfold/io/gff.py
def write_gene_feature(fh, seq_id, feat):
    """Write a 'gene' feature including fuzzy boundaries."""
    start = int(feat['start'])
    stop  = int(feat['stop'])
    strand = feat['strand']

    # fall back if there is no locus tag
    locus = feat.get('locus') or f"{seq_id}_{start}_{stop}_{strand}"

    attrs = {
        "ID": f"{locus}"
    }

    if feat.get('gene') is not None:
        attrs["Name"] = feat.get('gene')

    attr_str = ";".join(f"{k}={v}" for k, v in attrs.items())

    fh.write(f"{seq_id}\tbaktfold\tgene\t{start}\t{stop}\t.\t{strand}\t.\t{attr_str}\n")

write_mrna_feature(fh, seq_id, feat)

Write mRNA + implied exons based on join() structure.

Source code in src/baktfold/io/gff.py
def write_mrna_feature(fh, seq_id, feat):
    """Write mRNA + implied exons based on join() structure."""

    start = int(feat['start'])
    stop  = int(feat['stop'])
    strand = feat['strand']

    # fall back if there is no locus tag
    locus = feat.get('locus') or f"{seq_id}_{start}_{stop}_{strand}"

    mrna_id = f"{locus}-T1"

    # Top-level mRNA line
    attrs = {
        "ID": mrna_id,
        "Parent": f"{locus}",
    }

    product = feat.get("product", [])

    if product:

        key = "product"         
        if isinstance(product, list):
            if len(product) == 1:
                attrs[key] = str(product[0])
            else:
                attrs[key] = ",".join(str(v) for v in product)
        else:
            attrs[key] = str(product)


    # Ensure db_xrefs exists and is a list
    db_xrefs = feat.get("db_xrefs", [])

    # Access note safely
    note = feat.get("note", None)


    if db_xrefs:

        key = "Dbxref"         
        if isinstance(db_xrefs, list):
            if len(db_xrefs) == 1:
                attrs[key] = str(db_xrefs[0])
            else:
                attrs[key] = ",".join(str(v) for v in db_xrefs)
        else:
            # if somehow not a list, just convert to string
            attrs[key] = str(db_xrefs)

    if note:

        key = "note"         # <-- you must define this
        if isinstance(db_xrefs, list):
            if len(db_xrefs) == 1:
                attrs[key] = str(db_xrefs[0])
            else:
                attrs[key] = ",".join(str(v) for v in db_xrefs)
        else:
            # if somehow not a list, just convert to string
            attrs[key] = str(db_xrefs)


    attr_str = ";".join(f"{k}={v}" for k, v in attrs.items())

    fh.write(f"{seq_id}\tbaktfold\tmRNA\t{start}\t{stop}\t.\t{strand}\t.\t{attr_str}\n")

    starts = feat.get("starts")
    stops  = feat.get("stops")
    strand = feat.get("strand")
    seq_id = feat.get("sequence")

    if (
        isinstance(starts, list)
        and isinstance(stops, list)
        and len(starts) == len(stops)
        and len(starts) > 0
    ):
        # For minus strand, exons must be written in reverse order (5'→3')
        if strand == "-":
            exon_parts = list(zip(starts, stops))
        else:
            exon_parts = list(zip(starts, stops))

        # Exons must be numbered in biological order (5' to 3')
        if strand == "-":
            exon_parts = exon_parts[::-1]   # reverse order

        # Write each exon to GFF
        for idx, (ex_start, ex_stop) in enumerate(exon_parts, start=1):
            exon_id = f"{mrna_id}.exon{idx}"
            exon_attrs = f"ID={exon_id};Parent={mrna_id}"
            fh.write(
                f"{seq_id}\tbaktfold\texon\t{ex_start}\t{ex_stop}\t.\t{strand}\t.\t{exon_attrs}\n"
            )
    else:
        # Single exon (no starts/stops provided)
        exon_start = feat["start"]
        exon_stop = feat["stop"]
        exon_id = f"{mrna_id}.exon1"
        exon_attrs = f"ID={exon_id};Parent={mrna_id}"

        fh.write(
            f"{seq_id}\tbaktfold\texon\t{exon_start}\t{exon_stop}"
            f"\t.\t{feat['strand']}\t.\t{exon_attrs}\n"
        )

write_signal_peptide(fh, feat)

Writes a signal peptide feature to a file.

Parameters:

Name Type Description Default
fh file

The file handle to write to.

required
feat dict

A dictionary containing the feature information.

required

Returns:

Type Description

None

Examples:

>>> write_signal_peptide(fh, {
    'locus': 'EHICP_3230',
    'sequence': 'DS571531.1',
    'strand': '+',
    'signal_peptide': {
        'start': 1,
        'stop': 20,
        'score': 0.5
    }
})
Source code in src/baktfold/io/gff.py
def write_signal_peptide(fh, feat: dict):  # <1.10.0 compatibility
    """
    Writes a signal peptide feature to a file.

    Args:
      fh (file): The file handle to write to.
      feat (dict): A dictionary containing the feature information.

    Returns:
      None

    Examples:
      >>> write_signal_peptide(fh, {
          'locus': 'EHICP_3230',
          'sequence': 'DS571531.1',
          'strand': '+',
          'signal_peptide': {
              'start': 1,
              'stop': 20,
              'score': 0.5
          }
      })
    """
    sig_peptide = feat[bc.FEATURE_SIGNAL_PEPTIDE]
    annotations = {
        'ID': f"{feat['locus']}_sigpep",
        'Name': 'signal peptide',
        'product': 'signal peptide',
        'score': sig_peptide['score'],
        'Parent': feat['locus']
    }
    annotations = encode_annotations(annotations)
    seq_id = feat['sequence'] if 'sequence' in feat else feat['contig']  # <1.10.0 compatibility
    fh.write(f"{seq_id}\tDeepSig\t{so.SO_SIGNAL_PEPTIDE.name}\t{sig_peptide['start']}\t{sig_peptide['stop']}\t{sig_peptide['score']:.2f}\t{feat['strand']}\t.\t{annotations}\n")

export_sequences(sequences, fasta_path, description=False, wrap=False)

Write sequences to Fasta file.

Source code in src/baktfold/io/fasta.py
def export_sequences(sequences: Sequence[dict], fasta_path: Path, description: bool=False, wrap: bool=False):
    """Write sequences to Fasta file."""
    logger.info(f'write genome sequences: path={fasta_path}, description={description}, wrap={wrap}')

    with fasta_path.open('wt') as fh:
        for seq in sequences:
            if(description):
                fh.write(f">{seq['id']} {seq['description']}\n")
            else:
                fh.write(f">{seq['id']}\n")
            if(wrap):
                fh.write(wrap_sequence(seq['nt'] if 'nt' in seq else seq['sequence']))  # <1.10.0 compatibility
            else:
                fh.write(seq['nt'])
                fh.write('\n')

import_sequences(sequences_path, is_genomic=True, is_dna=True)

Import raw sequences from Fasta file.

Source code in src/baktfold/io/fasta.py
def import_sequences(sequences_path: Path, is_genomic: bool=True, is_dna: bool=True) -> Sequence[dict]:
    """Import raw sequences from Fasta file."""
    sequences = []
    with xopen(str(sequences_path), threads=0) as fh:
        for record in SeqIO.parse(fh, 'fasta'):

            rid = record.id

            if "~PIPE~" in rid:
                logger.error(
                    f"Your proteins FASTA header has ~PIPE~ in the header"
                    "Please remove all instances of ~PIPE~ before running Baktfold as this creates downstream issues with Foldseek"
                )
            else:
                rid = rid.replace("|", "~PIPE~")

            sequence = {
                'id': rid,
                'description': record.description.split(' ', maxsplit=1)[1] if ' ' in record.description else ''
            }

            raw_sequence = str(record.seq).upper()
            if('-' in raw_sequence):
                dash_count = raw_sequence.count('-')
                raw_sequence = raw_sequence.replace('-', '')
                logger.info('import: Discarded alignment gaps (dashes): id=%s, occurences=%i', record.id, dash_count)
            if(is_dna):
                if(FASTA_DNA_SEQUENCE_PATTERN.fullmatch(raw_sequence) is None):
                    logger.error('import: Fasta sequence contains invalid DNA characters! id=%s', record.id)
                    raise ValueError(f'Fasta sequence contains invalid DNA characters! id={record.id}')
                sequence['nt'] = raw_sequence
            else:
                if(raw_sequence[-1] == '*'):  # remove trailing stop asterik
                    raw_sequence = raw_sequence[:-1]
                    logger.warning('import: Removed trailing asterik! id=%s, seq=%s', record.id, raw_sequence)
                if(FASTA_AA_SEQUENCE_PATTERN.fullmatch(raw_sequence) is None):
                    logger.error('import: Fasta sequence contains invalid AA characters! id=%s, seq=%s', record.id, raw_sequence)
                    raise ValueError(f'Fasta sequence contains invalid AA characters! id={record.id}')
                sequence['aa'] = raw_sequence
            sequence['length'] = len(raw_sequence)
            if(is_genomic):
                sequence['complete'] = False
                sequence['type'] = bc.REPLICON_CONTIG
                sequence['topology'] = bc.TOPOLOGY_LINEAR
            logger.info(
                f"imported: id={record.id}, length={sequence['length']}, description={sequence['description']}, genomic={is_genomic}, dna={is_dna}"
            )   
            sequences.append(sequence)
    return sequences

wrap_sequence(sequence)

Wraps a sequence into lines of 60 characters.

Parameters:

Name Type Description Default
sequence str

The sequence to wrap.

required

Returns:

Name Type Description
str

The wrapped sequence.

Examples:

>>> wrap_sequence('ARNDCQEGHILKMFPOSUTWYVBZXJ')
'ARNDCQEGHILKMFPOSUTWYVBZXJ\n'
Notes

This function is used to format sequences in FASTA files.

Source code in src/baktfold/io/fasta.py
def wrap_sequence(sequence: str):
    """
    Wraps a sequence into lines of 60 characters.

    Args:
      sequence (str): The sequence to wrap.

    Returns:
      str: The wrapped sequence.

    Examples:
      >>> wrap_sequence('ARNDCQEGHILKMFPOSUTWYVBZXJ')
      'ARNDCQEGHILKMFPOSUTWYVBZXJ\\n'

    Notes:
      This function is used to format sequences in FASTA files.
    """
    lines = []
    for i in range(0, len(sequence), FASTA_LINE_WRAPPING):
        lines.append(sequence[i:i + FASTA_LINE_WRAPPING])
    return '\n'.join(lines) + '\n'

write_faa(features, faa_path)

Write translated CDS sequences to Fasta file.

Source code in src/baktfold/io/fasta.py
def write_faa(features: Sequence[dict], faa_path: Path):
    """Write translated CDS sequences to Fasta file."""
    logger.info(f'write translated CDS/sORF: path={faa_path}')
    with faa_path.open('wt') as fh:
        for feat in features:
            if(feat['type'] == bc.FEATURE_CDS or feat['type'] == bc.FEATURE_SORF):
                fh.write(f">{feat['locus']} {feat['product']}\n{feat['aa']}\n")

write_ffn(features, ffn_path)

Write annotated nucleotide sequences to Fasta file.

Source code in src/baktfold/io/fasta.py
def write_ffn(features: Sequence[dict], ffn_path: Path):
    """Write annotated nucleotide sequences to Fasta file."""
    logger.info(f'write feature nucleotide sequences: path={ffn_path}')
    with ffn_path.open('wt') as fh:
        for feat in features:
            if(feat['type'] in [bc.FEATURE_T_RNA, bc.FEATURE_TM_RNA, bc.FEATURE_R_RNA, bc.FEATURE_NC_RNA, bc.FEATURE_NC_RNA_REGION, bc.FEATURE_CRISPR, bc.FEATURE_CDS, bc.FEATURE_SORF, bc.FEATURE_ORIC, bc.FEATURE_ORIV, bc.FEATURE_ORIT]):
                identifier = feat['locus'] if 'locus' in feat else feat['id']
                if(feat.get('product', '') != ''):
                    fh.write(f">{identifier} {feat['product']}\n{feat['nt']}\n")
                else:
                    fh.write(f">{identifier}\n{feat['nt']}\n")