Python script to split a DNA sequence into words of varying lengths

LEGE — Thu, 02 Jan 2025 11:31:22 -0600

# Script to split a DNA sequence into words of varying lengths
def split_dna_into_words(dna_sequence, min_length, max_length):
    """
    Collect every overlapping window of a DNA sequence, grouped by window size.

    For each size from min_length through max_length (inclusive) a window of
    that size is slid across the sequence one position at a time, and every
    slice is recorded in order of its starting position. A size larger than
    the sequence itself yields an empty list.

    Validation is case-insensitive (both "a" and "A" are accepted), while the
    returned words keep the casing of the input.

    Parameters:
        dna_sequence (str): The sequence to slice (e.g., "ATGCGTAC").
        min_length (int): Smallest window size to generate.
        max_length (int): Largest window size to generate.

    Returns:
        dict: Maps each window size to the list of words of that size.

    Raises:
        ValueError: If the sequence is empty, if either length is not
            positive, if min_length exceeds max_length, or if the sequence
            contains a character other than A, T, C or G.
    """
    # Guard clauses: reject unusable arguments before doing any work.
    if not dna_sequence:
        raise ValueError("The DNA sequence cannot be empty.")
    if min_length <= 0 or max_length <= 0:
        raise ValueError("Word lengths must be positive integers.")
    if min_length > max_length:
        raise ValueError("Minimum length cannot be greater than maximum length.")

    # Report the first offending character, exactly as it appears in the input.
    offender = next(
        (base for base in dna_sequence if base.upper() not in "ATCG"),
        None,
    )
    if offender is not None:
        raise ValueError(f"Invalid character '{offender}' found in DNA sequence.")

    # One entry per window size; each entry lists the windows left to right.
    return {
        size: [
            dna_sequence[start:start + size]
            for start in range(len(dna_sequence) - size + 1)
        ]
        for size in range(min_length, max_length + 1)
    }

# Example usage
def main():
    """Run the splitter on a sample sequence and print the words for each length.

    Any ValueError raised while splitting or printing is reported on standard
    output as "Error: <message>" instead of propagating.
    """
    sample = "ATGCGTACGCTAATGCGTACGCTAATGCGTACGCTAATGCGTACGCTAATGCGTACGCTAATGCGTACGCTAATGCGTACGCTAATGCGTACGCTAATGCGTACGCTAATGCGTACGCTA"
    shortest, longest = 3, 99

    try:
        grouped = split_dna_into_words(sample, shortest, longest)
        # Dict iteration follows insertion order, i.e. ascending word length.
        for size in grouped:
            print(f"Words of length {size}:", grouped[size])
    except ValueError as err:
        print("Error:", err)

# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

BOL: Python script to split a DNA sequence into words of varying lengths

Python script to split a DNA sequence into words of varying lengths