BOL: Python script to find all possible repeats in a DNA string!

BioScripts
LEGE
Python script to find all possible repeats in a DNA string!

Python script to find all possible repeats in a DNA string!

By LEGE 131 days ago

from collections import defaultdict

def find_repeats_in_genome(genome, min_length=2, max_length=None):
    """
    Find every substring of a genome that occurs more than once.

    Every window of each length from ``min_length`` to ``max_length``
    (inclusive) is examined. Occurrences may overlap, e.g. ``"AA"`` is
    found at positions 0, 1 and 2 of ``"AAAA"``.

    Parameters:
        genome (str): The genome sequence.
        min_length (int): Minimum length of repeats to scan for (default: 2).
            Values below 1 are treated as 1, because an empty window is not
            a meaningful repeat.
        max_length (int): Maximum length of repeats to scan for (default:
            None, meaning the entire genome). Values larger than the genome
            are treated as the genome length.

    Returns:
        dict: A ``defaultdict(list)`` whose keys are the repeating sequences
        and whose values are the 0-based starting positions of every
        occurrence, in ascending order. Keys are ordered by length, then by
        first occurrence in the genome.
    """
    if max_length is None:
        max_length = len(genome)

    # A zero or negative window length would record the empty string (or
    # nonsensical slices) as a "repeat", so never scan below length 1.
    shortest = max(min_length, 1)
    # No window can be longer than the genome itself.
    longest = min(max_length, len(genome))

    repeats = defaultdict(list)

    # Iterate over all candidate repeat lengths, shortest first.
    for length in range(shortest, longest + 1):
        seen = defaultdict(list)  # substring of this length -> start positions

        # Sliding window over every start index that leaves room for `length`.
        for start in range(len(genome) - length + 1):
            seen[genome[start:start + length]].append(start)

        # Keep only substrings that appear more than once. Each key has a
        # single length, so it is recorded at most once across iterations.
        found_any = False
        for substring, positions in seen.items():
            if len(positions) > 1:
                repeats[substring] = positions
                found_any = True

        # Any repeat of length L + 1 contains a repeat of length L (its
        # prefix), so once a length yields nothing, longer lengths cannot
        # yield anything either.
        if not found_any:
            break

    return repeats

# Example usage
def main():
    """Run the repeat finder on a small sample sequence and print each repeat."""
    sample_genome = "ATCGATCGAATTCGATCG"  # Demonstration sequence

    # Restrict the scan to repeats between 2 and 5 bases long.
    found = find_repeats_in_genome(sample_genome, 2, 5)

    print("Repeating sequences:")
    for seq in found:
        positions = found[seq]
        print(f"Sequence: {seq}, Positions: {positions}")


if __name__ == "__main__":
    main()

BOL

LEGE

Our Sponsors

Python script to find all possible repeats in a DNA string!