from collections import defaultdict
def find_repeats_in_genome(genome, min_length=2, max_length=None):
    """
    Find every substring of a genome that occurs more than once.

    Occurrences are allowed to overlap: in "AAAA" the substring "AA" is
    reported at positions 0, 1 and 2.

    Parameters:
    genome (str): The genome sequence.
    min_length (int): Minimum length of repeats to scan for (default: 2).
        Must be at least 1.
    max_length (int): Maximum length of repeats to scan for (default: None,
        meaning entire genome). Values larger than the genome length are
        clamped to the genome length.

    Returns:
    dict: A dictionary where keys are repeating sequences and values are
        lists of 0-based starting positions in ascending order.

    Raises:
    ValueError: If min_length is smaller than 1.
    """
    # A zero-length window would report "" as a repeat and a negative one
    # would slice nonsense, so reject both up front.
    if min_length < 1:
        raise ValueError("min_length must be at least 1")

    genome_length = len(genome)
    # No window can be longer than the genome; clamping avoids looping over
    # lengths that cannot produce a single substring.
    if max_length is None or max_length > genome_length:
        max_length = genome_length

    repeats = defaultdict(list)
    for length in range(min_length, max_length + 1):
        # Start positions of every substring of the current length.
        seen = defaultdict(list)
        for start in range(genome_length - length + 1):
            seen[genome[start:start + length]].append(start)

        found_any = False
        for substring, positions in seen.items():
            if len(positions) > 1:
                repeats[substring].extend(positions)
                found_any = True

        # Every repeat of length L + 1 contains a repeat of length L (its
        # prefix, at the same start positions). Once a length yields no
        # repeats, no longer length can, so stop scanning.
        if not found_any:
            break
    return repeats
# Example usage
def main():
    """Run the repeat finder on a sample sequence and print each repeat found."""
    sample = "ATCGATCGAATTCGATCG"  # Example genome sequence
    shortest, longest = 2, 5
    found = find_repeats_in_genome(sample, shortest, longest)

    print("Repeating sequences:")
    for repeat in found:
        print(f"Sequence: {repeat}, Positions: {found[repeat]}")
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()