Skip to content

text

detect_consecutive_repetition(text, min_len=3, threshold=10)

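Detect consecutive repeated content in the text. :param text: Input text :param min_len: Minimum length of the repeated chunk (in words) :param threshold: Repeat count that must be exceeded for a repetition to be reported :return: A dict mapping the repeated chunk to its repeat count, or an empty dict if no such repetition is found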

Source code in rm_gallery/core/utils/text.py
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
def detect_consecutive_repetition(text, min_len=3, threshold=10):
    """
    Detect consecutive repeated content in the text.

    The text is split on whitespace and scanned for a chunk of words that is
    immediately followed by copies of itself. Chunk lengths from ``min_len``
    to ``min_len + 4`` words are tried, shortest first, and for each length
    the start positions are scanned left to right. The first run whose repeat
    count is strictly greater than ``threshold`` is returned.

    :param text: Input text
    :param min_len: Minimum length of the repeated chunk (in words). Chunk
        lengths below 1 are never tried, so a non-positive value only
        narrows the range of lengths that are searched.
    :param threshold: Repeat count that must be exceeded (``count >
        threshold``) for a repetition to be reported
    :return: A single-entry dict ``{chunk: count}`` where ``chunk`` is the
        repeated words joined by single spaces and ``count`` is the number
        of consecutive occurrences, or an empty dict if nothing qualifies
    """
    # Split the text into words
    words = text.split()
    total = len(words)

    # A zero-length chunk equals itself at every offset, so the counting loop
    # below would never terminate; negative lengths yield meaningless slices.
    # Start the search at one word at the very least.
    shortest = max(min_len, 1)
    # Exclusive upper bound: at most five lengths (min_len .. min_len + 4),
    # and never longer than half the text, since a chunk must fit twice.
    stop = min(min_len + 5, total // 2 + 1)

    for n in range(shortest, stop):
        # Slide a window of size n*2 over the words
        for i in range(total - n * 2 + 1):
            chunk = words[i : i + n]
            # Skip positions where the chunk is not immediately repeated
            if chunk != words[i + n : i + 2 * n]:
                continue
            # Count how many times the chunk is repeated consecutively
            count = 2
            while (
                i + (count + 1) * n <= total
                and chunk == words[i + n * count : i + n * (count + 1)]
            ):
                count += 1
            if count > threshold:
                return {" ".join(chunk): count}

    return {}