关于vb.net:在字符串中查找重复序列

Find a repetitive sequence in a string

我想在VB.Net的字符串中找到重复的序列,例如:

Dim test As String = "EDCRFVTGBEDCRFVTGBEDCRFVTGBEDCRFVTGBEDCRFVTGBEDCRFVTGBEDCRFVTGB"

我希望程序能检测出重复的序列(在本例中为 EDCRFVTGB),并统计它重复的次数。我的问题是如何在字符串中找到重复序列:我搜索了几种方法,但没有找到解决方案;我尝试过快速排序算法和查找重复项的算法,但其中一些不适用于字符串。

我想过创建子字符串并检查它们是否在字符串中出现,但我不知道该如何选取子字符串,因为字符串没有固定的模式,而且字符串中也可能根本不存在重复的序列。


这是一个示例,允许您指定序列的最小和最大长度,并返回一个自定义类的列表,该自定义类称为sequence,其中存在多个实例。序列类将包含找到的模式和该模式发生的索引列表。

enter image description here

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
Option Strict On
Option Explicit On
Option Infer Off
''' <summary>
''' Demo form: finds substrings that occur more than once in a sample text, lists them
''' in ListView1, and highlights the occurrences of the selected one in RichTextBox1.
''' </summary>
Public Class Form1
    ''' <summary>
    ''' Text that is scanned for repeated sequences and displayed for highlighting.
    ''' Kept in one place so the list and the highlighted text can never disagree.
    ''' </summary>
    Private Const SampleText As String = "EDCRFVTGBEDCRFVTGBEDCRFVTGBEDCRFVTGBEDCRFVTGBEDCRFVTGBEDCRFVTGB"

    ''' <summary>
    ''' Rebuilds the list view with one row per repeated sequence found in the sample text.
    ''' </summary>
    Private Sub Button1_Click(sender As Object, e As EventArgs) Handles Button1.Click

        ListView1.Items.Clear()
        ListView1.Columns.Clear()
        ListView1.Columns.Add("Sequence")
        ListView1.Columns.Add("Indexes of occurrence")
        Dim sequences As List(Of Sequence) = DetectSequences(SampleText)
        For Each s As Sequence In sequences
            Dim item As New ListViewItem(s.Sequence)
            'The Sequence object is stored on the row so the selection handler can read its indexes.
            item.Tag = s
            item.SubItems.Add(s.IndexesToString)
            ListView1.Items.Add(item)
        Next
        ListView1.AutoResizeColumns(ColumnHeaderAutoResizeStyle.HeaderSize)
    End Sub

    ''' <summary>
    ''' Finds every substring of <paramref name="s"/> whose length lies between
    ''' <paramref name="minLength"/> and <paramref name="MaxLength"/> (inclusive) and that
    ''' occurs at more than one start index. Overlapping occurrences are counted.
    ''' </summary>
    ''' <param name="s">Text to scan. Nothing or an empty string yields an empty list.</param>
    ''' <param name="minLength">Shortest substring length to consider; values below 1 behave like 1.</param>
    ''' <param name="MaxLength">Longest substring length to consider.</param>
    ''' <returns>
    ''' One Sequence per repeated substring, ordered by the position of its first occurrence
    ''' and then by length; each carries its start indexes in ascending order.
    ''' </returns>
    Function DetectSequences(s As String, Optional minLength As Integer = 5, Optional MaxLength As Integer = 8) As List(Of Sequence)
        Dim foundSequences As New List(Of Sequence)
        If String.IsNullOrEmpty(s) Then Return foundSequences

        'Substrings are never shorter than one character.
        Dim shortest As Integer = Math.Max(1, minLength)

        'candidates preserves first-seen order; byPattern gives constant-time lookup of a pattern.
        Dim candidates As New List(Of Sequence)
        Dim byPattern As New Dictionary(Of String, Sequence)

        For start As Integer = 0 To s.Length - 1
            'Only lengths inside the requested window that still fit in the text are generated.
            Dim longest As Integer = Math.Min(MaxLength, s.Length - start)
            For length As Integer = shortest To longest
                Dim pattern As String = s.Substring(start, length)
                Dim entry As Sequence = Nothing
                If Not byPattern.TryGetValue(pattern, entry) Then
                    entry = New Sequence With {.Sequence = pattern}
                    byPattern.Add(pattern, entry)
                    candidates.Add(entry)
                End If
                'Every start position is visited once, so indexes end up in ascending order.
                entry.Indexes.Add(start)
            Next
        Next

        For Each candidate As Sequence In candidates
            If candidate.Indexes.Count > 1 Then foundSequences.Add(candidate)
        Next
        Return foundSequences
    End Function

    ''' <summary>
    ''' A substring together with every start index at which it occurs.
    ''' </summary>
    Public Class Sequence
        ''' <summary>The substring itself.</summary>
        Public Sequence As String = ""
        ''' <summary>Zero-based start indexes of the substring.</summary>
        Public Indexes As New List(Of Integer)

        ''' <summary>
        ''' Formats the indexes as a comma-separated list without a trailing separator.
        ''' </summary>
        Public Function IndexesToString() As String
            Return String.Join(Of Integer)(",", Indexes)
        End Function
    End Class

    ''' <summary>
    ''' Shows the sample text and paints every occurrence of the selected sequence red.
    ''' </summary>
    Private Sub ListView1_SelectedIndexChanged(sender As Object, e As EventArgs) Handles ListView1.SelectedIndexChanged
        If ListView1.SelectedItems.Count = 0 Then Exit Sub
        'Clearing and re-assigning the text discards the highlighting of the previous selection.
        RichTextBox1.Clear()
        RichTextBox1.Text = SampleText
        Dim selectedSequence As Sequence = DirectCast(ListView1.SelectedItems(0).Tag, Sequence)
        For Each i As Integer In selectedSequence.Indexes
            RichTextBox1.SelectionStart = i
            RichTextBox1.SelectionLength = selectedSequence.Sequence.Length
            RichTextBox1.SelectionBackColor = Color.Red
        Next
    End Sub
End Class

首先检查目标字符串的一半是否重复两次。如果不是,请检查字符串的三分之一是否重复三次。如果不是,请检查字符串的四分之一是否重复四次。这样做直到找到匹配的序列。跳过商不是整数的除数,以使其表现更好。这段代码可以解决问题,并填补该描述未能阐明的任何空白:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
''' <summary>
''' Returns the prefix of <paramref name="strTarget"/> that, repeated back to back,
''' makes up the whole string. Candidates are tried from longest to shortest
''' (1/2 of the target, then 1/3, 1/4, ...), so the longest such prefix is returned.
''' </summary>
''' <param name="strTarget">The string to examine.</param>
''' <returns>
''' The repeated sequence, or an empty string when the target is Nothing, shorter than
''' two characters, or not made of a repeated sequence.
''' </returns>
Public Function DetermineSequence(ByVal strTarget As String) As String

    'A sequence must occur at least twice, so the target needs at least two characters.
    If strTarget Is Nothing OrElse strTarget.Length < 2 Then
        Return String.Empty
    End If

    Dim intLengthOfTarget As Integer = strTarget.Length

    'intCursor is the number of repetitions being tried. The upper bound is inclusive so
    'that a single repeated character (one repetition per character) is also detected.
    For intCursor As Integer = 2 To intLengthOfTarget

        'Don't even test the string if its length is not a divisor (to an Integer) of the length of the target String.
        If IsDividendDivisibleByDivisor(intLengthOfTarget, intCursor) Then

            'Integer division: the quotient is exact here and Substring needs an Integer.
            Dim strPossibleSequence As String = strTarget.Substring(0, intLengthOfTarget \ intCursor)

            'See if this possible sequence actually is the repeated String.
            If IsPossibleSequenceRepeatedThroughoutTarget(strPossibleSequence, strTarget) Then
                Return strPossibleSequence
            End If

        End If

    Next

    Return String.Empty

End Function

''' <summary>
''' Tells whether <paramref name="intDividend"/> divided by <paramref name="intDivisor"/>
''' leaves no remainder.
''' </summary>
''' <param name="intDividend">The number being divided.</param>
''' <param name="intDivisor">The number to divide by.</param>
''' <returns>True when the division is exact; False otherwise, including for a zero divisor.</returns>
Private Function IsDividendDivisibleByDivisor(ByVal intDividend As Integer, ByVal intDivisor As Integer) As Boolean

    'Nothing is divisible by zero; checking first also keeps Mod from throwing.
    If intDivisor = 0 Then
        Return False
    End If

    'The remainder is computed as Long so that Integer.MinValue with a divisor of -1 cannot overflow.
    Return (CLng(intDividend) Mod CLng(intDivisor)) = 0L

End Function

''' <summary>
''' Tells whether <paramref name="strTarget"/> consists solely of back-to-back copies of
''' <paramref name="strPossibleSequence"/>.
''' </summary>
''' <param name="strPossibleSequence">The candidate repeating unit.</param>
''' <param name="strTarget">The string that should be made of copies of the candidate.</param>
''' <returns>
''' True when every chunk of the target, including the first, equals the candidate
''' (ordinal comparison). False when either argument is Nothing, when exactly one of them
''' is empty, or when the candidate's length does not divide the target's length.
''' </returns>
Private Function IsPossibleSequenceRepeatedThroughoutTarget(ByVal strPossibleSequence As String, ByVal strTarget As String) As Boolean

    If strPossibleSequence Is Nothing OrElse strTarget Is Nothing Then
        Return False
    End If

    Dim intLengthOfTarget As Integer = strTarget.Length
    Dim intLengthOfPossibleSequence As Integer = strPossibleSequence.Length

    'An empty string only "repeats" into another empty string. Without this guard an empty
    'candidate would never advance through the target.
    If intLengthOfPossibleSequence = 0 OrElse intLengthOfTarget = 0 Then
        Return intLengthOfPossibleSequence = 0 AndAlso intLengthOfTarget = 0
    End If

    'Whole copies cannot fill a target whose length is not a multiple of the candidate's.
    If intLengthOfTarget Mod intLengthOfPossibleSequence <> 0 Then
        Return False
    End If

    'Compare chunk by chunk in place; stop at the first chunk that differs.
    For intOffset As Integer = 0 To intLengthOfTarget - intLengthOfPossibleSequence Step intLengthOfPossibleSequence

        If String.CompareOrdinal(strTarget, intOffset, strPossibleSequence, 0, intLengthOfPossibleSequence) <> 0 Then
            Return False
        End If

    Next

    Return True

End Function


你知道重复序列从哪里开始吗?
你知道它有多长吗?

简单的算法是:

1
2
3
4
for each character index i
    for each character index after that j
        compare substring(i, j-i) to substring(j, j-i)
        if equal, record as a found repeating substring

有一些优化,例如知道字符串不能超出字符串的末尾(j的上限),并且仅查找比您发现的更长的子字符串。

这不是超级有效的(N平方),但是也已知一个相关的广义问题("编辑距离")并不比N平方好,所以就去吧。


这是一种算法,该算法以按长度和首次出现顺序对其进行排序的方式增量生成所有重复序列。它基于一个简单的想法:要在一个句子中找到一个单词两次,同一起始字母必须出现两次。

下面是带有一些解释的Java代码(算法本身不变)。它会输出相互重叠的重复序列,例如 BANANA => A、N、AN、NA、ANA(出现在索引 1 和 3 处)。如果想排除重叠的出现,可以在算法中去掉那些与前一个出现位置的距离小于该子串长度的索引(代码后面的示例运行应该能更好地说明这一点):

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
public List<String> getRepetitions(String string) {
   List<String> repetitions = new ArrayList<String>();
   Map<String, List<Integer>> rep = new HashMap<String, List<Integer>>(), repOld;
   // init rep, add start position of all single character length strings
   for (int i = 0; i < string.length(); i++) {
      String s = string.substring(i, i + 1); // startIndex inclusive, endIndex exclusive
      if (rep.containsKey(s)) {
         rep.get(s).add(new Integer(i));
      } else {
         List<Integer> l = new ArrayList<Integer>();
         l.add(new Integer(i));
         rep.put(l);
      }
   }
   // eliminate those with no repetitions and add the others to the solution
   for (Map.Entry<String, Integer> e : rep.entrySet()) {
      if (e.getValue().size() < 2) {
         rep.remove(e.getKey());
      } else {
         repetitions.add(e.getKey());
      }
   }
   for (int len = 1; rep.size() > 0; len++) {
      repOld = rep;
      rep = new HashMap<String, List<Integer>>();
      for (Map.EntrySet<String, List<Integer>> e : repOld.entrySet()) {
         for (Integer i : e.getValue()) { // for all start indices
            if (i.intValue() + len + 1 >= string.length())
               break;
            String s = e.getKey() + string.charAt(i.intValue() + len + 1);
            if (rep.containsKey(s)) {
               rep.get(s).add(i);
            } else {
               List<Integer> l = new ArrayList<Integer>();
               l.add(i);
               rep.put(l);
            }
         }
      }
      // eliminate repetitions and add to solution
      for (Map.Entry<String, Integer> e : rep.entrySet()) {
         if (e.getValue().size() < 2) {
            rep.remove(e.getKey());
         } else {
            repetitions.add(e.getKey());
         }
      }
   }
   return repetitions; // ordered by length, so last = longest
}

BANANA的样品运行:

  • 将单个字母添加到rep => B-> [0],A-> [1、3、5],N-> [2、4]
  • 消除出现次数少于2的那些事件(B),将其他事件添加到解决方案中(A,N)
  • 将以下字母添加到其余出现的位置(创建新的rep):AN-> [1,3],NA-> [2,4]
  • 消除(-)并添加(AN,NA)
  • 重复步骤3和4.:ANA-> [1,3]
  • 下一轮中 rep 变为空,循环结束,算法完成