|
1 | 1 | """ |
2 | | -The algorithm finds the pattern in given text using following rule. |
| 2 | +Boyer-Moore string search (bad-character rule) - improved and compatible. |
3 | 3 |
|
4 | | -The bad-character rule considers the mismatched character in Text. |
5 | | -The next occurrence of that character to the left in Pattern is found, |
| 4 | +This module provides both a function API `boyer_moore_search(text, pattern)` |
| 5 | +and a class `BoyerMooreSearch` with method `bad_character_heuristic()` so it |
| 6 | +remains compatible with existing usage while improving correctness, clarity, |
| 7 | +and performance. |
6 | 8 |
|
7 | | -If the mismatched character occurs to the left in Pattern, |
8 | | -a shift is proposed that aligns text block and pattern. |
| 9 | +- Precomputes the bad-character table for O(1) lookup per mismatch. |
| 10 | +- Uses a while-loop with proper shifting logic. |
| 11 | +- Handles edge-cases (empty pattern matches at all positions). |
| 12 | +- Includes doctests for typical cases and edge cases. |
9 | 13 |
|
10 | | -If the mismatched character does not occur to the left in Pattern, |
11 | | -a shift is proposed that moves the entirety of Pattern past |
12 | | -the point of mismatch in the text. |
13 | | -
|
14 | | -If there is no mismatch then the pattern matches with text block. |
15 | | -
|
16 | | -Time Complexity : O(n/m) |
17 | | - n=length of main string |
18 | | - m=length of pattern string |
| 14 | +Author: your-github-username |
| 15 | +License: MIT |
19 | 16 | """ |
20 | 17 |
|
| 18 | +from __future__ import annotations |
21 | 19 |
|
22 | | -class BoyerMooreSearch: |
23 | | - """ |
24 | | - Example usage: |
25 | | -
|
26 | | - bms = BoyerMooreSearch(text="ABAABA", pattern="AB") |
27 | | - positions = bms.bad_character_heuristic() |
28 | | -
|
29 | | - where 'positions' contain the locations where the pattern was matched. |
30 | | - """ |
31 | | - |
32 | | - def __init__(self, text: str, pattern: str): |
33 | | - self.text, self.pattern = text, pattern |
34 | | - self.textLen, self.patLen = len(text), len(pattern) |
35 | | - |
36 | | - def match_in_pattern(self, char: str) -> int: |
37 | | - """ |
38 | | - Finds the index of char in pattern in reverse order. |
| 20 | +# We intentionally use built-in `dict` and `list` annotations (PEP 585). |
39 | 21 |
|
40 | | - Parameters : |
41 | | - char (chr): character to be searched |
42 | 22 |
|
43 | | - Returns : |
44 | | - i (int): index of char from last in pattern |
45 | | - -1 (int): if char is not found in pattern |
| 23 | +def _build_bad_char_table(pattern: str) -> dict[str, int]: |
| 24 | + """Build mapping char -> last index in pattern.""" |
| 25 | + table: dict[str, int] = {} |
| 26 | + for i, ch in enumerate(pattern): |
| 27 | + table[ch] = i |
| 28 | + return table |
46 | 29 |
|
47 | | - >>> bms = BoyerMooreSearch(text="ABAABA", pattern="AB") |
48 | | - >>> bms.match_in_pattern("B") |
49 | | - 1 |
50 | | - """ |
51 | 30 |
|
52 | | - for i in range(self.patLen - 1, -1, -1): |
53 | | - if char == self.pattern[i]: |
54 | | - return i |
55 | | - return -1 |
| 31 | +def boyer_moore_search(text: str, pattern: str) -> list[int]: |
| 32 | + """Return list of start indices where pattern occurs in text. |
56 | 33 |
|
57 | | - def mismatch_in_text(self, current_pos: int) -> int: |
58 | | - """ |
59 | | - Find the index of mis-matched character in text when compared with pattern |
60 | | - from last. |
| 34 | + >>> _build_bad_char_table("abcab") |
| 35 | + {'a': 3, 'b': 4, 'c': 2} |
| 36 | + >>> boyer_moore_search("abacaabaccabacaba", "aba") |
| 37 | + [0, 5, 10, 14] |
| 38 | + >>> boyer_moore_search("aaaaa", "aa") |
| 39 | + [0, 1, 2, 3] |
| 40 | + >>> boyer_moore_search("hello", "world") |
| 41 | + [] |
| 42 | + >>> boyer_moore_search("", "") |
| 43 | + [0] |
| 44 | + >>> boyer_moore_search("abc", "") |
| 45 | + [0, 1, 2, 3] |
| 46 | + >>> boyer_moore_search("", "a") |
| 47 | + [] |
| 48 | + """ |
| 49 | + if pattern == "": |
| 50 | + return list(range(len(text) + 1)) |
| 51 | + if text == "" or len(pattern) > len(text): |
| 52 | + return [] |
| 53 | + |
| 54 | + n, m = len(text), len(pattern) |
| 55 | + bad = _build_bad_char_table(pattern) |
| 56 | + |
| 57 | + results: list[int] = [] |
| 58 | + s = 0 # shift of the pattern with respect to text |
| 59 | + while s <= n - m: |
| 60 | + j = m - 1 |
| 61 | + while j >= 0 and pattern[j] == text[s + j]: |
| 62 | + j -= 1 |
| 63 | + if j < 0: |
| 64 | + results.append(s) |
| 65 | + # allow overlapping matches: shift by 1 to check next possible start |
| 66 | + s += 1 |
| 67 | + else: |
| 68 | + last = bad.get(text[s + j], -1) |
| 69 | + shift = j - last |
| 70 | + s += shift if shift > 0 else 1 |
| 71 | + return results |
61 | 72 |
|
62 | | - Parameters : |
63 | | - current_pos (int): current index position of text |
64 | 73 |
|
65 | | - Returns : |
66 | | - i (int): index of mismatched char from last in text |
67 | | - -1 (int): if there is no mismatch between pattern and text block |
| 74 | +class BoyerMooreSearch: |
| 75 | + """Compatibility wrapper class around boyer_moore_search. |
68 | 76 |
|
69 | | - >>> bms = BoyerMooreSearch(text="ABAABA", pattern="AB") |
70 | | - >>> bms.mismatch_in_text(2) |
71 | | - 3 |
72 | | - """ |
| 77 | + Example: |
| 78 | + >>> bms = BoyerMooreSearch(text="ABAABA", pattern="AB") |
| 79 | + >>> bms.bad_character_heuristic() |
| 80 | + [0, 3] |
| 81 | + """ |
73 | 82 |
|
74 | | - for i in range(self.patLen - 1, -1, -1): |
75 | | - if self.pattern[i] != self.text[current_pos + i]: |
76 | | - return current_pos + i |
77 | | - return -1 |
| 83 | + def __init__(self, text: str, pattern: str) -> None: |
| 84 | + self.text = text |
| 85 | + self.pattern = pattern |
78 | 86 |
|
79 | 87 | def bad_character_heuristic(self) -> list[int]: |
80 | | - """ |
81 | | - Finds the positions of the pattern location. |
82 | | -
|
83 | | - >>> bms = BoyerMooreSearch(text="ABAABA", pattern="AB") |
84 | | - >>> bms.bad_character_heuristic() |
85 | | - [0, 3] |
86 | | - """ |
87 | | - |
88 | | - positions = [] |
89 | | - for i in range(self.textLen - self.patLen + 1): |
90 | | - mismatch_index = self.mismatch_in_text(i) |
91 | | - if mismatch_index == -1: |
92 | | - positions.append(i) |
93 | | - else: |
94 | | - match_index = self.match_in_pattern(self.text[mismatch_index]) |
95 | | - i = ( |
96 | | - mismatch_index - match_index |
97 | | - ) # shifting index lgtm [py/multiple-definition] |
98 | | - return positions |
| 88 | + """Return match positions using bad-character heuristic.""" |
| 89 | + return boyer_moore_search(self.text, self.pattern) |
99 | 90 |
|
100 | 91 |
|
101 | 92 | if __name__ == "__main__": |
|
0 commit comments