Skip to content

Commit e3df7cc

Browse files
committed
Improve Boyer–Moore search: fix shifting, add bad-char table, doctests
1 parent a71618f commit e3df7cc

1 file changed

Lines changed: 71 additions & 80 deletions

File tree

strings/boyer_moore_search.py

Lines changed: 71 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -1,101 +1,92 @@
11
"""
2-
The algorithm finds the pattern in given text using following rule.
2+
Boyer-Moore string search (bad-character rule) - improved and compatible.
33
4-
The bad-character rule considers the mismatched character in Text.
5-
The next occurrence of that character to the left in Pattern is found,
4+
This module provides both a function API `boyer_moore_search(text, pattern)`
5+
and a class `BoyerMooreSearch` with method `bad_character_heuristic()` so it
6+
remains compatible with existing usage while improving correctness, clarity,
7+
and performance.
68
7-
If the mismatched character occurs to the left in Pattern,
8-
a shift is proposed that aligns text block and pattern.
9+
- Precomputes the bad-character table for O(1) lookup per mismatch.
10+
- Uses a while-loop with proper shifting logic.
11+
- Handles edge-cases (empty pattern matches at all positions).
12+
- Includes doctests for typical cases and edge cases.
913
10-
If the mismatched character does not occur to the left in Pattern,
11-
a shift is proposed that moves the entirety of Pattern past
12-
the point of mismatch in the text.
13-
14-
If there is no mismatch then the pattern matches with text block.
15-
16-
Time Complexity : O(n/m)
17-
n=length of main string
18-
m=length of pattern string
14+
Author: your-github-username
15+
License: MIT
1916
"""
2017

18+
from __future__ import annotations
2119

22-
class BoyerMooreSearch:
23-
"""
24-
Example usage:
25-
26-
bms = BoyerMooreSearch(text="ABAABA", pattern="AB")
27-
positions = bms.bad_character_heuristic()
28-
29-
where 'positions' contain the locations where the pattern was matched.
30-
"""
31-
32-
def __init__(self, text: str, pattern: str):
33-
self.text, self.pattern = text, pattern
34-
self.textLen, self.patLen = len(text), len(pattern)
35-
36-
def match_in_pattern(self, char: str) -> int:
37-
"""
38-
Finds the index of char in pattern in reverse order.
20+
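# Annotations use the built-in generics `dict[...]` and `list[...]` (PEP 585); the `from __future__ import annotations` import above defers their evaluation, so they are also accepted on Python 3.7 and 3.8.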
3921

40-
Parameters :
41-
char (chr): character to be searched
4222

43-
Returns :
44-
i (int): index of char from last in pattern
45-
-1 (int): if char is not found in pattern
23+
def _build_bad_char_table(pattern: str) -> dict[str, int]:
24+
"""Build mapping char -> last index in pattern."""
25+
table: dict[str, int] = {}
26+
for i, ch in enumerate(pattern):
27+
table[ch] = i
28+
return table
4629

47-
>>> bms = BoyerMooreSearch(text="ABAABA", pattern="AB")
48-
>>> bms.match_in_pattern("B")
49-
1
50-
"""
5130

52-
for i in range(self.patLen - 1, -1, -1):
53-
if char == self.pattern[i]:
54-
return i
55-
return -1
31+
def boyer_moore_search(text: str, pattern: str) -> list[int]:
    """Return every start index at which ``pattern`` occurs in ``text``.

    The search uses the Boyer-Moore bad-character rule: each window is
    compared right to left, and on a mismatch the pattern is shifted so that
    the mismatched text character lines up with its last occurrence in the
    pattern (or moves past it entirely when the character is absent).
    Overlapping matches are reported.

    Args:
        text: The string to search in.
        pattern: The string to search for. An empty pattern matches at every
            position, including the position just past the end of ``text``.

    Returns:
        The match start indices in increasing order; empty if there are none.

    The lookup table maps each pattern character to its last index:

    >>> _build_bad_char_table("abcab")
    {'a': 3, 'b': 4, 'c': 2}
    >>> boyer_moore_search("abacaabaccabacaba", "aba")
    [0, 5, 10, 14]
    >>> boyer_moore_search("aaaaa", "aa")
    [0, 1, 2, 3]
    >>> boyer_moore_search("hello", "world")
    []
    >>> boyer_moore_search("", "")
    [0]
    >>> boyer_moore_search("abc", "")
    [0, 1, 2, 3]
    >>> boyer_moore_search("", "a")
    []
    """
    n, m = len(text), len(pattern)
    if m == 0:
        return list(range(n + 1))
    if m > n:
        # Also covers an empty text, because m >= 1 at this point.
        return []

    bad = _build_bad_char_table(pattern)

    results: list[int] = []
    s = 0  # current alignment of pattern[0] against text
    while s <= n - m:
        # Compare the window right to left; j stops on the first mismatch.
        j = m - 1
        while j >= 0 and pattern[j] == text[s + j]:
            j -= 1

        if j < 0:
            results.append(s)
            # Advance by one so that overlapping matches are still found.
            s += 1
        else:
            # The last occurrence may lie to the right of j, which would give
            # a non-positive shift; always move forward by at least one.
            s += max(1, j - bad.get(text[s + j], -1))
    return results
6172

62-
Parameters :
63-
current_pos (int): current index position of text
6473

65-
Returns :
66-
i (int): index of mismatched char from last in text
67-
-1 (int): if there is no mismatch between pattern and text block
74+
class BoyerMooreSearch:
75+
"""Compatibility wrapper class around boyer_moore_search.
6876
69-
>>> bms = BoyerMooreSearch(text="ABAABA", pattern="AB")
70-
>>> bms.mismatch_in_text(2)
71-
3
72-
"""
77+
Example:
78+
>>> bms = BoyerMooreSearch(text="ABAABA", pattern="AB")
79+
>>> bms.bad_character_heuristic()
80+
[0, 3]
81+
"""
7382

74-
for i in range(self.patLen - 1, -1, -1):
75-
if self.pattern[i] != self.text[current_pos + i]:
76-
return current_pos + i
77-
return -1
83+
def __init__(self, text: str, pattern: str) -> None:
84+
self.text = text
85+
self.pattern = pattern
7886

7987
def bad_character_heuristic(self) -> list[int]:
80-
"""
81-
Finds the positions of the pattern location.
82-
83-
>>> bms = BoyerMooreSearch(text="ABAABA", pattern="AB")
84-
>>> bms.bad_character_heuristic()
85-
[0, 3]
86-
"""
87-
88-
positions = []
89-
for i in range(self.textLen - self.patLen + 1):
90-
mismatch_index = self.mismatch_in_text(i)
91-
if mismatch_index == -1:
92-
positions.append(i)
93-
else:
94-
match_index = self.match_in_pattern(self.text[mismatch_index])
95-
i = (
96-
mismatch_index - match_index
97-
) # shifting index lgtm [py/multiple-definition]
98-
return positions
88+
"""Return match positions using bad-character heuristic."""
89+
return boyer_moore_search(self.text, self.pattern)
9990

10091

10192
if __name__ == "__main__":

0 commit comments

Comments
 (0)