-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrape.py
More file actions
155 lines (131 loc) · 4.63 KB
/
scrape.py
File metadata and controls
155 lines (131 loc) · 4.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
"""Fetch syzbot per-bug metadata (title, crash class, subsystems) into db.
Rate-limited and resumable. Re-run any time — only bug_ids that have no row in
the bugs table yet are fetched, so fetches that failed are retried on the next run.
"""
from __future__ import annotations
import re
import sys
import time
from dataclasses import dataclass
import requests
from .db import connect, init_db
# URL template for a single bug page; formatted with the bug's id.
BUG_URL = "https://syzkaller.appspot.com/bug?id={bug_id}"
# Captures the contents of the page's <title> element (re.S lets it span lines).
TITLE_RE = re.compile(r"<title>(.*?)</title>", re.S)
# Captures the text following a "Status:" label, skipping at most one HTML tag
# between the label and the text. Case-insensitive.
STATUS_RE = re.compile(r"Status:\s*(?:<[^>]+>)?\s*([^<\n]+)", re.I)
# Captures the subsystem name out of links of the form /upstream/s/<name>.
SUBSYS_RE = re.compile(r'/upstream/s/([a-z0-9_\-]+)')
# Ordered: first match wins. Covers ~all syzbot crash classes.
# _extract_crash_type compares these case-insensitively against the start of
# the title and returns the entry exactly as spelled here.
CRASH_PREFIXES = [
"KASAN", "KMSAN", "KCSAN", "UBSAN", "KFENCE",
"WARNING", "BUG", "INFO",
"general protection fault",
"possible deadlock",
"inconsistent lock state",
"memory leak",
"unregister_netdevice",
"divide error",
"suppressed report",
"kernel BUG",
"stack segment",
"unable to handle",
"lost connection to test machine",
]
@dataclass
class BugMeta:
    """Metadata extracted from one bug page.

    Only ``bug_id`` is always present; every other field is ``None`` when
    the corresponding piece of information could not be extracted.
    """

    # Identifier of the bug this record describes.
    bug_id: str
    # Page title with whitespace runs collapsed to single spaces.
    title: str | None
    # Crash class derived from the start of the title.
    crash_type: str | None
    # Primary subsystem, i.e. the first one listed on the page.
    subsystem: str | None
    # Text found after the page's "Status:" label.
    status: str | None
    # Title string as stored alongside ``title``.
    raw_title: str | None
def _extract_crash_type(title: str) -> str | None:
    """Derive a crash class from the beginning of a bug title.

    The stripped title is compared case-insensitively against each entry of
    ``CRASH_PREFIXES`` in list order; the first entry the title starts with
    is returned as spelled in the list. If none matches, the text before the
    first colon or whitespace character is returned instead, or ``None``
    when that text is empty.
    """
    stripped = title.strip()
    lowered = stripped.lower()
    known = next(
        (prefix for prefix in CRASH_PREFIXES if lowered.startswith(prefix.lower())),
        None,
    )
    if known is not None:
        return known
    # Fallback: the leading token, cut at the first colon or whitespace.
    first_word = re.split(r"[:\s]", stripped, maxsplit=1)[0]
    return first_word if first_word else None
def parse_bug_html(bug_id: str, html: str) -> BugMeta:
    """Extract title, status, primary subsystem and crash type from a bug page.

    Args:
        bug_id: Identifier of the bug the page belongs to; copied into the
            result unchanged.
        html: HTML text of the bug page.

    Returns:
        A ``BugMeta`` whose optional fields are ``None`` wherever the
        corresponding pattern did not match. ``crash_type`` is only derived
        when a non-empty title was found.
    """
    # Imported locally under a private alias because the ``html`` parameter
    # shadows the standard-library module of the same name.
    from html import unescape as _unescape

    title = None
    title_m = TITLE_RE.search(html)
    if title_m:
        # Decode entities (e.g. &#39;, &amp;) before collapsing whitespace so
        # the stored title is the text a reader sees, not its HTML encoding.
        title = re.sub(r"\s+", " ", _unescape(title_m.group(1)).strip())

    status_m = STATUS_RE.search(html)
    status = _unescape(status_m.group(1)).strip() if status_m else None

    # Subsystem links appear in the "Subsystems:" panel and also in search
    # links. Prefer the first hit after the "Subsystems:" label, else any.
    sub_section = html
    idx = html.find("Subsystems:")
    if idx != -1:  # find() reports "absent" as -1; offset 0 is a valid hit
        # Only look at the 2000 characters starting at the label.
        sub_section = html[idx:idx + 2000]
    subs = SUBSYS_RE.findall(sub_section)
    subsystem = subs[0] if subs else None

    crash = _extract_crash_type(title) if title else None
    # NOTE(review): raw_title receives the same normalized string as title;
    # confirm whether the un-normalized <title> text was intended instead.
    return BugMeta(bug_id=bug_id, title=title, crash_type=crash,
                   subsystem=subsystem, status=status, raw_title=title)
def fetch_meta(session: requests.Session, bug_id: str, timeout: float = 30.0) -> BugMeta | None:
    """Download one bug page and parse it.

    Args:
        session: Session used to issue the GET request.
        bug_id: Identifier substituted into ``BUG_URL``.
        timeout: Timeout passed to the request, in seconds.

    Returns:
        The parsed ``BugMeta``, or ``None`` when the request raised a
        ``requests.RequestException`` or the response status was not 200.
        Both failure cases print a one-line message.
    """
    url = BUG_URL.format(bug_id=bug_id)
    try:
        response = session.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f" fetch error {bug_id}: {exc}")
        return None
    if response.status_code == 200:
        return parse_bug_html(bug_id, response.text)
    print(f" http {response.status_code} for {bug_id}")
    return None
def scrape(limit: int | None = None, delay: float = 0.8) -> None:
    """Fetch and store metadata for every bug_id that has no ``bugs`` row yet.

    Pending ids are the distinct non-NULL ``programs.bug_id`` values with no
    matching row in ``bugs``, in ascending order. Each is fetched with
    ``fetch_meta`` and upserted into ``bugs``; ids whose fetch fails are
    skipped and therefore stay pending for the next run.

    Args:
        limit: If not ``None``, process at most this many pending ids.
        delay: Minimum number of seconds between the end of one fetch and
            the start of the next.
    """
    init_db()
    conn = connect()
    try:
        cur = conn.cursor()
        cur.execute(
            """
            SELECT DISTINCT p.bug_id
            FROM programs p
            LEFT JOIN bugs b ON b.bug_id = p.bug_id
            WHERE p.bug_id IS NOT NULL AND b.bug_id IS NULL
            ORDER BY p.bug_id
            """
        )
        pending = [row[0] for row in cur.fetchall()]
        if limit is not None:
            pending = pending[:limit]
        total = len(pending)
        print(f"{total} bug_ids to fetch")

        # The context manager closes the session's pooled connections on exit.
        with requests.Session() as session:
            session.headers.update({"User-Agent": "kmap/0.1 (corpus explorer)"})
            # Monotonic clock: wall-clock adjustments must not stretch or
            # shrink the pause. -inf guarantees no pause before the first fetch.
            last_t = float("-inf")
            for i, bug_id in enumerate(pending, 1):
                gap = delay - (time.monotonic() - last_t)
                if gap > 0:
                    time.sleep(gap)
                meta = fetch_meta(session, bug_id)
                last_t = time.monotonic()
                if meta is not None:
                    cur.execute(
                        """
                        INSERT INTO bugs (bug_id, title, crash_type, subsystem, status, fetched_at, raw_title)
                        VALUES (?, ?, ?, ?, ?, ?, ?)
                        ON CONFLICT(bug_id) DO UPDATE SET
                        title=excluded.title,
                        crash_type=excluded.crash_type,
                        subsystem=excluded.subsystem,
                        status=excluded.status,
                        fetched_at=excluded.fetched_at,
                        raw_title=excluded.raw_title
                        """,
                        (meta.bug_id, meta.title, meta.crash_type, meta.subsystem,
                         meta.status, int(time.time()), meta.raw_title),
                    )
                # Checked on every iteration, including failed fetches, so a
                # failure landing on a multiple of 50 cannot postpone the
                # periodic commit and progress line.
                if i % 50 == 0:
                    conn.commit()
                    print(f" {i}/{total} fetched")
    finally:
        # Runs on normal completion, on errors and on Ctrl-C: every stored
        # row is an independent upsert, so persisting partial progress is
        # safe and is what makes an interrupted run resumable without loss.
        conn.commit()
        conn.close()
    print("scrape complete")
if __name__ == "__main__":
    # Command line: an optional --limit=N caps how many bug_ids are fetched.
    # When the flag is repeated, the last occurrence wins.
    limit = None
    for arg in sys.argv[1:]:
        if not arg.startswith("--limit="):
            continue
        limit = int(arg.partition("=")[2])
    scrape(limit=limit)