-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathrun.py
More file actions
148 lines (119 loc) · 4.95 KB
/
Copy pathrun.py
File metadata and controls
148 lines (119 loc) · 4.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
"""
ScoutBot main runner.
Usage:
python run.py # Full pipeline: scrape → cleanup closed → update sheet → email
python run.py --scrape # Only scrape (update sheet, no email)
python run.py --cleanup # Only remove closed opportunities from the sheet
python run.py --notify # Only send email (no scraping)
python run.py --schedule # Run on schedule: full pipeline at 7AM and 7PM daily
The full pipeline order is:
1. Scrape every source for new opportunities → adds new rows
2. Clean closed opportunities → removes expired rows
3. Broadcast to WhatsApp campus groups → skipped when not configured
4. Send email digest → sends the live list
"""
import argparse
import logging
import os
import subprocess
import sys
# Directory that contains this script; child processes and local imports are
# anchored here.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

# Names of the Scrapy spiders that make up a scrape run.
SPIDERS = ["opportunities"]

# Send every record both to stdout and to scoutbot.log next to this script.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(os.path.join(SCRIPT_DIR, "scoutbot.log")),
    ],
)
logger = logging.getLogger(__name__)
def run_spider(spider_name):
    """Run one Scrapy spider in a child process and report the outcome.

    The spider is launched as ``scrapy crawl <spider_name> --logfile
    scrapy.log`` with SCRIPT_DIR as the working directory. Failures are
    logged rather than raised so that the remaining pipeline steps still run.

    Args:
        spider_name: Spider name handed to ``scrapy crawl``.

    Returns:
        True when the process exited with code 0, False when it exited with
        a non-zero code or could not be started at all.
    """
    logger.info(f"run.py: Starting spider '{spider_name}'...")
    try:
        result = subprocess.run(
            ["scrapy", "crawl", spider_name, "--logfile", "scrapy.log"],
            cwd=SCRIPT_DIR,
        )
    except OSError as exc:
        # Raised when the executable cannot be launched (for example scrapy
        # is not installed or not on PATH). Treat it like a failed crawl
        # instead of letting it abort the whole pipeline or scheduler.
        logger.error(f"run.py: Spider '{spider_name}' could not be started: {exc}")
        return False

    if result.returncode != 0:
        logger.error(f"run.py: Spider '{spider_name}' exited with code {result.returncode}")
        return False

    logger.info(f"run.py: Spider '{spider_name}' done.")
    return True
def run_all_spiders():
    """Run every spider named in SPIDERS, one after another, in list order."""
    for name in SPIDERS:
        run_spider(name)
def run_cleanup():
    """Remove closed/expired opportunities from the Google Sheet.

    The work is delegated to ``cleanup()`` from the ``cleanup`` module, which
    is imported lazily so that the other run modes do not need it.
    """
    # Make the script directory importable, but only add it once: this
    # function runs on every scheduled pipeline pass, and an unconditional
    # insert would grow sys.path by one entry per call.
    if SCRIPT_DIR not in sys.path:
        sys.path.insert(0, SCRIPT_DIR)
    from cleanup import cleanup

    cleanup()
def run_notify(dry_run=False):
    """Read the sheet and email the digest to all subscribers.

    The work is delegated to ``run_notify`` from the ``notify`` module, which
    is imported lazily so that the other run modes do not need it.

    Args:
        dry_run: Forwarded unchanged to ``notify.run_notify``. The CLI help
            describes it as building email_preview.html without sending.
    """
    # Make the script directory importable, but only add it once: this
    # function runs on every scheduled pipeline pass, and an unconditional
    # insert would grow sys.path by one entry per call.
    if SCRIPT_DIR not in sys.path:
        sys.path.insert(0, SCRIPT_DIR)
    from notify import run_notify as _run_notify

    _run_notify(dry_run=dry_run)
def run_broadcast():
    """
    Run distribution-bridge/broadcast.py as a child process after scraping.

    The script is invoked with ``--source json`` from inside the
    distribution-bridge directory, using the same Python interpreter as this
    process. According to the original notes it reads the opportunities.json
    written by WhatsAppQueuePipeline and fans out to the registered WhatsApp
    campus groups via the Session Manager API.

    The step is skipped (with an info log line) when broadcast.py does not
    exist or when SESSION_API_URL is unset or blank. A non-zero exit code is
    logged as a warning and does not raise.
    """
    bridge_dir = os.path.join(SCRIPT_DIR, "distribution-bridge")
    script_path = os.path.join(bridge_dir, "broadcast.py")

    if not os.path.exists(script_path):
        logger.info("run.py: distribution-bridge/broadcast.py not found — skipping WhatsApp broadcast.")
        return

    # A value that is only whitespace counts as "not set".
    if not os.environ.get("SESSION_API_URL", "").strip():
        logger.info("run.py: SESSION_API_URL not set — skipping WhatsApp broadcast.")
        return

    logger.info("run.py: Starting WhatsApp broadcast...")
    command = [sys.executable, script_path, "--source", "json"]
    exit_code = subprocess.run(command, cwd=bridge_dir).returncode

    if exit_code == 0:
        logger.info("run.py: WhatsApp broadcast complete.")
    else:
        logger.warning(f"run.py: broadcast.py exited with code {exit_code} — check distribution-bridge logs.")
def full_pipeline():
    """Run the complete pipeline once: scrape, clean up, broadcast, then email.

    Steps run strictly in that order, with START and COMPLETE markers logged
    around them.
    """
    logger.info("run.py: === Full pipeline START ===")
    for step in (run_all_spiders, run_cleanup, run_broadcast):
        step()
    # The email digest goes last and is always a real send in this mode.
    run_notify(dry_run=False)
    logger.info("run.py: === Full pipeline COMPLETE ===")
def _utc_to_local_hhmm(utc_hhmm):
    """Convert an "HH:MM" UTC wall-clock time to the server's local "HH:MM"."""
    from datetime import datetime, timezone

    hour, minute = (int(part) for part in utc_hhmm.split(":"))
    utc_moment = datetime.now(timezone.utc).replace(
        hour=hour, minute=minute, second=0, microsecond=0
    )
    # astimezone() with no argument converts to the system's local timezone.
    return utc_moment.astimezone().strftime("%H:%M")


def run_schedule():
    """Run the full pipeline now, then twice daily at 06:00 and 18:00 UTC.

    The targets are 07:00 and 19:00 WAT (West Africa Time = UTC+1), i.e.
    06:00 and 18:00 UTC. This function never returns: after the immediate
    start-up run it polls the scheduler once a minute.
    """
    import schedule
    import time

    logger.info("run.py: Scheduler started. Will run at 06:00 UTC (07:00 WAT) and 18:00 UTC (19:00 WAT) daily.")

    # schedule's .at("HH:MM") is interpreted in the server's local time, so
    # the UTC targets are translated to local wall-clock time first. Without
    # this the jobs only fire at the intended moment on a server set to UTC.
    # NOTE: the offset is computed once at start-up; on a host whose timezone
    # observes DST the jobs shift by an hour after a transition until restart.
    for utc_time in ("06:00", "18:00"):
        local_time = _utc_to_local_hhmm(utc_time)
        logger.info("run.py: Scheduling daily run at %s local time (%s UTC).", local_time, utc_time)
        schedule.every().day.at(local_time).do(full_pipeline)

    # Run immediately on startup so first results appear right away.
    full_pipeline()

    while True:
        schedule.run_pending()
        time.sleep(60)
def main():
    """Parse the command-line flags and run the selected ScoutBot mode.

    Flags are checked in the order --scrape, --cleanup, --notify/--dry-run,
    --schedule, and the first one that is set decides the mode. With no flag
    at all the full pipeline runs once.
    """
    cli = argparse.ArgumentParser(description="ScoutBot")
    flag_help = (
        ("--scrape", "Only scrape (update sheet, no email)"),
        ("--cleanup", "Only remove closed opportunities from the sheet"),
        ("--notify", "Only send email"),
        ("--dry-run", "Build email_preview.html without sending"),
        ("--schedule", "Run on schedule (7AM + 7PM daily)"),
    )
    for flag, text in flag_help:
        cli.add_argument(flag, action="store_true", help=text)
    opts = cli.parse_args()

    if opts.scrape:
        run_all_spiders()
        return
    if opts.cleanup:
        run_cleanup()
        return
    if opts.notify or opts.dry_run:
        # --dry-run on its own also selects the notify mode.
        run_notify(dry_run=opts.dry_run)
        return
    if opts.schedule:
        run_schedule()
        return
    full_pipeline()


if __name__ == "__main__":
    main()