-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmining_script.py
More file actions
133 lines (112 loc) · 4.76 KB
/
mining_script.py
File metadata and controls
133 lines (112 loc) · 4.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import tweepy
from tweepy import OAuthHandler
import json
import datetime as dt
import time
import os
import sys
def load_api():
consumer_key = "****"
consumer_secret = "****"
access_token = "****"
access_secret = "****"
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
return tweepy.API(auth)
def tweet_search(api, query, max_tweets, max_id, since_id, geocode):
searched_tweets = []
while len(searched_tweets) < max_tweets:
remaining_tweets = max_tweets - len(searched_tweets)
try:
new_tweets = api.search(q=query, count=remaining_tweets,
since_id=str(since_id),
max_id=str(max_id-1))
print('found',len(new_tweets),'tweets')
if not new_tweets:
print('no tweets found')
break
searched_tweets.extend(new_tweets)
max_id = new_tweets[-1].id
except tweepy.TweepError:
print('exception raised, waiting 15 minutes')
print('(until:', dt.datetime.now()+dt.timedelta(minutes=15), ')')
time.sleep(15*60)
break
return searched_tweets, max_id
def get_tweet_id(api, date='', days_ago=9, query='a'):
if date:
td = date + dt.timedelta(days=1)
tweet_date = '{0}-{1:0>2}-{2:0>2}'.format(td.year, td.month, td.day)
tweet = api.search(q=query, count=1, until=tweet_date)
else:
td = dt.datetime.now() - dt.timedelta(days=days_ago)
tweet_date = '{0}-{1:0>2}-{2:0>2}'.format(td.year, td.month, td.day)
tweet = api.search(q=query, count=10, until=tweet_date)
print('search limit (start/stop):',tweet[0].created_at)
return tweet[0].id
def write_tweets(tweets, filename):
with open(filename, 'a') as f:
for tweet in tweets:
if(tweet.lang=="en"):
json.dump(tweet._json, f)
f.write('\n')
def main():
search_phrases = ['ngo']
time_limit = 10
max_tweets = 100
min_days_old, max_days_old = 6, 7
USA = '39.8,-95.583068847656,2500km'
for search_phrase in search_phrases:
print('Search phrase =', search_phrase)
name = search_phrase.split()[0]
json_file_root = name + '/' + name
os.makedirs(os.path.dirname(json_file_root), exist_ok=True)
read_IDs = False
if max_days_old - min_days_old == 1:
d = dt.datetime.now() - dt.timedelta(days=min_days_old)
day = '{0}-{1:0>2}-{2:0>2}'.format(d.year, d.month, d.day)
else:
d1 = dt.datetime.now() - dt.timedelta(days=max_days_old-1)
d2 = dt.datetime.now() - dt.timedelta(days=min_days_old)
day = '{0}-{1:0>2}-{2:0>2}_to_{3}-{4:0>2}-{5:0>2}'.format(
d1.year, d1.month, d1.day, d2.year, d2.month, d2.day)
json_file = json_file_root + '_' + day + '.json'
if os.path.isfile(json_file):
print('Appending tweets to file named: ',json_file)
read_IDs = True
api = load_api()
if read_IDs:
with open(json_file, 'r') as f:
lines = f.readlines()
max_id = json.loads(lines[-1])['id']
print('Searching from the bottom ID in file')
else:
if min_days_old == 0:
max_id = -1
else:
max_id = get_tweet_id(api, days_ago=(min_days_old-1))
since_id = get_tweet_id(api, days_ago=(max_days_old-1))
print('max id (starting point) =', max_id)
print('since id (ending point) =', since_id)
start = dt.datetime.now()
end = start + dt.timedelta(hours=time_limit)
count, exitcount = 0, 0
while dt.datetime.now() < end:
count += 1
print('count =',count)
tweets, max_id = tweet_search(api, search_phrase, max_tweets,
max_id=max_id, since_id=since_id,
geocode=USA)
if tweets:
write_tweets(tweets, json_file)
exitcount = 0
else:
exitcount += 1
if exitcount == 3:
if search_phrase == search_phrases[-1]:
sys.exit('Maximum number of empty tweet strings reached - exiting')
else:
print('Maximum number of empty tweet strings reached - breaking')
break
if __name__ == "__main__":
main()