From cd51e48906f041f3cb13352b99a33d993c885688 Mon Sep 17 00:00:00 2001 From: Jeremy Foote Date: Mon, 13 Apr 2020 12:28:02 -0400 Subject: [PATCH 1/3] Initial attempt to get wayback urls into SERP data. --- search_engine_results/.gitignore | 2 + search_engine_results/add_wayback_urls.py | 464 ++++++++++++++++++++++ 2 files changed, 466 insertions(+) create mode 100644 search_engine_results/.gitignore create mode 100644 search_engine_results/add_wayback_urls.py diff --git a/search_engine_results/.gitignore b/search_engine_results/.gitignore new file mode 100644 index 0000000..382ddaa --- /dev/null +++ b/search_engine_results/.gitignore @@ -0,0 +1,2 @@ +__pycache__/ +config.py diff --git a/search_engine_results/add_wayback_urls.py b/search_engine_results/add_wayback_urls.py new file mode 100644 index 0000000..cf8fef3 --- /dev/null +++ b/search_engine_results/add_wayback_urls.py @@ -0,0 +1,464 @@ +#!/usr/bin/env python +# coding: utf-8 + +import pathlib +import pprint +import json +import re +import requests +import time +import config +import argparse +from collections import Counter +import logging +import urllib.parse +import csv + + +ENDPT = 'https://web.archive.org/save/' +UA_STRING = config.UA_STRING +ACCESS_KEY = config.ACCESS_KEY +SECRET_KEY = config.SECRET_KEY +HEADERS = {'Accept':'application/json', + 'User-Agent': UA_STRING, + 'Authorization': f'LOW {ACCESS_KEY}:{SECRET_KEY}'} +IF_NOT_ARCHIVED_WITHIN = '20h' # If an archive has been made in this long, don't make another one + + +######## +# The goal of this program is to take all of the URLs from SERPs and archive them in the Wayback Machine, +# and then store the Wayback URLs as part of the SERP metadata. +# +# There is a lot of overlap in URLs so we store previous results in temporary files so we don't repeat the same calls +# +# Example usage: +# python3 add_wayback_urls.py -i /path/to/serps/dir -t ./tmp --ignore_self_links # Note that this would overwrite the json files in the /path/to/serps/dir directories +######## + + +def main(): + parser = argparse.ArgumentParser(description='Add wayback URLs to SERP metadata.') + parser.add_argument('-i', help='Input directory with metadata files') + parser.add_argument('-o', help='Location to save modified files (if blank, will overwrite)') + ## TODO: Maybe switch this so default is to ignore? + parser.add_argument('--ignore_self_links', help='Whether to ignore links from the same domain as the query', + action='store_true') + parser.add_argument('-t', help='Temp directory location (to save/load job ids, outlinks, and wayback URLs)') + + args = parser.parse_args() + + + # Make a list of the files that we are going to be editing (skip those already edited) + files = pathlib.Path(args.i).glob('**/*.json') + ## FOR TESTING ONLY!!! + #files = list(files)[10:11] + incomplete_files = list(files) + while len(incomplete_files) > 0: + for fn in incomplete_files: + try: + add_wayback_urls(fn, args.o, args.t, args.ignore_self_links) + incomplete_files.pop(incomplete_files.index(fn)) # if it works, remove it from the list + except ConnectionError: + failed_files.append(fn) + + +def add_wayback_urls(filename, out_dir, temp_dir, ignore_self_links = False, remove_cache = True): + + def output_file_exists(f): + '''Says whether the output file for a given file exists, so that we can skip it if it's done. + Assumes that if the input file and the output file are the same, we want to overwrite the input file. + We therefore return False in this case''' + out_path = get_out_path(f, out_dir) + if out_path == f: + return False + return out_path.exists() + + + def write_wayback_to_file(filename, temp_file): + url_to_wb = {} + with open(filename, 'r') as f: + with open(temp_file, 'a') as tf: + tf_csv = csv.writer(tf) + j_obj = json.load(f) + query_url = j_obj['link'] + try: + wayback_url = wayback_dict[query_url].get_wayback_url() + j_obj['wayback_url'] = wayback_url + tf_csv.writerow([query_url, wayback_url]) + except KeyError: + logging.error(f"Should have an entry for {query_url}") + logging.error(wayback_dict.keys()) + j_obj['wayback_url'] = '' + for link_obj in j_obj['linkElements']: + link_url = link_obj['href'] + try: + wayback_url = wayback_dict[link_url].get_wayback_url() + link_obj['wayback_url'] = wayback_url + tf_csv.writerow([link_url, wayback_url]) + except KeyError: + if link_url in urls_to_archive: + # If it's in the urls to archive, then it should be in the dictionary. + logging.error(f"Should have an entry for {link_url}") + logging.error(wayback_dict.keys()) + logging.error(link_obj['href']) + link_obj['wayback_url'] = '' + outfile = get_out_path(filename, out_dir) + with open(outfile, 'w') as f: + json.dump(j_obj, f) + + tmp_dir = pathlib.Path(temp_dir) + if not tmp_dir.exists(): + tmp_dir.mkdir() + + # Query urls are the SERP query URLs; we get all of the outgoing links for these, to + # hopefully avoid duplication and having too many active jobs + query_urls =[] + urls_to_archive = [] + urls_to_skip = get_skipped_urls(temp_dir) # Skip these unless they appear > once + # First read the files and create a list of URLs to archive + with open(filename, 'r') as f: + j_obj = json.load(f) + # If this file already has wayback info, or if the output file has been created, + # the skip it + if output_file_exists(filename) or 'wayback_url' in j_obj: + return None + query_url, other_urls = get_urls_from_json(j_obj) + query_urls.append(query_url) + # Remove self links + if ignore_self_links: + domain = get_domain(query_url) + else: + domain = None + to_archive, to_skip = filter_urls(other_urls, domain, remove_cache) + print(to_archive) + print(to_skip) + urls_to_archive += to_archive + urls_to_skip += to_skip + + # For the URLs that we would otherwise skip, grab them if they occur + # more than once. Write the rest back to the temp file to check next time + with open(temp_dir + '/skipped_urls.csv', 'w') as tf: + f = csv.writer(tf) + for url, occurrences in Counter(urls_to_skip).items(): + if occurrences > 1: + logging.info(f"{url} appears {occurrences} times. Adding to archive list") + urls_to_archive.append(url) + else: + f.writerow([url]) + + # Get the URLS from the wayback APIs + wayback_dict = get_wayback_urls(query_urls, urls_to_archive, temp_dir) + write_wayback_to_file(filename, temp_dir + '/wayback_urls.csv') + + +def get_skipped_urls(temp_dir): + result = [] + if pathlib.Path(temp_dir + '/skipped_urls.csv').exists(): + with open(temp_dir + '/skipped_urls.csv', 'r') as f: + for row in f: + result.append(row) + return result + +def get_out_path(fp, out_dir): + '''Assumes that we want to keep the directory and the file name''' + if out_dir == None: + return fp + else: + new_path = pathlib.Path(out_dir).joinpath(*fp.parts[-2:]) + if not new_path.parent.exists(): + new_path.parent.mkdir(parents = True) + return new_path + +def dict_from_temp(temp_file): + result = {} + if pathlib.Path(temp_file).exists(): + with open(temp_file, 'r') as fn: + f = csv.reader(fn) + for row in f: + result[row[0]] = row[1] + return result + +def get_wayback_urls(query_urls, urls_to_archive, temp_dir): + ''' + Takes in two lists of urls. The first are the URLs of the SERPS. + For these, we add a flag to the API to create archives of all outgoing links. + This should include many of the same links that we gathered, thus reducing the number + of calls that we need to make. + + Returns a dictionary of URLs and job ids which can be used to get the + archive.org URLs + ''' + + # Get job_ids and wayback_urls from the temp file + job_ids = dict_from_temp(temp_dir + '/job_ids.csv' ) + wayback_urls = dict_from_temp(temp_dir + '/wayback_urls.csv' ) + + # And save them as a class attributes + URLObj.job_ids = job_ids + URLObj.wayback_urls = wayback_urls + + # First, we need to get the job ids + url_obj_dict = {} + # Start with the query urls and get their job ids + print("Archiving {} query URLS".format(len(query_urls))), + + with open(temp_dir + '/job_ids.csv', 'a') as f: + temp = csv.writer(f) + i = 0 + for url in set(query_urls): + # Create a URL object + url_obj = URLObj(url, is_seed=True) + url_obj_dict[url] = url_obj + url_obj.archive_url() + # Save the url and job id to the temp file + if url not in URLObj.job_ids: + temp.writerow([url_obj.url, url_obj.job_id]) + i += 1 + if i % 100 == 0: + print(f"Archived {i} URLS") + + # Then, get the outlinks for each of the query urls + # We do this in stages, so that there is more time to finish the archiving, + # instead of waiting for each one sequentially. + i = 0 + for url_obj in url_obj_dict.values(): + i += 1 + if i % 100 == 0: + print(f"Got outlinks for {i} URLS") + curr_outlinks = url_obj.get_outlinks() + if curr_outlinks is None: + continue + for out_url, out_job_id in curr_outlinks: + # Use the same encoding that we'll use on the urls to archive, to make matching + # more likely + # More could be done here (e.g., removing parameters from URLs) + out_url = urlencode_url(out_url) + if out_url not in URLObj.job_ids: + # Save it to the class dictionary + URLObj.job_ids[out_url] = out_job_id + # And also to the temp file + temp.writerow([out_url, out_job_id]) + + # Next, created instances and get the job ids for the URLs retrieved in the SERPs. Hopefully we will already have + # some of these from the outlinks + print("Archiving {} result URLS".format(len(urls_to_archive))) + i = 0 + for url in set(urls_to_archive): + i += 1 + if i % 100 == 0: + print(f"Archived {i} URLS") + if url in url_obj_dict: + continue + url_obj = URLObj(url) + url_obj_dict[url] = url_obj + url_obj.archive_url() + if url not in URLObj.job_ids: + temp.writerow([url_obj.url, url_obj.job_id]) + + return url_obj_dict + +def get_domain(url): + domain = re.search('^https://www.(\w+\.\w+)', url).groups()[0] + if not domain: + raise ValueError("Can't find URL in {url}") + return domain + +def filter_urls(urls, domain, remove_cache): + ''' + Separates urls into results and self-links, based on the domain. + Skips items from the two caches: + webcache.googleusercontent.com + https://cc.bingj + ''' + cache_regex = r'https://webcache.googleusercontent.com|https://cc.bingj.com' + result = [] + self_links = [] + for url in urls: + if remove_cache == True: + if re.match(cache_regex, url): + continue + if re.match(f'https?://\w+\.?{domain}', url): + self_links.append(url) + else: + result.append(url) + return (result, self_links) + +def get_urls_from_json(j_obj): + '''Takes a JSON object and extracts the correct URLs; returns them in a list.''' + query_url = j_obj['link'] + result = [] + + for x in j_obj['linkElements']: + url = x['href'] + if re.match('javascript', url) or url == '': + continue + result.append(url) + return (query_url, result) + +def urlencode_url(url): + return requests.utils.requote_uri(urllib.parse.unquote_plus(url)) + + +class URLObj: + + def __init__(self, url, is_seed = False): + self.orig_url = url + self.url = urlencode_url(url) + self.job_id = self.check_for_job_id() + self.wayback_url = self.check_for_wayback_url() + self.is_seed = is_seed + self.status_attempts = 0 + self.archive_attempts = 0 + + def check_for_job_id(self): + for url in [self.url, self.orig_url]: + if url in self.job_ids: # Check in class variable list of job ids + return self.job_ids[url] + + + def check_for_wayback_url(self): + for url in [self.url, self.orig_url]: + if url in self.wayback_urls: # Check in class variable list of job ids + return self.wayback_urls[url] + + def _call_status_url(self, + wait = 2, # Initial wait time + max_wait = 7 # Stop when wait time between calls hits max_wait + ): + '''Helper function to handle the call to the status API''' + job_id = self.job_id + if job_id is None: + return None + s = requests.get(ENDPT + 'status/' + job_id, headers=HEADERS) + if s.status_code == 200: + s_json = s.json() + if s_json['status'] == 'pending': + if wait > max_wait: + logging.debug(s_json) + if self.status_attempts == 2: + logging.warning(f"The call to get the status of '{self.url}' with job id {self.job_id} failed three times. Skipping") + return None + self.status_attempts += 1 + self.job_id = None # Get new job id and try again + self.archive_url() + return self._call_status_url() + logging.info(f'Pending, now waiting for {wait:.2f} seconds') + time.sleep(wait) + return self._call_status_url(wait = wait * 1.2) + if s_json['status'] == 'success': + return s_json + if s_json['status'] == 'error': + logging.error('Could not get status, with error: {}'.format(s_json["message"])) + return None + else: + logging.warning(s_json) + raise ValueError("Status was unexpected") + ## TODO: This error handling is horrible and is duplicated across the two calls. + ## I know there is a much better way to do this but I don't know what it is :) + if s.status_code == 429: + logging.info(f'Hit rate limit, now waiting for {wait} seconds') + time.sleep(wait) + return self._call_status_url(wait = wait * 1.2) # Backoff + if s.status_code in [104,502,503,504,443,401]: + if s.status_code == 443 or s.status_code ==401: + self.status_attempts += 1 + logging.warning('443, 502, 503, or 504 status received; waiting 30 seconds') + logging.warning(s.text) + time.sleep(30) + return self._call_status_url(wait) + else: + s.raise_for_status() + + def get_outlinks(self): + logging.info(f"Getting outlinks for {self.url} with job id {self.job_id}") + job_id = self.job_id + s_json = self._call_status_url() + if s_json is None: + return [] + if 'original_job_id' in s_json: + self.job_id = s_json['original_job_id'] + return self.get_outlinks() + + try: + return s_json['outlinks'].items() + except KeyError: + logging.warning(f"No outlinks for {self.url} but they were expected") + return [] + except AttributeError: + logging.info(f"Earlier job ({self.job_id}) didn't request outlinks for {self.url}") + return [] + + + def get_wayback_url(self): + if not self.wayback_url: + self._retrieve_wayback_url() + return self.wayback_url + + + def _retrieve_wayback_url(self): + logging.info(f"Getting wayback URL for {self.url} with job id {self.job_id}") + if not self.job_id: + self.archive_url() + job_id = self.job_id + s_json = self._call_status_url() + if s_json is None: + self.wayback_url = '' + return None + try: + self.wayback_url = 'http://web.archive.org/web/{}/{}'.format(s_json['timestamp'], + s_json['original_url']) + except KeyError: + logging.error(f"Missing timestamp or original URL for {job_id}") + self.wayback_url = None + + def archive_url(self, + wait = 2, + capture_screenshot = 0 # Whether to capture a screenshot (default is no) + ): + '''Archive the url in self.url and store the job_id in self.job_id''' + + + # If it already exists, then there's nothing to do + if self.job_id is not None: + logging.info(f'Job id already exists for {self.orig_url}') + return None + + logging.info(f"Archiving {self.orig_url}") + # If this is a query URL / seed URL, then capture outlinks + capture_outlinks = 1 if self.is_seed else 0 + if self.status_attempts == 1: # If we've already tried 2 times, then try w/o outlinks + capture_outlinks = 0 + + + payload = {'url': self.url, + 'if_not_archived_within' : IF_NOT_ARCHIVED_WITHIN, + 'capture_screenshot': capture_screenshot, + 'capture_outlinks': capture_outlinks + } + r = requests.post(ENDPT, headers=HEADERS, data=payload) + logging.debug(r.content) + + if r.status_code == 429: + logging.info(f'Hit rate limit, now waiting for {wait:.2f} seconds') + time.sleep(wait) + return self.archive_url(wait = wait * 1.2) + if r.status_code in [104,502,503,504,443,401]: + if s.status_code in [104, 401, 443]: + self.archive_attempts += 1 + if self.archive_attempts > 3: + return None + logging.warning(self.url) + logging.warning(r.text) + logging.warning('502 or 503 or 504 status received; waiting 30 seconds') + time.sleep(30) + return self.archive_url() + r.raise_for_status() + self.job_id = r.json()['job_id'] + + + + +if __name__ == '__main__': + logging.basicConfig(level=logging.INFO) + main() + From 8eaa486b3416f0883433a343e15d836e94f6bbc3 Mon Sep 17 00:00:00 2001 From: Jeremy Foote Date: Mon, 27 Apr 2020 15:05:28 -0400 Subject: [PATCH 2/3] Breaking wayback url retrieval into multiple parts. Worked on test set. Going to test on kibo now --- search_engine_results/add_wayback_urls.py | 464 ------------------ .../archive_urls-checkpoint.ipynb | 242 +++++++++ .../wayback_urls/add_wayback_urls.py | 86 ++++ .../wayback_urls/archive_urls.py | 212 ++++++++ .../wayback_urls/get_urls.py | 135 +++++ .../wayback_urls/wayback_urls.csv | 0 6 files changed, 675 insertions(+), 464 deletions(-) delete mode 100644 search_engine_results/add_wayback_urls.py create mode 100644 search_engine_results/wayback_urls/.ipynb_checkpoints/archive_urls-checkpoint.ipynb create mode 100644 search_engine_results/wayback_urls/add_wayback_urls.py create mode 100644 search_engine_results/wayback_urls/archive_urls.py create mode 100644 search_engine_results/wayback_urls/get_urls.py create mode 100644 search_engine_results/wayback_urls/wayback_urls.csv diff --git a/search_engine_results/add_wayback_urls.py b/search_engine_results/add_wayback_urls.py deleted file mode 100644 index cf8fef3..0000000 --- a/search_engine_results/add_wayback_urls.py +++ /dev/null @@ -1,464 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -import pathlib -import pprint -import json -import re -import requests -import time -import config -import argparse -from collections import Counter -import logging -import urllib.parse -import csv - - -ENDPT = 'https://web.archive.org/save/' -UA_STRING = config.UA_STRING -ACCESS_KEY = config.ACCESS_KEY -SECRET_KEY = config.SECRET_KEY -HEADERS = {'Accept':'application/json', - 'User-Agent': UA_STRING, - 'Authorization': f'LOW {ACCESS_KEY}:{SECRET_KEY}'} -IF_NOT_ARCHIVED_WITHIN = '20h' # If an archive has been made in this long, don't make another one - - -######## -# The goal of this program is to take all of the URLs from SERPs and archive them in the Wayback Machine, -# and then store the Wayback URLs as part of the SERP metadata. -# -# There is a lot of overlap in URLs so we store previous results in temporary files so we don't repeat the same calls -# -# Example usage: -# python3 add_wayback_urls.py -i /path/to/serps/dir -t ./tmp --ignore_self_links # Note that this would overwrite the json files in the /path/to/serps/dir directories -######## - - -def main(): - parser = argparse.ArgumentParser(description='Add wayback URLs to SERP metadata.') - parser.add_argument('-i', help='Input directory with metadata files') - parser.add_argument('-o', help='Location to save modified files (if blank, will overwrite)') - ## TODO: Maybe switch this so default is to ignore? - parser.add_argument('--ignore_self_links', help='Whether to ignore links from the same domain as the query', - action='store_true') - parser.add_argument('-t', help='Temp directory location (to save/load job ids, outlinks, and wayback URLs)') - - args = parser.parse_args() - - - # Make a list of the files that we are going to be editing (skip those already edited) - files = pathlib.Path(args.i).glob('**/*.json') - ## FOR TESTING ONLY!!! - #files = list(files)[10:11] - incomplete_files = list(files) - while len(incomplete_files) > 0: - for fn in incomplete_files: - try: - add_wayback_urls(fn, args.o, args.t, args.ignore_self_links) - incomplete_files.pop(incomplete_files.index(fn)) # if it works, remove it from the list - except ConnectionError: - failed_files.append(fn) - - -def add_wayback_urls(filename, out_dir, temp_dir, ignore_self_links = False, remove_cache = True): - - def output_file_exists(f): - '''Says whether the output file for a given file exists, so that we can skip it if it's done. - Assumes that if the input file and the output file are the same, we want to overwrite the input file. - We therefore return False in this case''' - out_path = get_out_path(f, out_dir) - if out_path == f: - return False - return out_path.exists() - - - def write_wayback_to_file(filename, temp_file): - url_to_wb = {} - with open(filename, 'r') as f: - with open(temp_file, 'a') as tf: - tf_csv = csv.writer(tf) - j_obj = json.load(f) - query_url = j_obj['link'] - try: - wayback_url = wayback_dict[query_url].get_wayback_url() - j_obj['wayback_url'] = wayback_url - tf_csv.writerow([query_url, wayback_url]) - except KeyError: - logging.error(f"Should have an entry for {query_url}") - logging.error(wayback_dict.keys()) - j_obj['wayback_url'] = '' - for link_obj in j_obj['linkElements']: - link_url = link_obj['href'] - try: - wayback_url = wayback_dict[link_url].get_wayback_url() - link_obj['wayback_url'] = wayback_url - tf_csv.writerow([link_url, wayback_url]) - except KeyError: - if link_url in urls_to_archive: - # If it's in the urls to archive, then it should be in the dictionary. - logging.error(f"Should have an entry for {link_url}") - logging.error(wayback_dict.keys()) - logging.error(link_obj['href']) - link_obj['wayback_url'] = '' - outfile = get_out_path(filename, out_dir) - with open(outfile, 'w') as f: - json.dump(j_obj, f) - - tmp_dir = pathlib.Path(temp_dir) - if not tmp_dir.exists(): - tmp_dir.mkdir() - - # Query urls are the SERP query URLs; we get all of the outgoing links for these, to - # hopefully avoid duplication and having too many active jobs - query_urls =[] - urls_to_archive = [] - urls_to_skip = get_skipped_urls(temp_dir) # Skip these unless they appear > once - # First read the files and create a list of URLs to archive - with open(filename, 'r') as f: - j_obj = json.load(f) - # If this file already has wayback info, or if the output file has been created, - # the skip it - if output_file_exists(filename) or 'wayback_url' in j_obj: - return None - query_url, other_urls = get_urls_from_json(j_obj) - query_urls.append(query_url) - # Remove self links - if ignore_self_links: - domain = get_domain(query_url) - else: - domain = None - to_archive, to_skip = filter_urls(other_urls, domain, remove_cache) - print(to_archive) - print(to_skip) - urls_to_archive += to_archive - urls_to_skip += to_skip - - # For the URLs that we would otherwise skip, grab them if they occur - # more than once. Write the rest back to the temp file to check next time - with open(temp_dir + '/skipped_urls.csv', 'w') as tf: - f = csv.writer(tf) - for url, occurrences in Counter(urls_to_skip).items(): - if occurrences > 1: - logging.info(f"{url} appears {occurrences} times. Adding to archive list") - urls_to_archive.append(url) - else: - f.writerow([url]) - - # Get the URLS from the wayback APIs - wayback_dict = get_wayback_urls(query_urls, urls_to_archive, temp_dir) - write_wayback_to_file(filename, temp_dir + '/wayback_urls.csv') - - -def get_skipped_urls(temp_dir): - result = [] - if pathlib.Path(temp_dir + '/skipped_urls.csv').exists(): - with open(temp_dir + '/skipped_urls.csv', 'r') as f: - for row in f: - result.append(row) - return result - -def get_out_path(fp, out_dir): - '''Assumes that we want to keep the directory and the file name''' - if out_dir == None: - return fp - else: - new_path = pathlib.Path(out_dir).joinpath(*fp.parts[-2:]) - if not new_path.parent.exists(): - new_path.parent.mkdir(parents = True) - return new_path - -def dict_from_temp(temp_file): - result = {} - if pathlib.Path(temp_file).exists(): - with open(temp_file, 'r') as fn: - f = csv.reader(fn) - for row in f: - result[row[0]] = row[1] - return result - -def get_wayback_urls(query_urls, urls_to_archive, temp_dir): - ''' - Takes in two lists of urls. The first are the URLs of the SERPS. - For these, we add a flag to the API to create archives of all outgoing links. - This should include many of the same links that we gathered, thus reducing the number - of calls that we need to make. - - Returns a dictionary of URLs and job ids which can be used to get the - archive.org URLs - ''' - - # Get job_ids and wayback_urls from the temp file - job_ids = dict_from_temp(temp_dir + '/job_ids.csv' ) - wayback_urls = dict_from_temp(temp_dir + '/wayback_urls.csv' ) - - # And save them as a class attributes - URLObj.job_ids = job_ids - URLObj.wayback_urls = wayback_urls - - # First, we need to get the job ids - url_obj_dict = {} - # Start with the query urls and get their job ids - print("Archiving {} query URLS".format(len(query_urls))), - - with open(temp_dir + '/job_ids.csv', 'a') as f: - temp = csv.writer(f) - i = 0 - for url in set(query_urls): - # Create a URL object - url_obj = URLObj(url, is_seed=True) - url_obj_dict[url] = url_obj - url_obj.archive_url() - # Save the url and job id to the temp file - if url not in URLObj.job_ids: - temp.writerow([url_obj.url, url_obj.job_id]) - i += 1 - if i % 100 == 0: - print(f"Archived {i} URLS") - - # Then, get the outlinks for each of the query urls - # We do this in stages, so that there is more time to finish the archiving, - # instead of waiting for each one sequentially. - i = 0 - for url_obj in url_obj_dict.values(): - i += 1 - if i % 100 == 0: - print(f"Got outlinks for {i} URLS") - curr_outlinks = url_obj.get_outlinks() - if curr_outlinks is None: - continue - for out_url, out_job_id in curr_outlinks: - # Use the same encoding that we'll use on the urls to archive, to make matching - # more likely - # More could be done here (e.g., removing parameters from URLs) - out_url = urlencode_url(out_url) - if out_url not in URLObj.job_ids: - # Save it to the class dictionary - URLObj.job_ids[out_url] = out_job_id - # And also to the temp file - temp.writerow([out_url, out_job_id]) - - # Next, created instances and get the job ids for the URLs retrieved in the SERPs. Hopefully we will already have - # some of these from the outlinks - print("Archiving {} result URLS".format(len(urls_to_archive))) - i = 0 - for url in set(urls_to_archive): - i += 1 - if i % 100 == 0: - print(f"Archived {i} URLS") - if url in url_obj_dict: - continue - url_obj = URLObj(url) - url_obj_dict[url] = url_obj - url_obj.archive_url() - if url not in URLObj.job_ids: - temp.writerow([url_obj.url, url_obj.job_id]) - - return url_obj_dict - -def get_domain(url): - domain = re.search('^https://www.(\w+\.\w+)', url).groups()[0] - if not domain: - raise ValueError("Can't find URL in {url}") - return domain - -def filter_urls(urls, domain, remove_cache): - ''' - Separates urls into results and self-links, based on the domain. - Skips items from the two caches: - webcache.googleusercontent.com - https://cc.bingj - ''' - cache_regex = r'https://webcache.googleusercontent.com|https://cc.bingj.com' - result = [] - self_links = [] - for url in urls: - if remove_cache == True: - if re.match(cache_regex, url): - continue - if re.match(f'https?://\w+\.?{domain}', url): - self_links.append(url) - else: - result.append(url) - return (result, self_links) - -def get_urls_from_json(j_obj): - '''Takes a JSON object and extracts the correct URLs; returns them in a list.''' - query_url = j_obj['link'] - result = [] - - for x in j_obj['linkElements']: - url = x['href'] - if re.match('javascript', url) or url == '': - continue - result.append(url) - return (query_url, result) - -def urlencode_url(url): - return requests.utils.requote_uri(urllib.parse.unquote_plus(url)) - - -class URLObj: - - def __init__(self, url, is_seed = False): - self.orig_url = url - self.url = urlencode_url(url) - self.job_id = self.check_for_job_id() - self.wayback_url = self.check_for_wayback_url() - self.is_seed = is_seed - self.status_attempts = 0 - self.archive_attempts = 0 - - def check_for_job_id(self): - for url in [self.url, self.orig_url]: - if url in self.job_ids: # Check in class variable list of job ids - return self.job_ids[url] - - - def check_for_wayback_url(self): - for url in [self.url, self.orig_url]: - if url in self.wayback_urls: # Check in class variable list of job ids - return self.wayback_urls[url] - - def _call_status_url(self, - wait = 2, # Initial wait time - max_wait = 7 # Stop when wait time between calls hits max_wait - ): - '''Helper function to handle the call to the status API''' - job_id = self.job_id - if job_id is None: - return None - s = requests.get(ENDPT + 'status/' + job_id, headers=HEADERS) - if s.status_code == 200: - s_json = s.json() - if s_json['status'] == 'pending': - if wait > max_wait: - logging.debug(s_json) - if self.status_attempts == 2: - logging.warning(f"The call to get the status of '{self.url}' with job id {self.job_id} failed three times. Skipping") - return None - self.status_attempts += 1 - self.job_id = None # Get new job id and try again - self.archive_url() - return self._call_status_url() - logging.info(f'Pending, now waiting for {wait:.2f} seconds') - time.sleep(wait) - return self._call_status_url(wait = wait * 1.2) - if s_json['status'] == 'success': - return s_json - if s_json['status'] == 'error': - logging.error('Could not get status, with error: {}'.format(s_json["message"])) - return None - else: - logging.warning(s_json) - raise ValueError("Status was unexpected") - ## TODO: This error handling is horrible and is duplicated across the two calls. - ## I know there is a much better way to do this but I don't know what it is :) - if s.status_code == 429: - logging.info(f'Hit rate limit, now waiting for {wait} seconds') - time.sleep(wait) - return self._call_status_url(wait = wait * 1.2) # Backoff - if s.status_code in [104,502,503,504,443,401]: - if s.status_code == 443 or s.status_code ==401: - self.status_attempts += 1 - logging.warning('443, 502, 503, or 504 status received; waiting 30 seconds') - logging.warning(s.text) - time.sleep(30) - return self._call_status_url(wait) - else: - s.raise_for_status() - - def get_outlinks(self): - logging.info(f"Getting outlinks for {self.url} with job id {self.job_id}") - job_id = self.job_id - s_json = self._call_status_url() - if s_json is None: - return [] - if 'original_job_id' in s_json: - self.job_id = s_json['original_job_id'] - return self.get_outlinks() - - try: - return s_json['outlinks'].items() - except KeyError: - logging.warning(f"No outlinks for {self.url} but they were expected") - return [] - except AttributeError: - logging.info(f"Earlier job ({self.job_id}) didn't request outlinks for {self.url}") - return [] - - - def get_wayback_url(self): - if not self.wayback_url: - self._retrieve_wayback_url() - return self.wayback_url - - - def _retrieve_wayback_url(self): - logging.info(f"Getting wayback URL for {self.url} with job id {self.job_id}") - if not self.job_id: - self.archive_url() - job_id = self.job_id - s_json = self._call_status_url() - if s_json is None: - self.wayback_url = '' - return None - try: - self.wayback_url = 'http://web.archive.org/web/{}/{}'.format(s_json['timestamp'], - s_json['original_url']) - except KeyError: - logging.error(f"Missing timestamp or original URL for {job_id}") - self.wayback_url = None - - def archive_url(self, - wait = 2, - capture_screenshot = 0 # Whether to capture a screenshot (default is no) - ): - '''Archive the url in self.url and store the job_id in self.job_id''' - - - # If it already exists, then there's nothing to do - if self.job_id is not None: - logging.info(f'Job id already exists for {self.orig_url}') - return None - - logging.info(f"Archiving {self.orig_url}") - # If this is a query URL / seed URL, then capture outlinks - capture_outlinks = 1 if self.is_seed else 0 - if self.status_attempts == 1: # If we've already tried 2 times, then try w/o outlinks - capture_outlinks = 0 - - - payload = {'url': self.url, - 'if_not_archived_within' : IF_NOT_ARCHIVED_WITHIN, - 'capture_screenshot': capture_screenshot, - 'capture_outlinks': capture_outlinks - } - r = requests.post(ENDPT, headers=HEADERS, data=payload) - logging.debug(r.content) - - if r.status_code == 429: - logging.info(f'Hit rate limit, now waiting for {wait:.2f} seconds') - time.sleep(wait) - return self.archive_url(wait = wait * 1.2) - if r.status_code in [104,502,503,504,443,401]: - if s.status_code in [104, 401, 443]: - self.archive_attempts += 1 - if self.archive_attempts > 3: - return None - logging.warning(self.url) - logging.warning(r.text) - logging.warning('502 or 503 or 504 status received; waiting 30 seconds') - time.sleep(30) - return self.archive_url() - r.raise_for_status() - self.job_id = r.json()['job_id'] - - - - -if __name__ == '__main__': - logging.basicConfig(level=logging.INFO) - main() - diff --git a/search_engine_results/wayback_urls/.ipynb_checkpoints/archive_urls-checkpoint.ipynb b/search_engine_results/wayback_urls/.ipynb_checkpoints/archive_urls-checkpoint.ipynb new file mode 100644 index 0000000..198e1a4 --- /dev/null +++ b/search_engine_results/wayback_urls/.ipynb_checkpoints/archive_urls-checkpoint.ipynb @@ -0,0 +1,242 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "#!/usr/bin/env python\n", + "# coding: utf-8\n", + "\n", + "import pathlib\n", + "import pprint\n", + "import json\n", + "import re\n", + "import requests\n", + "import time\n", + "import config\n", + "import argparse\n", + "import logging\n", + "import urllib.parse\n", + "import csv\n", + "from datetime import datetime\n", + "\n", + "\n", + "ENDPT = 'https://web.archive.org/save/'\n", + "UA_STRING = config.UA_STRING\n", + "ACCESS_KEY = config.ACCESS_KEY\n", + "SECRET_KEY = config.SECRET_KEY\n", + "HEADERS = {'Accept':'application/json',\n", + " 'User-Agent': UA_STRING,\n", + " 'Authorization': f'LOW {ACCESS_KEY}:{SECRET_KEY}'}\n", + "IF_NOT_ARCHIVED_WITHIN = '20h' # If an archive has been made in this long, don't make another one" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "def main():\n", + " parser = argparse.ArgumentParser(description='Creates job ')\n", + " parser.add_argument('-i', help='Input directory with metadata files')\n", + " parser.add_argument('-o', help='Location to save job id file')\n", + " ## TODO: Maybe switch this so default is to ignore?\n", + " parser.add_argument('--ignore_self_links', help='Whether to ignore links from the same domain as the query',\n", + " action='store_true')\n", + "\n", + " args = parser.parse_args()\n", + "\n", + " \n", + " # Make a list of the files that we are going to be editing (skip those already edited)\n", + " files = pathlib.Path(args.i).glob('**/*.json')\n", + " ## FOR TESTING ONLY!!!\n", + " #files = list(files)[10:11]\n", + " archive_files(files, args.o, args.ignore_self_links)\n", + " \n", + "def archive_files(files, output_file, ignore_self_links):\n", + " \n", + " \n", + " def get_urls_to_archive(fn):\n", + " '''Takes a file, gets the urls to archive, and passes them to the archive_url function'''\n", + " with open(filename, 'r') as f:\n", + " j_obj = json.load(f)\n", + " # Get the URLs from the file\n", + " query_url, link_urls = get_urls_from_json(j_obj)\n", + " # Filter out the self links and search engine cache urls\n", + " link_urls = filter_link_urls(query_url, link_urls)\n", + " \n", + " with open(output_file, 'w') as out_file:\n", + " f = csv.writer(out_file)\n", + " # Get outlinks for the query URL. This gets these jobs started early, so some will\n", + " # hopefully be done by the time we make the calls\n", + " query_job = archive_url(query_url, capture_outlinks=1)\n", + " store_job_id(f, query_url, query_job)\n", + " for url in link_urls:\n", + " job_id = archive_url(url)\n", + " store_job_id(f, url, job_id)\n", + " \n", + " def store_job_id(f, url, job_id):\n", + " '''Writes the result of an archive operation to a csv file (f) and the complete_urls dict'''\n", + " time = datetime.now()\n", + " f.writerow([time, url, job_id])\n", + " completed_urls[url] = job_id\n", + " \n", + " def filter_link_urls(query_url,\n", + " urls,\n", + " remove_cache=True):\n", + " '''\n", + " Takes link urls and filters them in three ways:\n", + " 1. (Optionally) Ignores urls from the two caches:\n", + " webcache.googleusercontent.com\n", + " https://cc.bingj\n", + " 2. Filters out those which are in the completed_urls dictionary\n", + " 3. (Optionally) Identifies URLs which have the same domain as the query URL.\n", + " Checks the skipped_urls list to see if the URL already appears there. If so, we assume\n", + " that we want it archived and move it from skipped to the to_archive list\n", + " '''\n", + " to_archive = []\n", + " if ignore_self_links:\n", + " domain = get_domain(query_url)\n", + " else:\n", + " domain = None\n", + " cache_regex = r'https://webcache.googleusercontent.com|https://cc.bingj.com'\n", + " for url in urls:\n", + " if url in completed_urls:\n", + " continue\n", + "\n", + " if remove_cache == True:\n", + " if re.match(cache_regex, url):\n", + " continue\n", + "\n", + " if ignore_self_links and re.match(f'https?://\\w*\\.?{domain}', url):\n", + " # If it matches, check if it's in skipped URLs\n", + " # If so, remove it from there, and add it to the to_archive list\n", + " if url in skipped_urls:\n", + " to_archive.append(url)\n", + " skipped_urls.remove(url)\n", + " # Else, add it to the skipped urls (and skip it)\n", + " else:\n", + " skipped_urls.append(url)\n", + " else:\n", + " to_archive.append(url)\n", + " return to_archive\n", + " \n", + " \n", + " completed_urls = dict_from_csv(output_file)\n", + " skipped_urls = []\n", + " attempts = 0\n", + " incomplete_files = list(files)\n", + " while len(incomplete_files) > 0:\n", + " if attempts == 3:\n", + " break\n", + " for fn in incomplete_files:\n", + " try:\n", + " archive_urls(fn)\n", + " incomplete_files.pop(incomplete_files.index(fn)) # if it works, remove it from the list\n", + " except ConnectionError:\n", + " failed_files.append(fn)\n", + " attempts += 1\n", + " logging.warn('Files that failed: {}'.format(incomplete_files))\n", + " time.sleep(30) # If something goes wrong, wait to see if it gets better :)\n", + "\n", + "\n", + "def dict_from_csv(csv_file):\n", + " result = {}\n", + " if pathlib.Path(csv_file).exists():\n", + " with open(csv_file, 'r') as fn:\n", + " f = csv.reader(fn)\n", + " for row in f:\n", + " result[row[0]] = row[1]\n", + " return result \n", + "\n", + " \n", + "def get_domain(url):\n", + " domain = re.search('^https://www.(\\w+\\.\\w+)', url).groups()[0]\n", + " if not domain:\n", + " raise ValueError(\"Can't find URL in {url}\")\n", + " return domain\n", + "\n", + "\n", + "\n", + "def get_urls_from_json(j_obj):\n", + " '''Takes a JSON object and extracts the correct URLs; returns them in a list.'''\n", + " query_url = urlencode_url(j_obj['link'])\n", + " link_urls = []\n", + " \n", + " for x in j_obj['linkElements']:\n", + " url = x['href']\n", + " if re.match('javascript', url) or url == '':\n", + " continue\n", + " link_urls.append(urlencode_url(url))\n", + " return (query_url, link_urls)\n", + " \n", + "def urlencode_url(url):\n", + " return requests.utils.requote_uri(urllib.parse.unquote_plus(url))\n", + "\n", + "def archive_url(url, \n", + " wait = 2, \n", + " capture_outlinks = 0 # Whether to capture outlinks (default is no)\n", + " ):\n", + "\n", + "\n", + "\n", + " payload = {'url': url,\n", + " 'if_not_archived_within' : IF_NOT_ARCHIVED_WITHIN,\n", + " #'capture_screenshot': capture_screenshot,\n", + " 'capture_outlinks': capture_outlinks\n", + " }\n", + " r = requests.post(ENDPT, headers=HEADERS, data=payload)\n", + " logging.debug(r.content)\n", + " print(f'Should have a valid job id for {url}. Instead, this was returned:\\n {r.content}')\n", + "\n", + " if r.status_code == 429:\n", + " logging.info(f'Hit rate limit, now waiting for {wait:.2f} seconds')\n", + " time.sleep(wait)\n", + " return archive_url(url = url,\n", + " wait = wait * 1.2, \n", + " capture_outlinks = capture_outlinks)\n", + " if r.status_code in [104,502,503,504,443,401]:\n", + " logging.warning(url)\n", + " logging.warning(r.text)\n", + " if r.status_code in [104, 401, 443]:\n", + " logging.warning(f'104, 401, or 443 received when archiving {url}. Giving up.')\n", + " return None\n", + " logging.warning('502 or 503 or 504 status received; waiting 30 seconds')\n", + " time.sleep(30)\n", + " return archive_url(url = url,\n", + " capture_outlinks = capture_outlinks)\n", + " \n", + " r.raise_for_status()\n", + " try:\n", + " return r.json()['job_id']\n", + " except KeyError:\n", + " logging.warning(f'Should have a valid job id for {url}. Instead, this was returned:\\n {r.content}')\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "py3", + "language": "python", + "name": "py3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/search_engine_results/wayback_urls/add_wayback_urls.py b/search_engine_results/wayback_urls/add_wayback_urls.py new file mode 100644 index 0000000..4e35d4b --- /dev/null +++ b/search_engine_results/wayback_urls/add_wayback_urls.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python +# coding: utf-8 + +import pathlib +import json +import re +import time +import argparse +import logging +import csv +import requests +import urllib + + + +def main(): + parser = argparse.ArgumentParser(description='Add wayback URLs to SERP metadata.') + parser.add_argument('-i', help='Input directory with metadata files') + parser.add_argument('-w', help = 'Location of file with wayback URLS') + parser.add_argument('-o', help='Directory to save modified files (if blank or same as input directory, will overwrite)') + + args = parser.parse_args() + + + # Make a list of the files that we are going to be editing (skip those already edited) + files = pathlib.Path(args.i).glob('**/*.json') + wayback_dict = load_wayback_dict(args.w) + for fn in files: + write_wayback_to_file(fn, args.o, wayback_dict) + + +def write_wayback_to_file(filename, out_dir, wayback_dict): + with open(filename, 'r') as f: + j_obj = json.load(f) + query_url = urlencode_url(j_obj['link']) + try: + wayback_url = wayback_dict[query_url] + j_obj['wayback_url'] = wayback_url + except KeyError: + logging.error(f"Should have an entry for {query_url}") + logging.error(wayback_dict.keys()) + j_obj['wayback_url'] = '' + for link_obj in j_obj['linkElements']: + link_url = urlencode_url(link_obj['href']) + if link_url == '': + try: + wayback_url = wayback_dict[link_url] + link_obj['wayback_url'] = wayback_url + except KeyError: + logging.info(f'No WB URL for {link_url}') + link_obj['wayback_url'] = '' + outfile = get_out_path(filename, out_dir) + with open(outfile, 'w') as f: + json.dump(j_obj, f) + + +def get_out_path(fp, out_dir): + '''Assumes that we want to keep the directory and the file name''' + if out_dir == None: + return fp + else: + new_path = pathlib.Path(out_dir).joinpath(*fp.parts[-2:]) + if not new_path.parent.exists(): + logging.warning(f"Creating new path at {new_path}") + new_path.parent.mkdir(parents = True) + return new_path + +def load_wayback_dict(fn): + '''Loads the waback URL file as a dictionary of {orig_url:wb_url}. Currently ignores + the timestamp, overwriting older WB URLs with newer ones''' + result = {} + if pathlib.Path(fn).exists(): + with open(fn, 'r') as f_obj: + f = csv.reader(f_obj) + for row in f: + result[row[1]] = row[2] + return result + + +def urlencode_url(url): + return requests.utils.requote_uri(urllib.parse.unquote_plus(url)) + +if __name__ == '__main__': + logging.basicConfig(level=logging.INFO) + main() + diff --git a/search_engine_results/wayback_urls/archive_urls.py b/search_engine_results/wayback_urls/archive_urls.py new file mode 100644 index 0000000..b0bacc6 --- /dev/null +++ b/search_engine_results/wayback_urls/archive_urls.py @@ -0,0 +1,212 @@ +#!/usr/bin/env python +# coding: utf-8 + +import pathlib +import json +import re +import requests +import time +import config +import argparse +import logging +import urllib.parse +import csv +import datetime + + +ENDPT = 'https://web.archive.org/save/' +UA_STRING = config.UA_STRING +ACCESS_KEY = config.ACCESS_KEY +SECRET_KEY = config.SECRET_KEY +HEADERS = {'Accept':'application/json', + 'User-Agent': UA_STRING, + 'Authorization': f'LOW {ACCESS_KEY}:{SECRET_KEY}'} +IF_NOT_ARCHIVED_WITHIN = '20h' # If an archive has been made in this long, don't make another one +CHUNK_SIZE = 15 # Get this many URLS at a time + + + +def main(): + parser = argparse.ArgumentParser(description='Creates job ids from to_archive csv file') + parser.add_argument('-i', help='Location of to_archive CSV file') + parser.add_argument('-o', help='Location to save wayback URL file') + args = parser.parse_args() + + + + def get_job_ids(urls, capture_outlinks): + for url in urls: + if url not in completed_urls: + job_id = archive_url(url, + capture_outlinks = capture_outlinks) + # Just put them into the job_id_tuples + job_id_tuples.append((url, job_id)) + else: + print(f'{url} was in completed') + + + + def get_wayback_urls(out_file): + for url, job_id in job_id_tuples: + try: + wb_url, timestamp = get_wayback_url(job_id) + write_wayback(out_file, url, wb_url, timestamp) + except ConnectionError: + logging.warning(f'{url} with job id {job_id} failed with a ConnectionError') + except TypeError: + logging.warning(f'{url} with job id {job_id} did not get a WB URL') + + + completed_urls = get_completed(args.o, time_string = IF_NOT_ARCHIVED_WITHIN) + to_archive = load_urls(args.i) + # Do query URLS first, since for them we'll capture outlinks + query_urls = [x for x in to_archive if to_archive[x] == 'query'] + link_urls = [x for x in to_archive if to_archive[x] == 'link'] + + with open(args.o, 'a') as out_file: + out = csv.writer(out_file) + job_id_tuples = [] # Stores which urls need to be retrieved (populated by get_job_ids) + get_job_ids(query_urls, capture_outlinks = 1) + get_wayback_urls(out) + for chunk in chunk_list(link_urls, CHUNK_SIZE): + job_id_tuples = [] + get_job_ids(chunk, capture_outlinks = 0) + get_wayback_urls(out) + + +def chunk_list(l, size): + for i in range(0, len(l), size): + logging.info(f'Now getting items {i} through {min(len(l), i + size)} of {len(l)}') + yield l[i:i+size] + +def load_urls(url_fn): + result = {} + with open(url_fn, 'r') as fn: + f = csv.reader(fn) + for row in f: + result[row[0]] = row[1] + return result + + +def write_wayback(f, url, wayback_url, timestamp): + '''Takes a CSV writer object, a url, and wayback_url, and writes + it out''' + f.writerow([timestamp,url,wayback_url]) + + + +def get_completed(csv_file, time_string): + '''Loads all of the completed URLs from the csv file. Takes in a time string like '20h', + strips the last character, and assumes that it refers to the number of hours. + Does not load any URLs older than that. + ''' + delta_hours = int(time_string[:-1]) + result = {} + if pathlib.Path(csv_file).exists(): + with open(csv_file, 'r') as fn: + f = csv.reader(fn) + for row in f: + dt = datetime.datetime.strptime(row[0], '%Y%m%d%H%M%S') + if datetime.datetime.now() - dt > datetime.timedelta(hours = delta_hours): + continue + else: + result[row[1]] = row[2] + return result + +def urlencode_url(url): + return requests.utils.requote_uri(urllib.parse.unquote_plus(url)) + +def archive_url(url, + wait = 2, + capture_outlinks = 0 # Whether to capture outlinks (default is no) + ): + + logging.info(f'Sending archive call for {url}') + payload = {'url': url, + 'if_not_archived_within' : IF_NOT_ARCHIVED_WITHIN, + #'capture_screenshot': capture_screenshot, + 'capture_outlinks': capture_outlinks + } + r = requests.post(ENDPT, headers=HEADERS, data=payload) + logging.debug(r.content) + + if r.status_code == 429: + logging.info(f'Hit rate limit, now waiting for {wait:.2f} seconds') + time.sleep(wait) + return archive_url(url = url, + wait = wait * 1.2, + capture_outlinks = capture_outlinks) + if r.status_code in [104,502,503,504,443,401]: + logging.warning(url) + logging.warning(r.text) + if r.status_code in [104, 401, 443]: + logging.warning(f'104, 401, or 443 received when archiving {url}. Giving up.') + return None + logging.warning('502 or 503 or 504 status received; waiting 30 seconds') + time.sleep(30) + return archive_url(url = url, + capture_outlinks = capture_outlinks) + + r.raise_for_status() + try: + return r.json()['job_id'] + except KeyError: + logging.warning(f'Should have a valid job id for {url}. Instead, this was returned:\n {r.content}') + + +def get_wayback_url(job_id): + + def call_status_url( + wait = 2, # Initial wait time + max_wait = 9 # Stop when wait time between calls hits max_wait + ): + '''Helper function to handle the call to the status API''' + if job_id is None: + return None + s = requests.get(ENDPT + 'status/' + job_id, headers=HEADERS) + if s.status_code == 200: + s_json = s.json() + if s_json['status'] == 'pending': + if wait > max_wait: + logging.debug(s_json) + logging.warning(f"The call to get the status of job id {job_id} failed. Skipping") + return None + logging.info(f'Pending, now waiting for {wait:.2f} seconds') + time.sleep(wait) + return call_status_url(wait = wait + 1) + if s_json['status'] == 'success': + return s_json + if s_json['status'] == 'error': + logging.error('Could not get status, with error: {}'.format(s_json["message"])) + return None + else: + logging.warning(s_json) + raise ValueError("Status was unexpected") + if s.status_code == 429: + logging.info(f'Hit rate limit, now waiting for {wait} seconds') + time.sleep(wait) + return call_status_url(wait = wait * 1.2) # Backoff + if s.status_code in [104,502,503,504,443,401]: + # These likely mean something's wrong; only try a few times + logging.warning('443, 502, 503, or 504 status received; waiting 30 seconds') + logging.warning(s.text) + time.sleep(30) + return call_status_url() + else: + s.raise_for_status() + + logging.info(f"Getting wayback URL for job id {job_id}") + s_json = call_status_url() + if s_json is None: + return None + try: + wayback_url = 'http://web.archive.org/web/{}/{}'.format(s_json['timestamp'], + s_json['original_url']) + return (wayback_url, s_json['timestamp']) + except KeyError: + logging.error(f"Missing timestamp or original URL for {job_id}") + return None + +if __name__ == '__main__': + logging.basicConfig(level=logging.INFO) + main() diff --git a/search_engine_results/wayback_urls/get_urls.py b/search_engine_results/wayback_urls/get_urls.py new file mode 100644 index 0000000..6806aea --- /dev/null +++ b/search_engine_results/wayback_urls/get_urls.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python +# coding: utf-8 + +import pathlib +import json +import re +import requests +import time +import config +import argparse +import logging +import urllib.parse +import csv +import datetime + + +def main(): + parser = argparse.ArgumentParser(description='Gets URLs to archive from SERP metadata files') + parser.add_argument('-i', help='Input directory with metadata files') + parser.add_argument('-o', help='Location to save URL list') + ## TODO: Maybe switch this so default is to ignore? + parser.add_argument('--ignore_self_links', help='Whether to ignore links from the same domain as the query', + action='store_true') + + args = parser.parse_args() + + # Make a list of the files that we are going to be editing (skip those already edited) + files = pathlib.Path(args.i).glob('**/*.json') + ## FOR TESTING ONLY!!! + #files = list(files)[10:11] + get_urls_from_files(files, args.o, args.ignore_self_links) + +def get_urls_from_files(files, + output_file, + ignore_self_links, + remove_cache = True): + + def get_urls(fn): + '''Takes a file, gets the urls to archive, passes them to the archive_url function, and writes them to + the output file''' + with open(fn, 'r') as f: + j_obj = json.load(f) + # Get the URLs from the file + query_url, link_urls = get_urls_from_json(j_obj) + # Filter out the self links and search engine cache urls + link_urls = filter_link_urls(query_url, link_urls) + return (query_url, link_urls) + + def filter_link_urls(query_url, + urls): + ''' + Takes link urls and filters them in four ways: + 1. (Optionally) Ignores urls from the two caches: + webcache.googleusercontent.com + https://cc.bingj + 2. Filters out those which are in the completed_urls dictionary + 3. (Optionally) Identifies URLs which have the same domain as the query URL. + Checks the skipped_urls list to see if the URL already appears there. If so, we assume + that we want it archived and move it from skipped to the to_archive list + 4. Filters out URLs that appear more than once in this list + ''' + if ignore_self_links: + domain = get_domain(query_url) + else: + domain = None + cache_regex = r'https://webcache.googleusercontent.com|https://cc.bingj.com' + + result = set() + for url in urls: + if url in to_archive: + continue + + if remove_cache == True: + if re.match(cache_regex, url): + continue + + if ignore_self_links and re.match(f'https?://\w*\.?{domain}', url): + # If it matches, check if it's in skipped URLs + # If so, remove it from there, and add it to the to_archive list + if url in skipped_urls: + result.add(url) + skipped_urls.remove(url) + # Else, add it to the skipped urls (and skip it) + else: + skipped_urls.add(url) + else: + result.add(url) + return result + + skipped_urls = set() + to_archive = {} + for fn in files: + q_url, link_urls = get_urls(fn) + to_archive[q_url] = 'query' + for url in link_urls: + # Prioritize query urls - if it's already there, + # then don't overwrite + if url not in to_archive: + to_archive[url] = 'link' + write_urls(to_archive, output_file) + + +def write_urls(url_dict, fn): + with open(fn, 'w') as out_file: + f = csv.writer(out_file) + for url, link_type in url_dict.items(): + f.writerow([url, link_type]) + + +def get_domain(url): + domain = re.search('^https://www.(\w+\.\w+)', url).groups()[0] + if not domain: + raise ValueError("Can't find URL in {url}") + return domain + + +def get_urls_from_json(j_obj): + '''Takes a JSON object and extracts the correct URLs; returns them in a list.''' + query_url = urlencode_url(j_obj['link']) + link_urls = set() + + for x in j_obj['linkElements']: + url = x['href'] + if re.match('javascript', url) or url == '': + continue + link_urls.add(urlencode_url(url)) + return (query_url, link_urls) + + +def urlencode_url(url): + return requests.utils.requote_uri(urllib.parse.unquote_plus(url)) + + +if __name__ == '__main__': + main() diff --git a/search_engine_results/wayback_urls/wayback_urls.csv b/search_engine_results/wayback_urls/wayback_urls.csv new file mode 100644 index 0000000..e69de29 From 86e4b8ecfce30211f78c30da17fae0c5e6882714 Mon Sep 17 00:00:00 2001 From: Jeremy Foote Date: Tue, 28 Apr 2020 15:26:08 -0400 Subject: [PATCH 3/3] Tweaking a few things based on test run on kibo --- .../wayback_urls/archive_urls.py | 32 +++++++++++++------ .../wayback_urls/get_urls.py | 2 +- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/search_engine_results/wayback_urls/archive_urls.py b/search_engine_results/wayback_urls/archive_urls.py index b0bacc6..3870a36 100644 --- a/search_engine_results/wayback_urls/archive_urls.py +++ b/search_engine_results/wayback_urls/archive_urls.py @@ -12,6 +12,7 @@ import urllib.parse import csv import datetime +from urllib import HTTPError ENDPT = 'https://web.archive.org/save/' @@ -42,7 +43,7 @@ def get_job_ids(urls, capture_outlinks): # Just put them into the job_id_tuples job_id_tuples.append((url, job_id)) else: - print(f'{url} was in completed') + logging.debug(f'{url} was in completed') @@ -55,6 +56,10 @@ def get_wayback_urls(out_file): logging.warning(f'{url} with job id {job_id} failed with a ConnectionError') except TypeError: logging.warning(f'{url} with job id {job_id} did not get a WB URL') + except HTTPError as e: + logging.warning(f'{url} with job id {job_id} failed with an uncaught HTTP Error: {e}') + except Exception as e: + logging.warning(f'{url} with job id {job_id} failed with an uncaught Exception: {e}') completed_urls = get_completed(args.o, time_string = IF_NOT_ARCHIVED_WITHIN) @@ -65,9 +70,13 @@ def get_wayback_urls(out_file): with open(args.o, 'a') as out_file: out = csv.writer(out_file) - job_id_tuples = [] # Stores which urls need to be retrieved (populated by get_job_ids) - get_job_ids(query_urls, capture_outlinks = 1) - get_wayback_urls(out) + logging.info("Now retrieving query urls") + for q_chunk in chunk_list(query_urls, CHUNK_SIZE): + job_id_tuples = [] # Stores which urls need to be retrieved (populated by get_job_ids) + get_job_ids(q_chunk, capture_outlinks = 1) + get_wayback_urls(out) + + logging.info("Now retrieving link urls") for chunk in chunk_list(link_urls, CHUNK_SIZE): job_id_tuples = [] get_job_ids(chunk, capture_outlinks = 0) @@ -117,7 +126,7 @@ def urlencode_url(url): return requests.utils.requote_uri(urllib.parse.unquote_plus(url)) def archive_url(url, - wait = 2, + wait = 6, capture_outlinks = 0 # Whether to capture outlinks (default is no) ): @@ -136,7 +145,7 @@ def archive_url(url, return archive_url(url = url, wait = wait * 1.2, capture_outlinks = capture_outlinks) - if r.status_code in [104,502,503,504,443,401]: + if r.status_code in [104,401,404,443,502,503,504]: logging.warning(url) logging.warning(r.text) if r.status_code in [104, 401, 443]: @@ -157,8 +166,8 @@ def archive_url(url, def get_wayback_url(job_id): def call_status_url( - wait = 2, # Initial wait time - max_wait = 9 # Stop when wait time between calls hits max_wait + wait = 6, # Initial wait time + max_wait = 12 # Stop when wait time between calls hits max_wait ): '''Helper function to handle the call to the status API''' if job_id is None: @@ -186,8 +195,11 @@ def call_status_url( logging.info(f'Hit rate limit, now waiting for {wait} seconds') time.sleep(wait) return call_status_url(wait = wait * 1.2) # Backoff - if s.status_code in [104,502,503,504,443,401]: - # These likely mean something's wrong; only try a few times + if s.status_code in [104,401,404,443,502,503,504]: + # These likely mean something's wrong; wait and then try again + if r.status_code in [104, 401, 443]: + logging.warning(f'104, 401, or 443 received when archiving {url}. Giving up.') + return None logging.warning('443, 502, 503, or 504 status received; waiting 30 seconds') logging.warning(s.text) time.sleep(30) diff --git a/search_engine_results/wayback_urls/get_urls.py b/search_engine_results/wayback_urls/get_urls.py index 6806aea..fdf27d7 100644 --- a/search_engine_results/wayback_urls/get_urls.py +++ b/search_engine_results/wayback_urls/get_urls.py @@ -63,7 +63,7 @@ def filter_link_urls(query_url, domain = get_domain(query_url) else: domain = None - cache_regex = r'https://webcache.googleusercontent.com|https://cc.bingj.com' + cache_regex = r'https?://webcache.googleusercontent.com|https?://cc.bingj.com' result = set() for url in urls: