From cd51e48906f041f3cb13352b99a33d993c885688 Mon Sep 17 00:00:00 2001
From: Jeremy Foote <jdfoote1@gmail.com>
Date: Mon, 13 Apr 2020 12:28:02 -0400
Subject: [PATCH 1/3] Initial attempt to get wayback urls into SERP data.

---
 search_engine_results/.gitignore          |   2 +
 search_engine_results/add_wayback_urls.py | 464 ++++++++++++++++++++++
 2 files changed, 466 insertions(+)
 create mode 100644 search_engine_results/.gitignore
 create mode 100644 search_engine_results/add_wayback_urls.py

diff --git a/search_engine_results/.gitignore b/search_engine_results/.gitignore
new file mode 100644
index 0000000..382ddaa
--- /dev/null
+++ b/search_engine_results/.gitignore
@@ -0,0 +1,2 @@
+__pycache__/
+config.py
diff --git a/search_engine_results/add_wayback_urls.py b/search_engine_results/add_wayback_urls.py
new file mode 100644
index 0000000..cf8fef3
--- /dev/null
+++ b/search_engine_results/add_wayback_urls.py
@@ -0,0 +1,464 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+import pathlib
+import pprint
+import json
+import re
+import requests
+import time
+import config
+import argparse
+from collections import Counter
+import logging
+import urllib.parse
+import csv
+
+
+ENDPT = 'https://web.archive.org/save/'
+UA_STRING = config.UA_STRING
+ACCESS_KEY = config.ACCESS_KEY
+SECRET_KEY = config.SECRET_KEY
+HEADERS = {'Accept':'application/json',
+           'User-Agent': UA_STRING,
+           'Authorization': f'LOW {ACCESS_KEY}:{SECRET_KEY}'}
+IF_NOT_ARCHIVED_WITHIN = '20h' # If an archive has been made in this long, don't make another one
+
+
+########
+# The goal of this program is to take all of the URLs from SERPs and archive them in the Wayback Machine,
+# and then store the Wayback URLs as part of the SERP metadata.
+#
+# There is a lot of overlap in URLs so we store previous results in temporary files so we don't repeat the same calls
+#
+# Example usage: 
+# python3 add_wayback_urls.py -i /path/to/serps/dir -t ./tmp --ignore_self_links # Note that this would overwrite the json files in the /path/to/serps/dir directories
+########
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Add wayback URLs to SERP metadata.')
+    parser.add_argument('-i', help='Input directory with metadata files')
+    parser.add_argument('-o', help='Location to save modified files (if blank, will overwrite)')
+    ## TODO: Maybe switch this so default is to ignore?
+    parser.add_argument('--ignore_self_links', help='Whether to ignore links from the same domain as the query',
+            action='store_true')
+    parser.add_argument('-t', help='Temp directory location (to save/load job ids, outlinks, and wayback URLs)')
+
+    args = parser.parse_args()
+
+    
+    # Make a list of the files that we are going to be editing (skip those already edited)
+    files = pathlib.Path(args.i).glob('**/*.json')
+    ## FOR TESTING ONLY!!!
+    #files = list(files)[10:11]
+    incomplete_files = list(files)
+    while len(incomplete_files) > 0:
+        for fn in incomplete_files:
+            try:
+                add_wayback_urls(fn, args.o, args.t, args.ignore_self_links)
+                incomplete_files.pop(incomplete_files.index(fn)) # if it works, remove it from the list
+            except ConnectionError:
+                failed_files.append(fn)
+
+
+def add_wayback_urls(filename, out_dir, temp_dir, ignore_self_links = False, remove_cache = True):
+
+    def output_file_exists(f):
+        '''Says whether the output file for a given file exists, so that we can skip it if it's done.
+        Assumes that if the input file and the output file are the same, we want to overwrite the input file.
+        We therefore return False in this case'''
+        out_path = get_out_path(f, out_dir)
+        if out_path == f:
+            return False
+        return out_path.exists()
+
+
+    def write_wayback_to_file(filename, temp_file):
+        url_to_wb = {}
+        with open(filename, 'r') as f:
+            with open(temp_file, 'a') as tf:
+                tf_csv = csv.writer(tf)
+                j_obj = json.load(f)
+                query_url = j_obj['link']
+                try:
+                    wayback_url = wayback_dict[query_url].get_wayback_url()
+                    j_obj['wayback_url'] = wayback_url
+                    tf_csv.writerow([query_url, wayback_url])
+                except KeyError:
+                    logging.error(f"Should have an entry for {query_url}")
+                    logging.error(wayback_dict.keys())
+                    j_obj['wayback_url'] = ''
+                for link_obj in j_obj['linkElements']:
+                    link_url = link_obj['href']
+                    try:
+                        wayback_url = wayback_dict[link_url].get_wayback_url()
+                        link_obj['wayback_url'] = wayback_url
+                        tf_csv.writerow([link_url, wayback_url])
+                    except KeyError:
+                        if link_url in urls_to_archive:
+                            # If it's in the urls to archive, then it should be in the dictionary.
+                            logging.error(f"Should have an entry for {link_url}")
+                            logging.error(wayback_dict.keys())
+                            logging.error(link_obj['href'])
+                        link_obj['wayback_url'] = ''
+        outfile = get_out_path(filename, out_dir)
+        with open(outfile, 'w') as f:
+            json.dump(j_obj, f)
+
+    tmp_dir =  pathlib.Path(temp_dir)
+    if not tmp_dir.exists():
+        tmp_dir.mkdir()
+
+    # Query urls are the SERP query URLs; we get all of the outgoing links for these, to
+    # hopefully avoid duplication and having too many active jobs
+    query_urls =[]
+    urls_to_archive = []
+    urls_to_skip = get_skipped_urls(temp_dir) # Skip these unless they appear > once
+    # First read the files and create a list of URLs to archive
+    with open(filename, 'r') as f:
+        j_obj = json.load(f)
+        # If this file already has wayback info, or if the output file has been created,
+        # the skip it
+        if output_file_exists(filename) or 'wayback_url' in j_obj:
+            return None
+        query_url, other_urls = get_urls_from_json(j_obj)
+        query_urls.append(query_url)
+        # Remove self links
+        if ignore_self_links:
+            domain = get_domain(query_url)
+        else:
+            domain = None
+        to_archive, to_skip = filter_urls(other_urls, domain, remove_cache)
+        print(to_archive)
+        print(to_skip)
+        urls_to_archive += to_archive
+        urls_to_skip += to_skip
+    
+    # For the URLs that we would otherwise skip, grab them if they occur
+    # more than once. Write the rest back to the temp file to check next time
+    with open(temp_dir + '/skipped_urls.csv', 'w') as tf:
+        f = csv.writer(tf)
+        for url, occurrences in Counter(urls_to_skip).items():
+            if occurrences > 1:
+                logging.info(f"{url} appears {occurrences} times. Adding to archive list")
+                urls_to_archive.append(url)
+            else:
+                f.writerow([url])
+    
+    # Get the URLS from the wayback APIs
+    wayback_dict = get_wayback_urls(query_urls, urls_to_archive, temp_dir)
+    write_wayback_to_file(filename, temp_dir + '/wayback_urls.csv')
+
+
+def get_skipped_urls(temp_dir):
+    result = []
+    if pathlib.Path(temp_dir + '/skipped_urls.csv').exists():
+        with open(temp_dir + '/skipped_urls.csv', 'r') as f:
+            for row in f:
+                result.append(row)
+    return result
+
+def get_out_path(fp, out_dir):
+    '''Assumes that we want to keep the directory and the file name'''
+    if out_dir == None:
+        return fp
+    else:
+        new_path = pathlib.Path(out_dir).joinpath(*fp.parts[-2:])
+        if not new_path.parent.exists():
+            new_path.parent.mkdir(parents = True)
+        return new_path
+
+def dict_from_temp(temp_file):
+    result = {}
+    if pathlib.Path(temp_file).exists():
+        with open(temp_file, 'r') as fn:
+            f = csv.reader(fn)
+            for row in f:
+                result[row[0]] = row[1]
+    return result
+
+def get_wayback_urls(query_urls, urls_to_archive, temp_dir):
+    '''
+    Takes in two lists of urls. The first are the URLs of the SERPS.
+    For these, we add a flag to the API to create archives of all outgoing links.
+    This should include many of the same links that we gathered, thus reducing the number
+    of calls that we need to make.
+    
+    Returns a dictionary of URLs and job ids which can be used to get the
+    archive.org URLs
+    '''
+
+    # Get job_ids and wayback_urls from the temp file
+    job_ids = dict_from_temp(temp_dir + '/job_ids.csv' )
+    wayback_urls = dict_from_temp(temp_dir + '/wayback_urls.csv' )
+
+    # And save them as a class attributes
+    URLObj.job_ids = job_ids
+    URLObj.wayback_urls = wayback_urls
+
+    # First, we need to get the job ids
+    url_obj_dict = {}
+    # Start with the query urls and get their job ids
+    print("Archiving {} query URLS".format(len(query_urls))),
+
+    with open(temp_dir + '/job_ids.csv', 'a') as f:
+        temp = csv.writer(f)
+        i = 0
+        for url in set(query_urls):
+            # Create a URL object
+            url_obj = URLObj(url, is_seed=True)
+            url_obj_dict[url] = url_obj
+            url_obj.archive_url()
+            # Save the url and job id to the temp file
+            if url not in URLObj.job_ids:
+                temp.writerow([url_obj.url, url_obj.job_id])
+            i += 1
+            if i % 100 == 0:
+                print(f"Archived {i} URLS")
+            
+        # Then, get the outlinks for each of the query urls
+        # We do this in stages, so that there is more time to finish the archiving,
+        # instead of waiting for each one sequentially.
+        i = 0
+        for url_obj in url_obj_dict.values():
+            i += 1
+            if i % 100 == 0:
+                print(f"Got outlinks for {i} URLS")
+            curr_outlinks = url_obj.get_outlinks()
+            if curr_outlinks is None:
+                continue
+            for out_url, out_job_id in curr_outlinks:
+                # Use the same encoding that we'll use on the urls to archive, to make matching
+                # more likely
+                # More could be done here (e.g., removing parameters from URLs)
+                out_url = urlencode_url(out_url)
+                if out_url not in URLObj.job_ids:
+                    # Save it to the class dictionary
+                    URLObj.job_ids[out_url] = out_job_id
+                    # And also to the temp file
+                    temp.writerow([out_url, out_job_id])
+        
+        # Next, created instances and get the job ids for the URLs retrieved in the SERPs. Hopefully we will already have
+        # some of these from the outlinks
+        print("Archiving {} result URLS".format(len(urls_to_archive)))
+        i = 0
+        for url in set(urls_to_archive):
+            i += 1
+            if i % 100 == 0:
+                print(f"Archived {i} URLS")
+            if url in url_obj_dict:
+                continue
+            url_obj = URLObj(url)
+            url_obj_dict[url] = url_obj
+            url_obj.archive_url()
+            if url not in URLObj.job_ids:
+                temp.writerow([url_obj.url, url_obj.job_id])
+
+    return url_obj_dict
+
+def get_domain(url):
+    domain = re.search('^https://www.(\w+\.\w+)', url).groups()[0]
+    if not domain:
+        raise ValueError("Can't find URL in {url}")
+    return domain
+
+def filter_urls(urls, domain, remove_cache):
+    '''
+    Separates urls into results and self-links, based on the domain.
+    Skips items from the two caches:
+    webcache.googleusercontent.com
+    https://cc.bingj
+    '''
+    cache_regex = r'https://webcache.googleusercontent.com|https://cc.bingj.com'
+    result = []
+    self_links = []
+    for url in urls:
+        if remove_cache == True:
+            if re.match(cache_regex, url):
+                continue
+        if re.match(f'https?://\w+\.?{domain}', url):
+            self_links.append(url)
+        else:
+            result.append(url)
+    return (result, self_links)
+
+def get_urls_from_json(j_obj):
+    '''Takes a JSON object and extracts the correct URLs; returns them in a list.'''
+    query_url = j_obj['link']
+    result = []
+    
+    for x in j_obj['linkElements']:
+        url = x['href']
+        if re.match('javascript', url) or url == '':
+            continue
+        result.append(url)
+    return (query_url, result)
+
+def urlencode_url(url):
+    return requests.utils.requote_uri(urllib.parse.unquote_plus(url))
+
+
+class URLObj:
+
+    def __init__(self, url, is_seed = False):
+        self.orig_url = url
+        self.url = urlencode_url(url)
+        self.job_id = self.check_for_job_id()
+        self.wayback_url = self.check_for_wayback_url()
+        self.is_seed = is_seed
+        self.status_attempts = 0
+        self.archive_attempts = 0
+
+    def check_for_job_id(self):
+        for url in [self.url, self.orig_url]:
+            if url in self.job_ids: # Check in class variable list of job ids
+                return self.job_ids[url]
+
+
+    def check_for_wayback_url(self):
+        for url in [self.url, self.orig_url]:
+            if url in self.wayback_urls: # Check in class variable list of job ids
+                return self.wayback_urls[url]
+
+    def _call_status_url(self,
+                         wait = 2, # Initial wait time
+                         max_wait = 7 # Stop when wait time between calls hits max_wait
+                        ):
+        '''Helper function to handle the call to the status API'''
+        job_id = self.job_id
+        if job_id is None:
+            return None
+        s = requests.get(ENDPT + 'status/' + job_id, headers=HEADERS)
+        if s.status_code == 200:
+            s_json = s.json()
+            if s_json['status'] == 'pending':
+                if wait > max_wait:
+                    logging.debug(s_json)
+                    if self.status_attempts == 2:
+                        logging.warning(f"The call to get the status of '{self.url}' with job id {self.job_id} failed three times. Skipping")
+                        return None
+                    self.status_attempts += 1
+                    self.job_id = None # Get new job id and try again
+                    self.archive_url()
+                    return self._call_status_url()
+                logging.info(f'Pending, now waiting for {wait:.2f} seconds')
+                time.sleep(wait)
+                return self._call_status_url(wait = wait * 1.2)
+            if s_json['status'] == 'success':
+                return s_json
+            if s_json['status'] == 'error':
+                logging.error('Could not get status, with error: {}'.format(s_json["message"]))
+                return None
+            else:
+                logging.warning(s_json)
+                raise ValueError("Status was unexpected")
+        ## TODO: This error handling is horrible and is duplicated across the two calls.
+        ## I know there is a much better way to do this but I don't know what it is :)
+        if s.status_code == 429:
+            logging.info(f'Hit rate limit, now waiting for {wait} seconds')
+            time.sleep(wait)
+            return self._call_status_url(wait = wait * 1.2) # Backoff
+        if s.status_code in [104,502,503,504,443,401]:
+            if s.status_code == 443 or s.status_code ==401:
+                self.status_attempts += 1
+            logging.warning('443, 502, 503, or 504 status received; waiting 30 seconds')
+            logging.warning(s.text)
+            time.sleep(30)
+            return self._call_status_url(wait)
+        else:
+            s.raise_for_status()
+
+    def get_outlinks(self):
+        logging.info(f"Getting outlinks for {self.url} with job id {self.job_id}")
+        job_id = self.job_id
+        s_json = self._call_status_url()
+        if s_json is None:
+            return []
+        if 'original_job_id' in s_json:
+            self.job_id = s_json['original_job_id']
+            return self.get_outlinks()
+
+        try:
+            return s_json['outlinks'].items()
+        except KeyError:
+            logging.warning(f"No outlinks for {self.url} but they were expected")
+            return []
+        except AttributeError:
+            logging.info(f"Earlier job ({self.job_id}) didn't request outlinks for {self.url}")
+            return []
+
+
+    def get_wayback_url(self):
+        if not self.wayback_url:
+            self._retrieve_wayback_url()
+        return self.wayback_url
+
+
+    def _retrieve_wayback_url(self):
+        logging.info(f"Getting wayback URL for {self.url} with job id {self.job_id}")
+        if not self.job_id:
+            self.archive_url()
+        job_id = self.job_id
+        s_json = self._call_status_url()
+        if s_json is None:
+            self.wayback_url = ''
+            return None
+        try:
+            self.wayback_url = 'http://web.archive.org/web/{}/{}'.format(s_json['timestamp'],
+                                                             s_json['original_url'])
+        except KeyError:
+            logging.error(f"Missing timestamp or original URL for {job_id}")
+            self.wayback_url = None
+    
+    def archive_url(self,
+                    wait = 2,
+                    capture_screenshot = 0 # Whether to capture a screenshot (default is no)
+                    ):
+        '''Archive the url in self.url and store the job_id in self.job_id'''
+
+
+        # If it already exists, then there's nothing to do
+        if self.job_id is not None:
+            logging.info(f'Job id already exists for {self.orig_url}')
+            return None
+
+        logging.info(f"Archiving {self.orig_url}")
+        # If this is a query URL / seed URL, then capture outlinks
+        capture_outlinks =  1 if self.is_seed else 0
+        if self.status_attempts == 1: # If we've already tried 2 times, then try w/o outlinks
+            capture_outlinks = 0
+
+
+        payload = {'url': self.url,
+                  'if_not_archived_within' : IF_NOT_ARCHIVED_WITHIN,
+                  'capture_screenshot': capture_screenshot,
+                  'capture_outlinks': capture_outlinks
+                  }
+        r = requests.post(ENDPT, headers=HEADERS, data=payload)
+        logging.debug(r.content)
+
+        if r.status_code == 429:
+            logging.info(f'Hit rate limit, now waiting for {wait:.2f} seconds')
+            time.sleep(wait)
+            return self.archive_url(wait = wait * 1.2)
+        if r.status_code in [104,502,503,504,443,401]:
+            if s.status_code in [104, 401, 443]:
+                self.archive_attempts += 1
+            if self.archive_attempts > 3:
+                return None
+            logging.warning(self.url)
+            logging.warning(r.text)
+            logging.warning('502 or 503 or 504 status received; waiting 30 seconds')
+            time.sleep(30)
+            return self.archive_url()
+        r.raise_for_status()
+        self.job_id = r.json()['job_id']
+
+
+
+
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.INFO)
+    main()
+

From 8eaa486b3416f0883433a343e15d836e94f6bbc3 Mon Sep 17 00:00:00 2001
From: Jeremy Foote <jdfoote1@gmail.com>
Date: Mon, 27 Apr 2020 15:05:28 -0400
Subject: [PATCH 2/3] Breaking wayback url retrieval into multiple parts.
 Worked on test set. Going to test on kibo now

---
 search_engine_results/add_wayback_urls.py     | 464 ------------------
 .../archive_urls-checkpoint.ipynb             | 242 +++++++++
 .../wayback_urls/add_wayback_urls.py          |  86 ++++
 .../wayback_urls/archive_urls.py              | 212 ++++++++
 .../wayback_urls/get_urls.py                  | 135 +++++
 .../wayback_urls/wayback_urls.csv             |   0
 6 files changed, 675 insertions(+), 464 deletions(-)
 delete mode 100644 search_engine_results/add_wayback_urls.py
 create mode 100644 search_engine_results/wayback_urls/.ipynb_checkpoints/archive_urls-checkpoint.ipynb
 create mode 100644 search_engine_results/wayback_urls/add_wayback_urls.py
 create mode 100644 search_engine_results/wayback_urls/archive_urls.py
 create mode 100644 search_engine_results/wayback_urls/get_urls.py
 create mode 100644 search_engine_results/wayback_urls/wayback_urls.csv

diff --git a/search_engine_results/add_wayback_urls.py b/search_engine_results/add_wayback_urls.py
deleted file mode 100644
index cf8fef3..0000000
--- a/search_engine_results/add_wayback_urls.py
+++ /dev/null
@@ -1,464 +0,0 @@
-#!/usr/bin/env python
-# coding: utf-8
-
-import pathlib
-import pprint
-import json
-import re
-import requests
-import time
-import config
-import argparse
-from collections import Counter
-import logging
-import urllib.parse
-import csv
-
-
-ENDPT = 'https://web.archive.org/save/'
-UA_STRING = config.UA_STRING
-ACCESS_KEY = config.ACCESS_KEY
-SECRET_KEY = config.SECRET_KEY
-HEADERS = {'Accept':'application/json',
-           'User-Agent': UA_STRING,
-           'Authorization': f'LOW {ACCESS_KEY}:{SECRET_KEY}'}
-IF_NOT_ARCHIVED_WITHIN = '20h' # If an archive has been made in this long, don't make another one
-
-
-########
-# The goal of this program is to take all of the URLs from SERPs and archive them in the Wayback Machine,
-# and then store the Wayback URLs as part of the SERP metadata.
-#
-# There is a lot of overlap in URLs so we store previous results in temporary files so we don't repeat the same calls
-#
-# Example usage: 
-# python3 add_wayback_urls.py -i /path/to/serps/dir -t ./tmp --ignore_self_links # Note that this would overwrite the json files in the /path/to/serps/dir directories
-########
-
-
-def main():
-    parser = argparse.ArgumentParser(description='Add wayback URLs to SERP metadata.')
-    parser.add_argument('-i', help='Input directory with metadata files')
-    parser.add_argument('-o', help='Location to save modified files (if blank, will overwrite)')
-    ## TODO: Maybe switch this so default is to ignore?
-    parser.add_argument('--ignore_self_links', help='Whether to ignore links from the same domain as the query',
-            action='store_true')
-    parser.add_argument('-t', help='Temp directory location (to save/load job ids, outlinks, and wayback URLs)')
-
-    args = parser.parse_args()
-
-    
-    # Make a list of the files that we are going to be editing (skip those already edited)
-    files = pathlib.Path(args.i).glob('**/*.json')
-    ## FOR TESTING ONLY!!!
-    #files = list(files)[10:11]
-    incomplete_files = list(files)
-    while len(incomplete_files) > 0:
-        for fn in incomplete_files:
-            try:
-                add_wayback_urls(fn, args.o, args.t, args.ignore_self_links)
-                incomplete_files.pop(incomplete_files.index(fn)) # if it works, remove it from the list
-            except ConnectionError:
-                failed_files.append(fn)
-
-
-def add_wayback_urls(filename, out_dir, temp_dir, ignore_self_links = False, remove_cache = True):
-
-    def output_file_exists(f):
-        '''Says whether the output file for a given file exists, so that we can skip it if it's done.
-        Assumes that if the input file and the output file are the same, we want to overwrite the input file.
-        We therefore return False in this case'''
-        out_path = get_out_path(f, out_dir)
-        if out_path == f:
-            return False
-        return out_path.exists()
-
-
-    def write_wayback_to_file(filename, temp_file):
-        url_to_wb = {}
-        with open(filename, 'r') as f:
-            with open(temp_file, 'a') as tf:
-                tf_csv = csv.writer(tf)
-                j_obj = json.load(f)
-                query_url = j_obj['link']
-                try:
-                    wayback_url = wayback_dict[query_url].get_wayback_url()
-                    j_obj['wayback_url'] = wayback_url
-                    tf_csv.writerow([query_url, wayback_url])
-                except KeyError:
-                    logging.error(f"Should have an entry for {query_url}")
-                    logging.error(wayback_dict.keys())
-                    j_obj['wayback_url'] = ''
-                for link_obj in j_obj['linkElements']:
-                    link_url = link_obj['href']
-                    try:
-                        wayback_url = wayback_dict[link_url].get_wayback_url()
-                        link_obj['wayback_url'] = wayback_url
-                        tf_csv.writerow([link_url, wayback_url])
-                    except KeyError:
-                        if link_url in urls_to_archive:
-                            # If it's in the urls to archive, then it should be in the dictionary.
-                            logging.error(f"Should have an entry for {link_url}")
-                            logging.error(wayback_dict.keys())
-                            logging.error(link_obj['href'])
-                        link_obj['wayback_url'] = ''
-        outfile = get_out_path(filename, out_dir)
-        with open(outfile, 'w') as f:
-            json.dump(j_obj, f)
-
-    tmp_dir =  pathlib.Path(temp_dir)
-    if not tmp_dir.exists():
-        tmp_dir.mkdir()
-
-    # Query urls are the SERP query URLs; we get all of the outgoing links for these, to
-    # hopefully avoid duplication and having too many active jobs
-    query_urls =[]
-    urls_to_archive = []
-    urls_to_skip = get_skipped_urls(temp_dir) # Skip these unless they appear > once
-    # First read the files and create a list of URLs to archive
-    with open(filename, 'r') as f:
-        j_obj = json.load(f)
-        # If this file already has wayback info, or if the output file has been created,
-        # the skip it
-        if output_file_exists(filename) or 'wayback_url' in j_obj:
-            return None
-        query_url, other_urls = get_urls_from_json(j_obj)
-        query_urls.append(query_url)
-        # Remove self links
-        if ignore_self_links:
-            domain = get_domain(query_url)
-        else:
-            domain = None
-        to_archive, to_skip = filter_urls(other_urls, domain, remove_cache)
-        print(to_archive)
-        print(to_skip)
-        urls_to_archive += to_archive
-        urls_to_skip += to_skip
-    
-    # For the URLs that we would otherwise skip, grab them if they occur
-    # more than once. Write the rest back to the temp file to check next time
-    with open(temp_dir + '/skipped_urls.csv', 'w') as tf:
-        f = csv.writer(tf)
-        for url, occurrences in Counter(urls_to_skip).items():
-            if occurrences > 1:
-                logging.info(f"{url} appears {occurrences} times. Adding to archive list")
-                urls_to_archive.append(url)
-            else:
-                f.writerow([url])
-    
-    # Get the URLS from the wayback APIs
-    wayback_dict = get_wayback_urls(query_urls, urls_to_archive, temp_dir)
-    write_wayback_to_file(filename, temp_dir + '/wayback_urls.csv')
-
-
-def get_skipped_urls(temp_dir):
-    result = []
-    if pathlib.Path(temp_dir + '/skipped_urls.csv').exists():
-        with open(temp_dir + '/skipped_urls.csv', 'r') as f:
-            for row in f:
-                result.append(row)
-    return result
-
-def get_out_path(fp, out_dir):
-    '''Assumes that we want to keep the directory and the file name'''
-    if out_dir == None:
-        return fp
-    else:
-        new_path = pathlib.Path(out_dir).joinpath(*fp.parts[-2:])
-        if not new_path.parent.exists():
-            new_path.parent.mkdir(parents = True)
-        return new_path
-
-def dict_from_temp(temp_file):
-    result = {}
-    if pathlib.Path(temp_file).exists():
-        with open(temp_file, 'r') as fn:
-            f = csv.reader(fn)
-            for row in f:
-                result[row[0]] = row[1]
-    return result
-
-def get_wayback_urls(query_urls, urls_to_archive, temp_dir):
-    '''
-    Takes in two lists of urls. The first are the URLs of the SERPS.
-    For these, we add a flag to the API to create archives of all outgoing links.
-    This should include many of the same links that we gathered, thus reducing the number
-    of calls that we need to make.
-    
-    Returns a dictionary of URLs and job ids which can be used to get the
-    archive.org URLs
-    '''
-
-    # Get job_ids and wayback_urls from the temp file
-    job_ids = dict_from_temp(temp_dir + '/job_ids.csv' )
-    wayback_urls = dict_from_temp(temp_dir + '/wayback_urls.csv' )
-
-    # And save them as a class attributes
-    URLObj.job_ids = job_ids
-    URLObj.wayback_urls = wayback_urls
-
-    # First, we need to get the job ids
-    url_obj_dict = {}
-    # Start with the query urls and get their job ids
-    print("Archiving {} query URLS".format(len(query_urls))),
-
-    with open(temp_dir + '/job_ids.csv', 'a') as f:
-        temp = csv.writer(f)
-        i = 0
-        for url in set(query_urls):
-            # Create a URL object
-            url_obj = URLObj(url, is_seed=True)
-            url_obj_dict[url] = url_obj
-            url_obj.archive_url()
-            # Save the url and job id to the temp file
-            if url not in URLObj.job_ids:
-                temp.writerow([url_obj.url, url_obj.job_id])
-            i += 1
-            if i % 100 == 0:
-                print(f"Archived {i} URLS")
-            
-        # Then, get the outlinks for each of the query urls
-        # We do this in stages, so that there is more time to finish the archiving,
-        # instead of waiting for each one sequentially.
-        i = 0
-        for url_obj in url_obj_dict.values():
-            i += 1
-            if i % 100 == 0:
-                print(f"Got outlinks for {i} URLS")
-            curr_outlinks = url_obj.get_outlinks()
-            if curr_outlinks is None:
-                continue
-            for out_url, out_job_id in curr_outlinks:
-                # Use the same encoding that we'll use on the urls to archive, to make matching
-                # more likely
-                # More could be done here (e.g., removing parameters from URLs)
-                out_url = urlencode_url(out_url)
-                if out_url not in URLObj.job_ids:
-                    # Save it to the class dictionary
-                    URLObj.job_ids[out_url] = out_job_id
-                    # And also to the temp file
-                    temp.writerow([out_url, out_job_id])
-        
-        # Next, created instances and get the job ids for the URLs retrieved in the SERPs. Hopefully we will already have
-        # some of these from the outlinks
-        print("Archiving {} result URLS".format(len(urls_to_archive)))
-        i = 0
-        for url in set(urls_to_archive):
-            i += 1
-            if i % 100 == 0:
-                print(f"Archived {i} URLS")
-            if url in url_obj_dict:
-                continue
-            url_obj = URLObj(url)
-            url_obj_dict[url] = url_obj
-            url_obj.archive_url()
-            if url not in URLObj.job_ids:
-                temp.writerow([url_obj.url, url_obj.job_id])
-
-    return url_obj_dict
-
-def get_domain(url):
-    domain = re.search('^https://www.(\w+\.\w+)', url).groups()[0]
-    if not domain:
-        raise ValueError("Can't find URL in {url}")
-    return domain
-
-def filter_urls(urls, domain, remove_cache):
-    '''
-    Separates urls into results and self-links, based on the domain.
-    Skips items from the two caches:
-    webcache.googleusercontent.com
-    https://cc.bingj
-    '''
-    cache_regex = r'https://webcache.googleusercontent.com|https://cc.bingj.com'
-    result = []
-    self_links = []
-    for url in urls:
-        if remove_cache == True:
-            if re.match(cache_regex, url):
-                continue
-        if re.match(f'https?://\w+\.?{domain}', url):
-            self_links.append(url)
-        else:
-            result.append(url)
-    return (result, self_links)
-
-def get_urls_from_json(j_obj):
-    '''Takes a JSON object and extracts the correct URLs; returns them in a list.'''
-    query_url = j_obj['link']
-    result = []
-    
-    for x in j_obj['linkElements']:
-        url = x['href']
-        if re.match('javascript', url) or url == '':
-            continue
-        result.append(url)
-    return (query_url, result)
-
-def urlencode_url(url):
-    return requests.utils.requote_uri(urllib.parse.unquote_plus(url))
-
-
-class URLObj:
-
-    def __init__(self, url, is_seed = False):
-        self.orig_url = url
-        self.url = urlencode_url(url)
-        self.job_id = self.check_for_job_id()
-        self.wayback_url = self.check_for_wayback_url()
-        self.is_seed = is_seed
-        self.status_attempts = 0
-        self.archive_attempts = 0
-
-    def check_for_job_id(self):
-        for url in [self.url, self.orig_url]:
-            if url in self.job_ids: # Check in class variable list of job ids
-                return self.job_ids[url]
-
-
-    def check_for_wayback_url(self):
-        for url in [self.url, self.orig_url]:
-            if url in self.wayback_urls: # Check in class variable list of job ids
-                return self.wayback_urls[url]
-
-    def _call_status_url(self,
-                         wait = 2, # Initial wait time
-                         max_wait = 7 # Stop when wait time between calls hits max_wait
-                        ):
-        '''Helper function to handle the call to the status API'''
-        job_id = self.job_id
-        if job_id is None:
-            return None
-        s = requests.get(ENDPT + 'status/' + job_id, headers=HEADERS)
-        if s.status_code == 200:
-            s_json = s.json()
-            if s_json['status'] == 'pending':
-                if wait > max_wait:
-                    logging.debug(s_json)
-                    if self.status_attempts == 2:
-                        logging.warning(f"The call to get the status of '{self.url}' with job id {self.job_id} failed three times. Skipping")
-                        return None
-                    self.status_attempts += 1
-                    self.job_id = None # Get new job id and try again
-                    self.archive_url()
-                    return self._call_status_url()
-                logging.info(f'Pending, now waiting for {wait:.2f} seconds')
-                time.sleep(wait)
-                return self._call_status_url(wait = wait * 1.2)
-            if s_json['status'] == 'success':
-                return s_json
-            if s_json['status'] == 'error':
-                logging.error('Could not get status, with error: {}'.format(s_json["message"]))
-                return None
-            else:
-                logging.warning(s_json)
-                raise ValueError("Status was unexpected")
-        ## TODO: This error handling is horrible and is duplicated across the two calls.
-        ## I know there is a much better way to do this but I don't know what it is :)
-        if s.status_code == 429:
-            logging.info(f'Hit rate limit, now waiting for {wait} seconds')
-            time.sleep(wait)
-            return self._call_status_url(wait = wait * 1.2) # Backoff
-        if s.status_code in [104,502,503,504,443,401]:
-            if s.status_code == 443 or s.status_code ==401:
-                self.status_attempts += 1
-            logging.warning('443, 502, 503, or 504 status received; waiting 30 seconds')
-            logging.warning(s.text)
-            time.sleep(30)
-            return self._call_status_url(wait)
-        else:
-            s.raise_for_status()
-
-    def get_outlinks(self):
-        logging.info(f"Getting outlinks for {self.url} with job id {self.job_id}")
-        job_id = self.job_id
-        s_json = self._call_status_url()
-        if s_json is None:
-            return []
-        if 'original_job_id' in s_json:
-            self.job_id = s_json['original_job_id']
-            return self.get_outlinks()
-
-        try:
-            return s_json['outlinks'].items()
-        except KeyError:
-            logging.warning(f"No outlinks for {self.url} but they were expected")
-            return []
-        except AttributeError:
-            logging.info(f"Earlier job ({self.job_id}) didn't request outlinks for {self.url}")
-            return []
-
-
-    def get_wayback_url(self):
-        if not self.wayback_url:
-            self._retrieve_wayback_url()
-        return self.wayback_url
-
-
-    def _retrieve_wayback_url(self):
-        logging.info(f"Getting wayback URL for {self.url} with job id {self.job_id}")
-        if not self.job_id:
-            self.archive_url()
-        job_id = self.job_id
-        s_json = self._call_status_url()
-        if s_json is None:
-            self.wayback_url = ''
-            return None
-        try:
-            self.wayback_url = 'http://web.archive.org/web/{}/{}'.format(s_json['timestamp'],
-                                                             s_json['original_url'])
-        except KeyError:
-            logging.error(f"Missing timestamp or original URL for {job_id}")
-            self.wayback_url = None
-    
-    def archive_url(self,
-                    wait = 2,
-                    capture_screenshot = 0 # Whether to capture a screenshot (default is no)
-                    ):
-        '''Archive the url in self.url and store the job_id in self.job_id'''
-
-
-        # If it already exists, then there's nothing to do
-        if self.job_id is not None:
-            logging.info(f'Job id already exists for {self.orig_url}')
-            return None
-
-        logging.info(f"Archiving {self.orig_url}")
-        # If this is a query URL / seed URL, then capture outlinks
-        capture_outlinks =  1 if self.is_seed else 0
-        if self.status_attempts == 1: # If we've already tried 2 times, then try w/o outlinks
-            capture_outlinks = 0
-
-
-        payload = {'url': self.url,
-                  'if_not_archived_within' : IF_NOT_ARCHIVED_WITHIN,
-                  'capture_screenshot': capture_screenshot,
-                  'capture_outlinks': capture_outlinks
-                  }
-        r = requests.post(ENDPT, headers=HEADERS, data=payload)
-        logging.debug(r.content)
-
-        if r.status_code == 429:
-            logging.info(f'Hit rate limit, now waiting for {wait:.2f} seconds')
-            time.sleep(wait)
-            return self.archive_url(wait = wait * 1.2)
-        if r.status_code in [104,502,503,504,443,401]:
-            if s.status_code in [104, 401, 443]:
-                self.archive_attempts += 1
-            if self.archive_attempts > 3:
-                return None
-            logging.warning(self.url)
-            logging.warning(r.text)
-            logging.warning('502 or 503 or 504 status received; waiting 30 seconds')
-            time.sleep(30)
-            return self.archive_url()
-        r.raise_for_status()
-        self.job_id = r.json()['job_id']
-
-
-
-
-if __name__ == '__main__':
-    logging.basicConfig(level=logging.INFO)
-    main()
-
diff --git a/search_engine_results/wayback_urls/.ipynb_checkpoints/archive_urls-checkpoint.ipynb b/search_engine_results/wayback_urls/.ipynb_checkpoints/archive_urls-checkpoint.ipynb
new file mode 100644
index 0000000..198e1a4
--- /dev/null
+++ b/search_engine_results/wayback_urls/.ipynb_checkpoints/archive_urls-checkpoint.ipynb
@@ -0,0 +1,242 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#!/usr/bin/env python\n",
+    "# coding: utf-8\n",
+    "\n",
+    "import pathlib\n",
+    "import pprint\n",
+    "import json\n",
+    "import re\n",
+    "import requests\n",
+    "import time\n",
+    "import config\n",
+    "import argparse\n",
+    "import logging\n",
+    "import urllib.parse\n",
+    "import csv\n",
+    "from datetime import datetime\n",
+    "\n",
+    "\n",
+    "ENDPT = 'https://web.archive.org/save/'\n",
+    "UA_STRING = config.UA_STRING\n",
+    "ACCESS_KEY = config.ACCESS_KEY\n",
+    "SECRET_KEY = config.SECRET_KEY\n",
+    "HEADERS = {'Accept':'application/json',\n",
+    "           'User-Agent': UA_STRING,\n",
+    "           'Authorization': f'LOW {ACCESS_KEY}:{SECRET_KEY}'}\n",
+    "IF_NOT_ARCHIVED_WITHIN = '20h' # If an archive has been made in this long, don't make another one"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def main():\n",
+    "    parser = argparse.ArgumentParser(description='Creates job ')\n",
+    "    parser.add_argument('-i', help='Input directory with metadata files')\n",
+    "    parser.add_argument('-o', help='Location to save job id file')\n",
+    "    ## TODO: Maybe switch this so default is to ignore?\n",
+    "    parser.add_argument('--ignore_self_links', help='Whether to ignore links from the same domain as the query',\n",
+    "            action='store_true')\n",
+    "\n",
+    "    args = parser.parse_args()\n",
+    "\n",
+    "    \n",
+    "    # Make a list of the files that we are going to be editing (skip those already edited)\n",
+    "    files = pathlib.Path(args.i).glob('**/*.json')\n",
+    "    ## FOR TESTING ONLY!!!\n",
+    "    #files = list(files)[10:11]\n",
+    "    archive_files(files, args.o, args.ignore_self_links)\n",
+    "    \n",
+    "def archive_files(files, output_file, ignore_self_links):\n",
+    "    \n",
+    "    \n",
+    "    def get_urls_to_archive(fn):\n",
+    "        '''Takes a file, gets the urls to archive, and passes them to the archive_url function'''\n",
+    "        with open(filename, 'r') as f:\n",
+    "            j_obj = json.load(f)\n",
+    "            # Get the URLs from the file\n",
+    "            query_url, link_urls = get_urls_from_json(j_obj)\n",
+    "            # Filter out the self links and search engine cache urls\n",
+    "            link_urls = filter_link_urls(query_url, link_urls)\n",
+    "        \n",
+    "        with open(output_file, 'w') as out_file:\n",
+    "            f = csv.writer(out_file)\n",
+    "            # Get outlinks for the query URL. This gets these jobs started early, so some will\n",
+    "            # hopefully be done by the time we make the calls\n",
+    "            query_job = archive_url(query_url, capture_outlinks=1)\n",
+    "            store_job_id(f, query_url, query_job)\n",
+    "            for url in link_urls:\n",
+    "                job_id = archive_url(url)\n",
+    "                store_job_id(f, url, job_id)\n",
+    "    \n",
+    "    def store_job_id(f, url, job_id):\n",
+    "        '''Writes the result of an archive operation to a csv file (f) and the complete_urls dict'''\n",
+    "        time = datetime.now()\n",
+    "        f.writerow([time, url, job_id])\n",
+    "        completed_urls[url] = job_id\n",
+    "    \n",
+    "    def filter_link_urls(query_url,\n",
+    "                         urls,\n",
+    "                    remove_cache=True):\n",
+    "        '''\n",
+    "        Takes link urls and filters them in three ways:\n",
+    "        1. (Optionally) Ignores urls from the two caches:\n",
+    "        webcache.googleusercontent.com\n",
+    "        https://cc.bingj\n",
+    "        2. Filters out those which are in the completed_urls dictionary\n",
+    "        3. (Optionally) Identifies URLs which have the same domain as the query URL.\n",
+    "        Checks the skipped_urls list to see if the URL already appears there. If so, we assume\n",
+    "        that we want it archived and move it from skipped to the to_archive list\n",
+    "        '''\n",
+    "        to_archive = []\n",
+    "        if ignore_self_links:\n",
+    "            domain = get_domain(query_url)\n",
+    "        else:\n",
+    "            domain = None\n",
+    "        cache_regex = r'https://webcache.googleusercontent.com|https://cc.bingj.com'\n",
+    "        for url in urls:\n",
+    "            if url in completed_urls:\n",
+    "                continue\n",
+    "\n",
+    "            if remove_cache == True:\n",
+    "                if re.match(cache_regex, url):\n",
+    "                    continue\n",
+    "\n",
+    "            if ignore_self_links and re.match(f'https?://\\w*\\.?{domain}', url):\n",
+    "                # If it matches, check if it's in skipped URLs\n",
+    "                # If so, remove it from there, and add it to the to_archive list\n",
+    "                if url in skipped_urls:\n",
+    "                    to_archive.append(url)\n",
+    "                    skipped_urls.remove(url)\n",
+    "                # Else, add it to the skipped urls (and skip it)\n",
+    "                else:\n",
+    "                    skipped_urls.append(url)\n",
+    "            else:\n",
+    "                to_archive.append(url)\n",
+    "        return to_archive\n",
+    "   \n",
+    "    \n",
+    "    completed_urls = dict_from_csv(output_file)\n",
+    "    skipped_urls = []\n",
+    "    attempts = 0\n",
+    "    incomplete_files = list(files)\n",
+    "    while len(incomplete_files) > 0:\n",
+    "        if attempts == 3:\n",
+    "            break\n",
+    "        for fn in incomplete_files:\n",
+    "            try:\n",
+    "                archive_urls(fn)\n",
+    "                incomplete_files.pop(incomplete_files.index(fn)) # if it works, remove it from the list\n",
+    "            except ConnectionError:\n",
+    "                failed_files.append(fn)\n",
+    "        attempts += 1\n",
+    "        logging.warn('Files that failed: {}'.format(incomplete_files))\n",
+    "        time.sleep(30) # If something goes wrong, wait to see if it gets better :)\n",
+    "\n",
+    "\n",
+    "def dict_from_csv(csv_file):\n",
+    "    result = {}\n",
+    "    if pathlib.Path(csv_file).exists():\n",
+    "        with open(csv_file, 'r') as fn:\n",
+    "            f = csv.reader(fn)\n",
+    "            for row in f:\n",
+    "                result[row[0]] = row[1]\n",
+    "    return result \n",
+    "\n",
+    "    \n",
+    "def get_domain(url):\n",
+    "    domain = re.search('^https://www.(\\w+\\.\\w+)', url).groups()[0]\n",
+    "    if not domain:\n",
+    "        raise ValueError(\"Can't find URL in {url}\")\n",
+    "    return domain\n",
+    "\n",
+    "\n",
+    "\n",
+    "def get_urls_from_json(j_obj):\n",
+    "    '''Takes a JSON object and extracts the correct URLs; returns them in a list.'''\n",
+    "    query_url = urlencode_url(j_obj['link'])\n",
+    "    link_urls = []\n",
+    "    \n",
+    "    for x in j_obj['linkElements']:\n",
+    "        url = x['href']\n",
+    "        if re.match('javascript', url) or url == '':\n",
+    "            continue\n",
+    "        link_urls.append(urlencode_url(url))\n",
+    "    return (query_url, link_urls)\n",
+    "    \n",
+    "def urlencode_url(url):\n",
+    "    return requests.utils.requote_uri(urllib.parse.unquote_plus(url))\n",
+    "\n",
+    "def archive_url(url, \n",
+    "                wait = 2,         \n",
+    "                capture_outlinks = 0 # Whether to capture outlinks (default is no)\n",
+    "                ):\n",
+    "\n",
+    "\n",
+    "\n",
+    "    payload = {'url': url,\n",
+    "              'if_not_archived_within' : IF_NOT_ARCHIVED_WITHIN,\n",
+    "              #'capture_screenshot': capture_screenshot,\n",
+    "              'capture_outlinks': capture_outlinks\n",
+    "              }\n",
+    "    r = requests.post(ENDPT, headers=HEADERS, data=payload)\n",
+    "    logging.debug(r.content)\n",
+    "    print(f'Should have a valid job id for {url}. Instead, this was returned:\\n {r.content}')\n",
+    "\n",
+    "    if r.status_code == 429:\n",
+    "        logging.info(f'Hit rate limit, now waiting for {wait:.2f} seconds')\n",
+    "        time.sleep(wait)\n",
+    "        return archive_url(url = url,\n",
+    "                           wait = wait * 1.2, \n",
+    "                           capture_outlinks = capture_outlinks)\n",
+    "    if r.status_code in [104,502,503,504,443,401]:\n",
+    "        logging.warning(url)\n",
+    "        logging.warning(r.text)\n",
+    "        if r.status_code in [104, 401, 443]:\n",
+    "            logging.warning(f'104, 401, or 443 received when archiving {url}. Giving up.')\n",
+    "            return None\n",
+    "        logging.warning('502 or 503 or 504 status received; waiting 30 seconds')\n",
+    "        time.sleep(30)\n",
+    "        return archive_url(url = url,\n",
+    "                           capture_outlinks = capture_outlinks)\n",
+    "                          \n",
+    "    r.raise_for_status()\n",
+    "    try:\n",
+    "        return r.json()['job_id']\n",
+    "    except KeyError:\n",
+    "        logging.warning(f'Should have a valid job id for {url}. Instead, this was returned:\\n {r.content}')\n",
+    "\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "py3",
+   "language": "python",
+   "name": "py3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/search_engine_results/wayback_urls/add_wayback_urls.py b/search_engine_results/wayback_urls/add_wayback_urls.py
new file mode 100644
index 0000000..4e35d4b
--- /dev/null
+++ b/search_engine_results/wayback_urls/add_wayback_urls.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+import pathlib
+import json
+import re
+import time
+import argparse
+import logging
+import csv
+import requests
+import urllib
+
+
+
+def main():
+    '''Parses the command line and adds wayback URLs to every JSON file under the input directory.'''
+    parser = argparse.ArgumentParser(description='Add wayback URLs to SERP metadata.')
+    # -i and -w are required: both end up in pathlib.Path(), which raises a TypeError for None
+    parser.add_argument('-i', required=True, help='Input directory with metadata files')
+    parser.add_argument('-w', required=True, help = 'Location of file with wayback URLS')
+    parser.add_argument('-o', help='Directory to save modified files (if blank or same as input directory, will overwrite)')
+    args = parser.parse_args()
+
+    wayback_dict = load_wayback_dict(args.w)
+    # List the files up front so the lazy glob cannot pick up files we write while iterating
+    files = list(pathlib.Path(args.i).glob('**/*.json'))
+    for fn in files:
+        write_wayback_to_file(fn, args.o, wayback_dict)
+
+
+def write_wayback_to_file(filename, out_dir, wayback_dict):
+    '''Adds a 'wayback_url' entry to the query and to each link element of the SERP metadata
+    file filename, looked up in wayback_dict by url-encoded URL ('' when there is no entry),
+    and writes the result to the path that get_out_path chooses for out_dir.'''
+    with open(filename, 'r') as f:
+        j_obj = json.load(f)
+    query_url = urlencode_url(j_obj['link'])
+    try:
+        j_obj['wayback_url'] = wayback_dict[query_url]
+    except KeyError:
+        logging.error(f"Should have an entry for {query_url}")
+        logging.error(wayback_dict.keys())
+        j_obj['wayback_url'] = ''
+    for link_obj in j_obj['linkElements']:
+        link_url = urlencode_url(link_obj['href'])
+        # No special case for an empty href: it is handled like any other URL that has
+        # no entry in wayback_dict, so it normally ends up with an empty wayback_url
+        if link_url not in wayback_dict:
+            logging.info(f'No WB URL for {link_url}')
+        link_obj['wayback_url'] = wayback_dict.get(link_url, '')
+    outfile = get_out_path(filename, out_dir)
+    with open(outfile, 'w') as f:
+        json.dump(j_obj, f)
+
+
+def get_out_path(fp, out_dir):
+    '''Returns the path to write to: fp itself when out_dir is None, otherwise out_dir joined
+    with the last two parts of fp (its directory and file name), creating that directory.'''
+    if out_dir is None:
+        return fp
+    new_path = pathlib.Path(out_dir).joinpath(*fp.parts[-2:])
+    if not new_path.parent.exists():
+        logging.warning(f"Creating new path at {new_path}")
+        new_path.parent.mkdir(parents = True)
+    return new_path
+
+def load_wayback_dict(fn):
+    '''Loads the wayback URL file (rows of timestamp, orig_url, wb_url) as a dictionary of
+    {orig_url: wb_url}. Ignores the timestamp; rows further down the file overwrite earlier
+    ones. Returns an empty dictionary (and logs a warning) if the file does not exist.'''
+    if not pathlib.Path(fn).exists():
+        logging.warning(f'Wayback URL file {fn} does not exist; no wayback URLs will be added')
+        return {}
+    with open(fn, 'r', newline = '') as f_obj: # newline='' as the csv module recommends
+        # Rows with fewer than three columns (e.g. blank lines) are skipped
+        return {row[1]: row[2] for row in csv.reader(f_obj) if len(row) >= 3}
+
+
+def urlencode_url(url): # Decodes the URL ('+' becomes a space), then re-quotes it; the result is the key into wayback_dict
+    return requests.utils.requote_uri(urllib.parse.unquote_plus(url))
+
+if __name__ == '__main__': # Run only when executed as a script, not when imported
+    logging.basicConfig(level=logging.INFO) # Show the INFO messages logged by this script
+    main()
+
diff --git a/search_engine_results/wayback_urls/archive_urls.py b/search_engine_results/wayback_urls/archive_urls.py
new file mode 100644
index 0000000..b0bacc6
--- /dev/null
+++ b/search_engine_results/wayback_urls/archive_urls.py
@@ -0,0 +1,212 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+import pathlib
+import json
+import re
+import requests
+import time
+import config
+import argparse
+import logging
+import urllib.parse
+import csv
+import datetime
+
+
+ENDPT = 'https://web.archive.org/save/' # Archive calls are POSTed here; job status is read from ENDPT + 'status/' + job id
+UA_STRING = config.UA_STRING # The user agent and the keys come from the local config module
+ACCESS_KEY = config.ACCESS_KEY
+SECRET_KEY = config.SECRET_KEY
+HEADERS = {'Accept':'application/json', # Sent with every call to the API
+           'User-Agent': UA_STRING,
+           'Authorization': f'LOW {ACCESS_KEY}:{SECRET_KEY}'}
+IF_NOT_ARCHIVED_WITHIN = '20h' # If an archive has been made within this long, don't make another one (get_completed assumes hours)
+CHUNK_SIZE = 15 # Get this many URLS at a time
+
+
+
+def main():
+    '''Archives the URLs listed in the to_archive CSV file (-i) and appends a row of
+    (timestamp, url, wayback_url) to the wayback URL file (-o) for each successful capture.'''
+    parser = argparse.ArgumentParser(description='Creates job ids from to_archive csv file')
+    parser.add_argument('-i', required=True, help='Location of to_archive CSV file')
+    parser.add_argument('-o', required=True, help='Location to save wayback URL file')
+    args = parser.parse_args()
+
+    def get_job_ids(urls, capture_outlinks):
+        '''Starts an archive job for each url not in completed_urls; returns (url, job_id) tuples'''
+        job_id_tuples = []
+        for url in urls:
+            if url in completed_urls:
+                logging.info(f'{url} was in completed')
+                continue
+            job_id = archive_url(url, capture_outlinks = capture_outlinks)
+            job_id_tuples.append((url, job_id))
+        return job_id_tuples
+
+    def get_wayback_urls(out, job_id_tuples):
+        '''Waits for each job to finish and writes its wayback URL with the csv writer out'''
+        for url, job_id in job_id_tuples:
+            # requests' ConnectionError does not derive from the builtin one, so name both
+            try:
+                wb_result = get_wayback_url(job_id)
+            except (ConnectionError, requests.exceptions.ConnectionError):
+                logging.warning(f'{url} with job id {job_id} failed with a ConnectionError')
+                continue
+            if wb_result is None: # get_wayback_url could not produce a wayback URL
+                logging.warning(f'{url} with job id {job_id} did not get a WB URL')
+                continue
+            wb_url, timestamp = wb_result
+            write_wayback(out, url, wb_url, timestamp)
+
+    completed_urls = get_completed(args.o, time_string = IF_NOT_ARCHIVED_WITHIN)
+    to_archive = load_urls(args.i)
+    # Do query URLS first, since for them we'll capture outlinks
+    query_urls = [x for x in to_archive if to_archive[x] == 'query']
+    link_urls = [x for x in to_archive if to_archive[x] == 'link']
+
+    with open(args.o, 'a', newline = '') as out_file:
+        out = csv.writer(out_file)
+        get_wayback_urls(out, get_job_ids(query_urls, capture_outlinks = 1))
+        for chunk in chunk_list(link_urls, CHUNK_SIZE):
+            out_file.flush() # Keep what is already done on disk if a later chunk crashes
+            get_wayback_urls(out, get_job_ids(chunk, capture_outlinks = 0))
+
+
+def chunk_list(l, size): # Generator: yields l in consecutive slices of at most size items
+    for i in range(0, len(l), size):
+        logging.info(f'Now getting items {i} through {min(len(l), i + size)} of {len(l)}') # Progress message for each slice
+        yield l[i:i+size]
+
+def load_urls(url_fn):
+    '''Reads the to_archive CSV file and returns a dictionary that maps the first column of
+    each row to the second one; main() reads the values as 'query' or 'link'.'''
+    with open(url_fn, 'r') as csv_file:
+        rows = csv.reader(csv_file)
+        url_types = {row[0]: row[1] for row in rows}
+    return url_types
+
+
+def write_wayback(f, url, wayback_url, timestamp):
+    '''Writes one row to the CSV writer object f, in the column order
+    timestamp, url, wayback_url'''
+    f.writerow((timestamp, url, wayback_url))
+
+
+
+def get_completed(csv_file, time_string):
+    '''Loads the completed URLs from the csv file as a dictionary of {url: wayback_url}.
+    Takes in a time string like '20h', strips the last character, and assumes that it refers
+    to the number of hours. Does not load any URLs older than that.'''
+    max_age = datetime.timedelta(hours = int(time_string[:-1]))
+    result = {}
+    if not pathlib.Path(csv_file).exists():
+        return result
+    # Wayback timestamps are in UTC, so compare them with the current time in UTC (both naive)
+    now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo = None)
+    with open(csv_file, 'r', newline = '') as fn:
+        for row in csv.reader(fn):
+            if len(row) < 3:
+                continue # Skip blank or truncated rows
+            if now - datetime.datetime.strptime(row[0], '%Y%m%d%H%M%S') <= max_age:
+                result[row[1]] = row[2]
+    return result
+
+def urlencode_url(url): # Decodes the URL ('+' becomes a space), then re-quotes it with requests' requote_uri
+    return requests.utils.requote_uri(urllib.parse.unquote_plus(url))
+
+def archive_url(url,
+                wait = 2,
+                capture_outlinks = 0 # Whether to capture outlinks (default is no)
+                ):
+    '''Asks the Wayback Machine to archive url and returns the job id (None if none was obtained).
+    Retries after a 429 (sleeping wait seconds, 20% longer each time) and after a 502/503/504
+    (sleeping 30 seconds); gives up on 104/401/443 and raises for any other HTTP error status.'''
+    payload = {'url': url,
+              'if_not_archived_within' : IF_NOT_ARCHIVED_WITHIN,
+              'capture_outlinks': capture_outlinks
+              }
+    while True: # Retry in a loop, so that a long outage cannot exhaust the recursion limit
+        logging.info(f'Sending archive call for {url}')
+        r = requests.post(ENDPT, headers=HEADERS, data=payload)
+        logging.debug(r.content)
+        if r.status_code == 429:
+            logging.info(f'Hit rate limit, now waiting for {wait:.2f} seconds')
+            time.sleep(wait)
+            wait = wait * 1.2
+        elif r.status_code in [104,502,503,504,443,401]:
+            logging.warning(url)
+            logging.warning(r.text)
+            if r.status_code in [104, 401, 443]:
+                logging.warning(f'104, 401, or 443 received when archiving {url}. Giving up.')
+                return None
+            logging.warning('502 or 503 or 504 status received; waiting 30 seconds')
+            time.sleep(30)
+            wait = 2 # The rate limit backoff starts over after a server error
+        else:
+            break
+    r.raise_for_status()
+    try:
+        return r.json()['job_id']
+    except (KeyError, ValueError): # No job id in the response, or the body is not JSON
+        logging.warning(f'Should have a valid job id for {url}. Instead, this was returned:\n {r.content}')
+        return None
+
+
+def get_wayback_url(job_id):
+    '''Returns (wayback_url, timestamp) for the archive job job_id, or None if the job did not succeed'''
+    def call_status_url(wait = 2, max_wait = 9, errors_left = 3):
+        '''Helper function to handle the call to the status API. wait is the initial wait time; a
+        pending job is dropped once wait passes max_wait, a failing server after errors_left errors'''
+        if job_id is None:
+            return None
+        s = requests.get(ENDPT + 'status/' + job_id, headers=HEADERS)
+        if s.status_code == 200:
+            s_json = s.json()
+            if s_json['status'] == 'pending':
+                if wait > max_wait:
+                    logging.debug(s_json)
+                    logging.warning(f"The call to get the status of job id {job_id} failed. Skipping")
+                    return None
+                logging.info(f'Pending, now waiting for {wait:.2f} seconds')
+                time.sleep(wait)
+                return call_status_url(wait = wait + 1, errors_left = errors_left)
+            if s_json['status'] == 'success':
+                return s_json
+            if s_json['status'] == 'error':
+                logging.error('Could not get status, with error: {}'.format(s_json["message"]))
+                return None
+            else:
+                logging.warning(s_json)
+                raise ValueError("Status was unexpected")
+        if s.status_code == 429:
+            logging.info(f'Hit rate limit, now waiting for {wait} seconds')
+            time.sleep(wait)
+            return call_status_url(wait = wait * 1.2, errors_left = errors_left) # Backoff
+        if s.status_code in [104,502,503,504,443,401]:
+            # These likely mean something's wrong; only try a few times
+            if errors_left <= 1:
+                logging.warning(f'Too many error responses for job id {job_id}. Skipping')
+                return None
+            logging.warning('443, 502, 503, or 504 status received; waiting 30 seconds')
+            logging.warning(s.text)
+            time.sleep(30)
+            return call_status_url(errors_left = errors_left - 1)
+        s.raise_for_status()
+
+    logging.info(f"Getting wayback URL for job id {job_id}")
+    s_json = call_status_url()
+    if s_json is None:
+        return None
+    try:
+        wayback_url = 'http://web.archive.org/web/{}/{}'.format(s_json['timestamp'],
+                                                         s_json['original_url'])
+        return (wayback_url, s_json['timestamp'])
+    except KeyError:
+        logging.error(f"Missing timestamp or original URL for {job_id}")
+        return None
+
+if __name__ == '__main__': # Run only when executed as a script, not when imported
+    logging.basicConfig(level=logging.INFO) # Show the INFO messages logged by this script
+    main()
diff --git a/search_engine_results/wayback_urls/get_urls.py b/search_engine_results/wayback_urls/get_urls.py
new file mode 100644
index 0000000..6806aea
--- /dev/null
+++ b/search_engine_results/wayback_urls/get_urls.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+import pathlib
+import json
+import re
+import requests
+import time
+import config
+import argparse
+import logging
+import urllib.parse
+import csv
+import datetime
+
+
+def main():
+    '''Parse the command line, find every *.json file under -i, and write their URLs to -o.'''
+    parser = argparse.ArgumentParser(description='Gets URLs to archive from SERP metadata files')
+    # Both paths are required so a missing one is reported by argparse, not as a later TypeError
+    parser.add_argument('-i', required=True, help='Input directory with metadata files')
+    parser.add_argument('-o', required=True, help='Location to save URL list')
+    ## TODO: Maybe switch this so default is to ignore?
+    parser.add_argument('--ignore_self_links', help='Whether to ignore links from the same domain as the query',
+            action='store_true')
+
+    args = parser.parse_args()
+
+    # Recursively collect the metadata files to read (glob yields them lazily)
+    files = pathlib.Path(args.i).glob('**/*.json')
+    get_urls_from_files(files, args.o, args.ignore_self_links)
+
+def get_urls_from_files(files,
+        output_file,
+        ignore_self_links,
+        remove_cache = True):
+    '''Collect the query and link URLs from each JSON file in files and write them to output_file.'''
+    def get_urls(fn):
+        '''Loads one metadata file and returns (query_url, link_urls), where link_urls
+        has already been passed through filter_link_urls'''
+        with open(fn, 'r') as f:
+            j_obj = json.load(f)
+            # Get the URLs from the file
+            query_url, link_urls = get_urls_from_json(j_obj)
+            # Filter out the self links and search engine cache urls
+            link_urls = filter_link_urls(query_url, link_urls)
+            return (query_url, link_urls)
+
+    def filter_link_urls(query_url,
+                         urls):
+        '''
+        Takes link urls and filters them in four ways:
+        1. (Optionally) Ignores urls from the two caches:
+        webcache.googleusercontent.com
+        https://cc.bingj
+        2. Filters out those which are already in the to_archive dictionary
+        3. (Optionally) Identifies URLs which have the same domain as the query URL.
+        Checks the skipped_urls set to see if the URL already appears there. If so, we assume
+        that we want it archived and move it from skipped to the returned set
+        4. Filters out URLs that appear more than once in this list
+        '''
+        if ignore_self_links:
+            domain = get_domain(query_url)
+        else:
+            domain = None
+        cache_regex = r'https://webcache.googleusercontent.com|https://cc.bingj.com'
+
+        result = set()
+        for url in urls:
+            if url in to_archive:
+                continue
+
+            # Search engine cache pages are dropped unless the caller turned remove_cache off
+            if remove_cache and re.match(cache_regex, url):
+                continue
+
+            if ignore_self_links and re.match(rf'https?://\w*\.?{re.escape(domain)}', url):
+                # Self link: the domain is escaped so its dots only match literal dots.
+                # A second sighting of a skipped URL promotes it into the result
+                if url in skipped_urls:
+                    result.add(url)
+                    skipped_urls.remove(url)
+                # Else, add it to the skipped urls (and skip it)
+                else:
+                    skipped_urls.add(url)
+            else:
+                result.add(url)
+        return result
+
+    skipped_urls = set()
+    to_archive = {}
+    for fn in files:
+        q_url, link_urls = get_urls(fn)
+        to_archive[q_url] = 'query'
+        for url in link_urls:
+            # Prioritize query urls - if it's already there,
+            # then don't overwrite
+            if url not in to_archive:
+                to_archive[url] = 'link'
+    write_urls(to_archive, output_file)
+
+
+def write_urls(url_dict, fn):
+    '''Write url_dict to the CSV file fn (overwriting it), one "url,link_type" row per item.'''
+    # newline='' leaves line endings to the csv module, as the csv documentation requires
+    with open(fn, 'w', newline='') as out_file:
+        csv.writer(out_file).writerows(url_dict.items())
+
+
+def get_domain(url):  # Returns the 'name.tld' that directly follows 'https://www.' in url
+    match = re.search(r'^https://www\.(\w+\.\w+)', url)
+    if not match:  # re.search returns None on no match, so test before calling .groups()
+        raise ValueError(f"Can't find URL in {url}")
+    return match.groups()[0]
+
+
+def get_urls_from_json(j_obj):
+    '''Pull the query URL and the set of usable link URLs out of one SERP JSON object.'''
+    encoded_query = urlencode_url(j_obj['link'])
+    # Empty hrefs and javascript pseudo-links are dropped; everything else is re-encoded
+    hrefs = (element['href'] for element in j_obj['linkElements'])
+    encoded_links = {
+        urlencode_url(href)
+        for href in hrefs
+        if href != '' and not re.match('javascript', href)
+    }
+    return (encoded_query, encoded_links)
+
+
+def urlencode_url(url):  # Undo any existing quoting (unquote_plus), then have requests re-quote the URI
+    return requests.utils.requote_uri(urllib.parse.unquote_plus(url))
+
+
+if __name__ == '__main__':  # Run only when executed as a script, not when imported
+    main()
diff --git a/search_engine_results/wayback_urls/wayback_urls.csv b/search_engine_results/wayback_urls/wayback_urls.csv
new file mode 100644
index 0000000..e69de29

From 86e4b8ecfce30211f78c30da17fae0c5e6882714 Mon Sep 17 00:00:00 2001
From: Jeremy Foote <jdfoote1@gmail.com>
Date: Tue, 28 Apr 2020 15:26:08 -0400
Subject: [PATCH 3/3] Tweaking a few things based on test run on kibo

---
 .../wayback_urls/archive_urls.py              | 32 +++++++++++++------
 .../wayback_urls/get_urls.py                  |  2 +-
 2 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/search_engine_results/wayback_urls/archive_urls.py b/search_engine_results/wayback_urls/archive_urls.py
index b0bacc6..3870a36 100644
--- a/search_engine_results/wayback_urls/archive_urls.py
+++ b/search_engine_results/wayback_urls/archive_urls.py
@@ -12,6 +12,7 @@
 import urllib.parse
 import csv
 import datetime
+from requests.exceptions import HTTPError
 
 
 ENDPT = 'https://web.archive.org/save/'
@@ -42,7 +43,7 @@ def get_job_ids(urls, capture_outlinks):
                 # Just put them into the job_id_tuples
                 job_id_tuples.append((url, job_id))
             else:
-                print(f'{url} was in completed')
+                logging.debug(f'{url} was in completed')
 
 
 
@@ -55,6 +56,10 @@ def get_wayback_urls(out_file):
                 logging.warning(f'{url} with job id {job_id} failed with a ConnectionError')
             except TypeError:
                 logging.warning(f'{url} with job id {job_id} did not get a WB URL')
+            except HTTPError as e:
+                logging.warning(f'{url} with job id {job_id} failed with an uncaught HTTP Error: {e}')
+            except Exception as e:
+                logging.warning(f'{url} with job id {job_id} failed with an uncaught Exception: {e}')
 
 
     completed_urls = get_completed(args.o, time_string = IF_NOT_ARCHIVED_WITHIN)
@@ -65,9 +70,13 @@ def get_wayback_urls(out_file):
 
     with open(args.o, 'a') as out_file:
         out = csv.writer(out_file)
-        job_id_tuples = [] # Stores which urls need to be retrieved (populated by get_job_ids)
-        get_job_ids(query_urls, capture_outlinks = 1)
-        get_wayback_urls(out)
+        logging.info("Now retrieving query urls")
+        for q_chunk in chunk_list(query_urls, CHUNK_SIZE):
+            job_id_tuples = [] # Stores which urls need to be retrieved (populated by get_job_ids)
+            get_job_ids(q_chunk, capture_outlinks = 1)
+            get_wayback_urls(out)
+
+        logging.info("Now retrieving link urls")
         for chunk in chunk_list(link_urls, CHUNK_SIZE):
             job_id_tuples = []
             get_job_ids(chunk, capture_outlinks = 0)
@@ -117,7 +126,7 @@ def urlencode_url(url):
     return requests.utils.requote_uri(urllib.parse.unquote_plus(url))
 
 def archive_url(url,
-                wait = 2,
+                wait = 6,
                 capture_outlinks = 0 # Whether to capture outlinks (default is no)
                 ):
 
@@ -136,7 +145,7 @@ def archive_url(url,
         return archive_url(url = url,
                            wait = wait * 1.2, 
                            capture_outlinks = capture_outlinks)
-    if r.status_code in [104,502,503,504,443,401]:
+    if r.status_code in [104,401,404,443,502,503,504]:
         logging.warning(url)
         logging.warning(r.text)
         if r.status_code in [104, 401, 443]:
@@ -157,8 +166,8 @@ def archive_url(url,
 def get_wayback_url(job_id):
 
     def call_status_url(
-                         wait = 2, # Initial wait time
-                         max_wait = 9 # Stop when wait time between calls hits max_wait
+                         wait = 6, # Initial wait time
+                         max_wait = 12 # Stop when wait time between calls hits max_wait
                         ):
         '''Helper function to handle the call to the status API'''
         if job_id is None:
@@ -186,8 +195,11 @@ def call_status_url(
             logging.info(f'Hit rate limit, now waiting for {wait} seconds')
             time.sleep(wait)
             return call_status_url(wait = wait * 1.2) # Backoff
-        if s.status_code in [104,502,503,504,443,401]:
-            # These likely mean something's wrong; only try a few times
+        if s.status_code in [104,401,404,443,502,503,504]:
+            # These likely mean something's wrong; wait and then try again
+            if s.status_code in [104, 401, 443]:
+                logging.warning(f'104, 401, or 443 received when checking job id {job_id}. Giving up.')
+                return None
             logging.warning('443, 502, 503, or 504 status received; waiting 30 seconds')
             logging.warning(s.text)
             time.sleep(30)
diff --git a/search_engine_results/wayback_urls/get_urls.py b/search_engine_results/wayback_urls/get_urls.py
index 6806aea..fdf27d7 100644
--- a/search_engine_results/wayback_urls/get_urls.py
+++ b/search_engine_results/wayback_urls/get_urls.py
@@ -63,7 +63,7 @@ def filter_link_urls(query_url,
             domain = get_domain(query_url)
         else:
             domain = None
-        cache_regex = r'https://webcache.googleusercontent.com|https://cc.bingj.com'
+        cache_regex = r'https?://webcache.googleusercontent.com|https?://cc.bingj.com'
 
         result = set()
         for url in urls: