diff --git a/CMakeLists.txt b/CMakeLists.txt
index b65c8a1e2..8ee92a604 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -69,10 +69,9 @@ add_definitions(-DFAASM_PROFILE_ON=0)
 # Custom LLVM build (also for profiling)
 set(FAASM_CUSTOM_LLVM 0)
 if (${FAASM_CUSTOM_LLVM})
-    message(STATUS "Using custom LLVM")
-
-    # Force using custom build of LLVM for codegen
-    set(LLVM_DIR $ENV{HOME}/faasm/llvm/build/lib/cmake/llvm)
+    # In accordance with bin/build_llvm_perf.sh and LLVM version for WAVM
+    set(LLVM_DIR /usr/local/code/llvm-perf/build/lib/cmake/llvm)
+    message(STATUS "Using custom LLVM at ${LLVM_DIR} for profiling")
     find_package(LLVM REQUIRED CONFIG NO_DEFAULT_PATH)
 
     set(WAVM_PERF_LIBS 1)
diff --git a/ansible/musl_native.yml b/ansible/musl_native.yml
new file mode 100644
index 000000000..2897a9238
--- /dev/null
+++ b/ansible/musl_native.yml
@@ -0,0 +1,6 @@
+---
+# Playbook: applies the musl_native role on localhost
+- hosts: localhost
+  gather_facts: yes
+  roles:
+    - musl_native
diff --git a/ansible/roles/musl_native/tasks/main.yml b/ansible/roles/musl_native/tasks/main.yml
new file mode 100644
index 000000000..01087c52a
--- /dev/null
+++ b/ansible/roles/musl_native/tasks/main.yml
@@ -0,0 +1,26 @@
+---
+# Clones musl v1.1.24, builds it with clang and installs it under /usr/local/faasm/musl_native
+- name: "Clone musl"
+  git:
+    repo: 'git://git.musl-libc.org/musl'
+    dest: /usr/local/code/musl_native
+    version: v1.1.24
+
+- name: "Build"
+  shell: "{{ item }}"
+  args:
+    chdir: "/usr/local/code/musl_native"
+  with_items:
+    - "CC=/usr/bin/clang ./configure --prefix=/usr/local/faasm/musl_native"
+    - "make -j 10"
+
+- name: "Install"
+  become: yes  # privilege escalation is handled here, so the commands below need no sudo
+  shell: "{{ item }}"
+  args:
+    chdir: "/usr/local/code/musl_native"
+  with_items:
+    - "mkdir -p /usr/local/faasm/musl_native"
+    - "make install"
+
+
diff --git a/bin/build_llvm_perf.sh b/bin/build_llvm_perf.sh
index fda8d0e9e..aa82317d4 100755
--- a/bin/build_llvm_perf.sh
+++ b/bin/build_llvm_perf.sh
@@ -1,12 +1,17 @@
 #!/bin/bash
 set -e
+set -x
 
-FAASM_DIR=${HOME}/faasm
-LLVM_DIR=${FAASM_DIR}/llvm
+CODE_DIR=/usr/local/code/
+if [[ ! -d ${CODE_DIR} ]]; then
+  echo "CODE_DIR (${CODE_DIR}) does not exists"
+  exit 1
+fi
+
+LLVM_DIR=${CODE_DIR}/llvm-perf
 BUILD_DIR=${LLVM_DIR}/build
 
-# Clone LLVM repo if not done already
-mkdir -p ${FAASM_DIR}
+# Clone LLVM-9 repo if not done already. Version must be compatible with WAVM
 if [[ ! -d "$LLVM_DIR" ]]; then
     git clone https://github.com/llvm/llvm-project ${LLVM_DIR}
     pushd ${LLVM_DIR}
@@ -14,17 +19,20 @@ if [[ ! -d "$LLVM_DIR" ]]; then
     popd
 fi
 
-# Clear build dir
-# rm -rf ${BUILD_DIR}
-
-# Make the build dir if it exists
+# Make the build dir
 mkdir -p ${BUILD_DIR}
-
 pushd ${BUILD_DIR}
 
+# Clear build dir
+if [[ -f "${BUILD_DIR}/build.ninja" ]]; then
+  ninja clean
+fi
+
 # Enable JIT profiling features of LLVM
 cmake \
 	-G Ninja \
+	-DCMAKE_C_COMPILER=/usr/bin/clang \
+	-DCMAKE_CXX_COMPILER=/usr/bin/clang++ \
 	-DCMAKE_BUILD_TYPE=RelWithDebInfo \
 	-DLLVM_USE_PERF=1 \
 	-DLLVM_USE_INTEL_JITEVENTS=1 \
diff --git a/docker/omp-musl.dockerfile b/docker/omp-musl.dockerfile
new file mode 100644
index 000000000..f0f6c3478
--- /dev/null
+++ b/docker/omp-musl.dockerfile
@@ -0,0 +1,13 @@
+FROM alpine
+
+# --no-cache fetches the package index on the fly and does not store it in the image layer
+RUN apk add --no-cache g++ libgomp make
+
+# Compile omp programs
+COPY func/omp /build
+WORKDIR /build
+RUN make
+
+# clean up
+RUN rm -rf /build
+WORKDIR /
diff --git a/docs/perf-diff/explaination.md b/docs/perf-diff/explaination.md
new file mode 100644
index 000000000..9c6d6238d
--- /dev/null
+++ b/docs/perf-diff/explaination.md
@@ -0,0 +1,142 @@
+# Perf shows that the loops are doing the same
+
+## Log function
+
+Experiment code [here](../../func/demo/log_difference.cpp).
+
+### Perf stat
+
+#### Native
+
+```
+ Performance counter stats for 'speed_difference':
+
+      14530.607857      task-clock (msec)         #    1.000 CPUs utilized
+                 7      context-switches          #    0.000 K/sec
+                 0      cpu-migrations            #    0.000 K/sec
+               589      page-faults               #    0.041 K/sec
+    46,636,539,230      cycles                    #    3.210 GHz
+   114,771,451,699      instructions              #    2.46  insn per cycle
+    13,208,253,517      branches                  #  908.995 M/sec
+           196,535      branch-misses             #    0.00% of all branches
+
+      14.531197506 seconds time elapsed
+```
+
+#### Wasm
+
+```
+ Performance counter stats for 'simple_runner demo speed_difference':
+
+       4852.352536      task-clock (msec)         #    1.000 CPUs utilized
+                 3      context-switches          #    0.001 K/sec
+                 0      cpu-migrations            #    0.000 K/sec
+             5,611      page-faults               #    0.001 M/sec
+    14,531,719,357      cycles                    #    2.995 GHz
+    35,368,260,394      instructions              #    2.43  insn per cycle
+     4,120,953,595      branches                  #  849.269 M/sec
+         5,078,676      branch-misses             #    0.12% of all branches
+
+       4.852843140 seconds time elapsed
+```
+
+### Perf report
+
+Symbol table:
+```
+functionDef35       log
+functionDef2        __original_main
+```
+
+The global time spent in each section is very similar:
+![Time division](time-division.png)
+
+The loop bodies (pictures below) look quite similar -- unless the `unknown/unknown:XX` entries are something to worry about. The Wasm code
+fiddles a lot with the MM registers before the loop. Manually compiling the native code with `clang++ -O3 -mavx2 -S` for comparison led
+to the same code as shown below.
+
+![main native](main-native.png)
+
+![main Wasm](main-wasm.png)
+
+I think this is not a good experiment for a general-purpose speed comparison between Wasm and native because it seems that
+the only difference here is that musl's `log` is faster than glibc's `log`. The `log` function run in Wasm
+is a lot smaller than the glibc one, which is the only explanation I have for the speed difference.
+
+This experiment is, however, useful for understanding our second experiment: a log function takes 25% of
+the run time of the native Mersenne Twister engine experiment, but there is no sign of `log` in the Wasm version,
+which nevertheless produces the same result. I can only think the log was somehow optimised away?
+
+### Confirmation
+
+Compiling similar code at `-O3` natively but changing libc implementations gives the following:
+* glibc: 14.2
+* musl: 6.3
+
+This confirms our hypothesis that the musl `log` function is faster.
+
+## Mersenne Twister Engine
+
+Experiment code [here](../../func/demo/mt_difference.cpp).
+
+### Perf stats
+
+#### Native
+
+
+```
+       9170.419967      task-clock (msec)         #    1.000 CPUs utilized
+                 5      context-switches          #    0.001 K/sec
+                 0      cpu-migrations            #    0.000 K/sec
+               588      page-faults               #    0.064 K/sec
+    27,822,779,162      cycles                    #    3.034 GHz
+    17,508,333,503      instructions              #    0.63  insn per cycle
+     2,560,064,376      branches                  #  279.165 M/sec
+         1,122,999      branch-misses             #    0.04% of all branches
+
+       9.171005174 seconds time elapsed
+
+```
+
+#### Wasm
+
+```
+
+
+       1317.060034      task-clock (msec)         #    1.000 CPUs utilized
+                 1      context-switches          #    0.001 K/sec
+                 0      cpu-migrations            #    0.000 K/sec
+             5,610      page-faults               #    0.004 M/sec
+     2,992,325,962      cycles                    #    2.272 GHz
+     6,657,716,630      instructions              #    2.22  insn per cycle
+       218,836,121      branches                  #  166.155 M/sec
+         5,013,145      branch-misses             #    2.29% of all branches
+
+       1.317535341 seconds time elapsed
+```
+
+
+###  Perf Report
+
+Wasm is on the left, native on the right. The symbol table is as follows. The
+fact that we only have the `main` function in Wasm seems to indicate there is some
+inlining going on.
+```
+functionDef2        __original_main
+```
+
+![Perf report MT](mt-time.png)
+
+And indeed, when we compare the loop bodies in the main function, the native code simply
+calls the MT engine, while the Wasm code seems to be that same code but inlined (omitted
+because it was too long).
+
+The native main loop uses `callq`. I'm not sure about perf's measurements of the cost
+of instructions (`callq` < `mov` < `add`), but it seems clear that they don't include
+the time spent in `callq` because perf inverts the call graph.
+
+### Confirmation
+
+TODO confirm that:
+* Wasm code generation is more prone to inlining which helped in this loopy context
+* Wasm optimised the `logl` away, or else what happened to it?
diff --git a/docs/perf-diff/main-native.png b/docs/perf-diff/main-native.png
new file mode 100644
index 000000000..15135dfda
Binary files /dev/null and b/docs/perf-diff/main-native.png differ
diff --git a/docs/perf-diff/main-wasm.png b/docs/perf-diff/main-wasm.png
new file mode 100644
index 000000000..dd9e30d14
Binary files /dev/null and b/docs/perf-diff/main-wasm.png differ
diff --git a/docs/perf-diff/mt-time.png b/docs/perf-diff/mt-time.png
new file mode 100644
index 000000000..e84ecbd97
Binary files /dev/null and b/docs/perf-diff/mt-time.png differ
diff --git a/docs/perf-diff/time-division.png b/docs/perf-diff/time-division.png
new file mode 100644
index 000000000..3ebaac899
Binary files /dev/null and b/docs/perf-diff/time-division.png differ
diff --git a/func/demo/CMakeLists.txt b/func/demo/CMakeLists.txt
index fa3aee853..429fe1c61 100644
--- a/func/demo/CMakeLists.txt
+++ b/func/demo/CMakeLists.txt
@@ -43,6 +43,7 @@ demo_func(increment increment.cpp)
 demo_func(isatty isatty.cpp)
 demo_func(listdir listdir.cpp)
 demo_func(lock lock.c)
+demo_func(log_difference log_difference.cpp)
 demo_func(malloc malloc.cpp)
 demo_func(matrix matrix.cpp)
 demo_func(memcpy memcpy.cpp)
@@ -50,6 +51,7 @@ demo_func(memmove memmove.cpp)
 demo_func(mmap mmap.cpp)
 demo_func(mmap_big mmap_big.cpp)
 demo_func(mmap_file mmap_file.cpp)
+demo_func(mt_difference mt_difference.cpp)
 demo_func(new_obj new_obj.cpp)
 demo_func(noop noop.c)
 demo_func(optarg optarg.cpp)
diff --git a/func/demo/log_difference.cpp b/func/demo/log_difference.cpp
new file mode 100644
index 000000000..06b638054
--- /dev/null
+++ b/func/demo/log_difference.cpp
@@ -0,0 +1,22 @@
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+
+const int32_t EXPECTED = -2147483529;
+
+// Micro-benchmark exercising `log`: XOR-reduces (int32_t) log(|i|) over
+// [-200000000, 200000000) and checks the result against EXPECTED.
+int main() {
+    int32_t result = 100;
+    for (int32_t i = -200000000; i < 200000000; i++) {
+        // NOTE(review): for i == 0 this is log(0) == -inf, and converting that to int32_t is
+        // undefined behaviour; EXPECTED only matches if it yields INT32_MIN -- confirm per target.
+        result ^= ((int32_t) std::log(std::abs(i))) & ((int32_t) std::log(std::abs(i)));
+    }
+    if (result != EXPECTED) {
+        printf("Custom reduction failed. Expected %d but got %d\n", EXPECTED, result);
+        return EXIT_FAILURE;
+    }
+    return EXIT_SUCCESS;
+}
diff --git a/func/demo/mt_difference.cpp b/func/demo/mt_difference.cpp
new file mode 100644
index 000000000..3b40b8367
--- /dev/null
+++ b/func/demo/mt_difference.cpp
@@ -0,0 +1,23 @@
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <random>
+
+const int32_t EXPECTED = 49999327;
+
+// Micro-benchmark for the 64-bit Mersenne Twister: draws 100000000 uniform doubles from a
+// fixed seed, counts how many fall below 0.5 and checks the count against EXPECTED.
+int main() {
+    std::uniform_real_distribution<double> unif(0, 1);
+    std::mt19937_64 generator(193734715);
+    int32_t result = 0;
+
+    for (int32_t i = 0; i < 100000000; i++) {
+        result += unif(generator) < 0.5 ? 1 : 0;
+    }
+    if (result != EXPECTED) {
+        printf("Custom reduction failed. Expected %d but got %d\n", EXPECTED, result);
+        return EXIT_FAILURE;
+    }
+    return EXIT_SUCCESS;
+}
diff --git a/func/omp/Makefile b/func/omp/Makefile
new file mode 100644
index 000000000..183ecd1bd
--- /dev/null
+++ b/func/omp/Makefile
@@ -0,0 +1,16 @@
+# Makefile for building native_musl targets in the omp-musl container
+# Binaries are written to $(OUT_DIR), not next to the sources; only mt_pi is built with $(OMP_FLAGS).
+CXX=g++
+CXX_FLAGS=-mavx2 -O3
+OMP_FLAGS=-fopenmp
+OUT_DIR=/usr/bin
+
+.PHONY : all
+
+all : mt_pi docker_benchmarker
+
+mt_pi: mt_pi.cpp
+	$(CXX) $(CXX_FLAGS) $(OMP_FLAGS) $< -o $(OUT_DIR)/$@
+
+docker_benchmarker: docker_benchmarker.cpp
+	$(CXX) $(CXX_FLAGS) $< -o $(OUT_DIR)/$@
diff --git a/func/omp/docker_benchmarker.cpp b/func/omp/docker_benchmarker.cpp
new file mode 100644
index 000000000..b57e23fd5
--- /dev/null
+++ b/func/omp/docker_benchmarker.cpp
@@ -0,0 +1,71 @@
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <string>
+#include <vector>
+#include <sys/stat.h>
+
+constexpr auto BINARY_DIR = "/usr/bin/";
+
+void
+nativeRun(std::ofstream &profOut, const std::string &execPath, long num_iterations, const std::string &iteration_name,
+          int num_threads);
+
+int main(int argc, char *argv[]) {
+    // Expects exactly two arguments: the benchmark binary name and the number of repetitions
+    if (argc != 3) {
+        printf("Usage:\ndocker_benchmarker <function> <nNative>\n");
+        return 1;
+    }
+
+    const std::string funcName = argv[1];
+    const int nativeIterations = std::stoi(argv[2]);
+    const std::string outfile = std::string("/wasm/omp/") + funcName + "/bench.csv";
+
+    // Fail loudly if the CSV cannot be created; otherwise every measurement would be silently lost
+    std::ofstream profOut(outfile);
+    if (!profOut.is_open()) {
+        printf("Could not open %s for writing\n", outfile.c_str());
+        return 1;
+    }
+    profOut << "iterations,numThreads,type,microseconds" << std::endl;
+
+    const std::string execPath = std::string(BINARY_DIR) + funcName;
+    const std::vector<long> iterations = {200000l, 20000000l, 200000000l};
+    const std::vector<std::string> iter_names = {"Tiny,", "Small,", "Big,"};
+
+    for (int run = 1; run <= nativeIterations; run++) {
+        printf("NATIVE - %s (%d/%d)\n", funcName.c_str(), run, nativeIterations);
+        for (size_t i = 0; i < iterations.size(); i++) {
+            // One single-threaded run, then 2, 4, ..., 24 threads
+            nativeRun(profOut, execPath, iterations[i], iter_names[i], 1);
+            for (int num_threads = 2; num_threads < 25; num_threads += 2) {
+                nativeRun(profOut, execPath, iterations[i], iter_names[i], num_threads);
+            }
+        }
+    }
+    profOut.close();
+    // Grants read/write/execute on the result file to user, group and others
+    chmod(outfile.c_str(), S_IRWXU | S_IRWXG | S_IRWXO);
+    return 0;
+}
+
+
+// Runs `<execPath> <num_threads> <num_iterations>` via system() and appends one CSV row
+// `<iteration_name><num_threads>,native,<elapsed microseconds>` to profOut.
+void nativeRun(std::ofstream &profOut, const std::string &execPath, long num_iterations,
+               const std::string &iteration_name, int num_threads) {
+    // std::string instead of a fixed char[128], which sprintf could overflow for a long execPath
+    const std::string cmd = execPath + " " + std::to_string(num_threads) + " " + std::to_string(num_iterations);
+
+    const auto start = std::chrono::steady_clock::now();
+    const int error = system(cmd.c_str());
+    const auto elapsed = std::chrono::steady_clock::now() - start;
+    // long long: an int would overflow after roughly 35 minutes worth of microseconds
+    const long long nativeTime = std::chrono::duration_cast<std::chrono::microseconds>(elapsed).count();
+    if (error != 0) {
+        printf("Failed to execute %s\n", cmd.c_str());
+    }
+    profOut << iteration_name << num_threads << ",native," << nativeTime << std::endl;
+}
diff --git a/func/omp/mt_pi.cpp b/func/omp/mt_pi.cpp
index 92f5afc26..4731c0e52 100644
--- a/func/omp/mt_pi.cpp
+++ b/func/omp/mt_pi.cpp
@@ -2,8 +2,6 @@
 #include <cstdio>
 #include <random>
 
-//#define num_devices num_threads
-
 unsigned long thread_seed() {
     int threadNum = omp_get_thread_num();
     return threadNum * threadNum * 77 - 22 * threadNum + 1927;
@@ -14,13 +12,11 @@ int main(int argc, char **argv) {
     int num_threads = 1;
     if (argc == 3) {
         num_threads = std::stoi(argv[1]);
-        iterations = std::stol(argv[2]);
+        iterations = std::stoll(argv[2]);
     } else if (argc != 1) {
         printf("Usage: mt_pi [num_threads num_iterations]");
     }
 
-//    omp_set_default_device(-2);
-
     long long result = 0;
     #pragma omp parallel num_threads(num_threads) default(none) firstprivate(iterations) reduction(+:result)
     {
@@ -42,5 +38,4 @@ int main(int argc, char **argv) {
     if (pi - 3.14 > 0.01) {
         printf("Low accuracy. Expected pi got %f\n", pi);
     }
-    return EXIT_SUCCESS;
 }
\ No newline at end of file
diff --git a/include/wasm/openmp/ThreadState.h b/include/wasm/openmp/ThreadState.h
index 148a1c221..820867bbf 100644
--- a/include/wasm/openmp/ThreadState.h
+++ b/include/wasm/openmp/ThreadState.h
@@ -1,5 +1,4 @@
-#ifndef FAASM_THREADSTATE_H
-#define FAASM_THREADSTATE_H
+#pragma once
 
 #include <wasm/openmp/Level.h>
 
@@ -14,5 +13,3 @@ namespace wasm {
         void setThreadLevel(std::shared_ptr<OMPLevel>);
     }
 }
-
-#endif //FAASM_THREADSTATE_H
diff --git a/src/runner/bench_omp.cpp b/src/runner/bench_omp.cpp
index e34267a51..fcdaa9b7a 100644
--- a/src/runner/bench_omp.cpp
+++ b/src/runner/bench_omp.cpp
@@ -6,107 +6,106 @@
 #include <util/logging.h>
 #include <util/timing.h>
 
-constexpr auto BINARY_DIR = "/usr/local/code/faasm/ninja-build/bin/";
+constexpr auto DOCKER_CMD = "/usr/bin/docker run --cpus=20 -v /usr/local/code/faasm/wasm:/wasm --rm faasm/omp-musl:0.1.3 docker_benchmarker ";
 
-void
-nativeRun(std::ofstream &profOut, const std::string &execPath, long num_iterations, const std::string &iteration_name,
-          int num_threads);
+void nativeRun(const std::string &funcName, int numRuns);
 
-void wasmRun(std::ofstream &profOut, message::Message &call, long num_iterations, const std::string &iteration_name,
-             int num_threads, wasm::WAVMWasmModule module);
+void wasmRun(const std::string &user, const std::string &funcName, int wasmIterations, const std::string &outfile);
+void wasmRunIteration(std::ofstream &profOut, message::Message &call, long num_iterations, const std::string &iteration_name,
+                      int num_threads, wasm::WAVMWasmModule module);
 
 int main(int argc, char *argv[]) {
     util::initLogging();
     const std::shared_ptr<spdlog::logger> &logger = util::getLogger();
 
-    if (argc < 4) {
-        logger->error("Usage:\nbench_func <user> <function> <nNative> <nWasm>");
+    if (argc != 4) {
+        logger->error("Usage:\nbench_omp <function> <nNative> <nWasm>");
         return 1;
     }
 
-    std::string user = argv[1];
-    std::string funcName = argv[2];
-    int nativeIterations = std::stoi(argv[3]);
-    int wasmIterations = std::stoi(argv[4]);
+    std::string user = "omp";
+    std::string funcName = argv[1];
+    int nativeIterations = std::stoi(argv[2]);
+    int wasmIterations = std::stoi(argv[3]);
+
+    std::string outfile = std::string("/usr/local/code/faasm/wasm/omp/")  + funcName + "/bench.csv";
+    // Remove any stale CSV so the native (container) run and the Wasm run write to the same fresh file
+    boost::filesystem::remove(outfile);
 
     logger->info("Benchmarking {} ({}x native and {}x wasm)", funcName, nativeIterations, wasmIterations);
 
-    std::string outfile = std::string("/usr/local/code/faasm/wasm/") + user + "/" + funcName + "/bench.csv";
+    nativeRun(funcName, nativeIterations);
+    wasmRun(user, funcName, wasmIterations, outfile);
 
-    std::ofstream profOut;
-    profOut.open(outfile);
-    profOut << "iterations,numThreads,type,microseconds" << std::endl;
+    logger->info("Finished benchmark - {}", funcName);
+    return 0;
+}
+
+void nativeRun(const std::string &funcName, int numRuns) {
+    // Runs the whole native benchmark through DOCKER_CMD and logs the wall time of that command
+    auto logger = util::getLogger();
+    const std::string execCmd = fmt::format("{} {} {}", DOCKER_CMD, funcName, numRuns);
 
-    std::string execPath = std::string(BINARY_DIR) + funcName;
+    logger->info("Executing native");
 
-    logger->info("Running benchmark natively");
-    if (!boost::filesystem::exists(execPath)) {
-        throw std::runtime_error("Could not find binary at " + execPath);
+    const util::TimePoint nativeTp = util::startTimer();
+    const int error = system(execCmd.c_str());
+    const long nativeTime = util::getTimeDiffMillis(nativeTp);
+
+    logger->info("Done executing native in {} ms", nativeTime);
+    // The logger takes {}-style placeholders; a printf-style %s would be logged verbatim
+    if (error != 0) {
+        logger->error("Failed to execute {}", execCmd);
     }
+}
 
-    std::vector<long> iterations = {200000l, 20000000l, 200000000l};
-    std::vector<std::string> iter_names = {"Tiny,", "Small,", "Big,"};
+void wasmRun(const std::string &user, const std::string &funcName, int wasmIterations, const std::string &outfile) {
 
-    for (int run = 1; run <= nativeIterations; run++) {
-        logger->info("NATIVE - {} ({}/{})", funcName, run, nativeIterations);
-        for (size_t i = 0; i < iterations.size(); i++) {
-            nativeRun(profOut, execPath, iterations[i], iter_names[i], 1);
-            for (int num_threads = 2; num_threads < 25; num_threads += 2) {
-                nativeRun(profOut, execPath, iterations[i], iter_names[i], num_threads);
-            }
-        }
+    // Appends the Wasm timings to the CSV at `outfile`, which must already exist
+    const std::shared_ptr<spdlog::logger> &logger = util::getLogger();
+    logger->info("Running Wasm benchmark");
+    // Check before opening the stream: opening in append mode would create a missing file
+    if (!boost::filesystem::exists(outfile)) {
+        throw std::runtime_error("Could not find native benchmark output at " + outfile);
     }
 
+    std::ofstream profOut;
+    profOut.open(outfile, std::fstream::app);
+    std::vector<long> iterations = {200000l, 20000000l, 200000000l};
+    std::vector<std::string> iter_names = {"Tiny,", "Small,", "Big,"};
+
     message::Message call = util::messageFactory(user, funcName);
     module_cache::WasmModuleCache &moduleCache = module_cache::getWasmModuleCache();
     wasm::WAVMWasmModule &cachedModule = moduleCache.getCachedModule(call);
 
+    const util::TimePoint wasmTp = util::startTimer();
+
     for (int run = 1; run <= wasmIterations; run++) {
-        logger->info("WASM - {} ({}/{})", funcName, run, nativeIterations);
+        logger->info("WASM - {} ({}/{})", funcName, run, wasmIterations);
         for (size_t i = 0; i < iterations.size(); i++) {
-            wasmRun(profOut, call, iterations[i], iter_names[i], 1, cachedModule);
+            wasmRunIteration(profOut, call, iterations[i], iter_names[i], 1, cachedModule);
             for (int num_threads = 2; num_threads < 25; num_threads += 2) {
-                wasmRun(profOut, call, iterations[i], iter_names[i], num_threads, cachedModule);
+                wasmRunIteration(profOut, call, iterations[i], iter_names[i], num_threads, cachedModule);
             }
         }
     }
 
+    const long wasmTime = util::getTimeDiffMillis(wasmTp);
+    logger->info("Done executing wasm in {} ms", wasmTime);
 
-    logger->info("Finished benchmark - {}", funcName);
 
     profOut.flush();
     profOut.close();
-
-    return 0;
 }
 
-void wasmRun(std::ofstream &profOut, message::Message &call, long num_iterations, const std::string &iteration_name,
+void wasmRunIteration(std::ofstream &profOut, message::Message &call, long num_iterations, const std::string &iteration_name,
              int num_threads, wasm::WAVMWasmModule cachedModule) {
     auto args = fmt::format("{} {}", num_threads, num_iterations);
     call.set_cmdline(args.c_str());
-    const util::TimePoint wasmTp = util::startTimer();
 
+    const util::TimePoint iterationTp = util::startTimer();  // timed region: module copy + execute
     wasm::WAVMWasmModule module{cachedModule};
-
     module.execute(call);
+    const long wasmIterationTime = util::getTimeDiffMicros(iterationTp);  // microseconds
 
-    const long nativeTime = util::getTimeDiffMicros(wasmTp);
-
-    profOut << iteration_name << num_threads << ",wasm," << nativeTime << std::endl;
-}
-
-void
-nativeRun(std::ofstream &profOut, const std::string &execPath, long num_iterations, const std::string &iteration_name,
-          int num_threads) {
-    const std::string execCmd = fmt::format("{} {} {}", execPath, num_threads, num_iterations);
-
-    const util::TimePoint wasmTp = util::startTimer();
-    const int error = system(execCmd.c_str());
-    const long nativeTime = util::getTimeDiffMicros(wasmTp);
-
-    if (error != 0) {
-        printf("Failed to execute %s", execCmd.c_str());
-    }
-
-    profOut << iteration_name << num_threads << ",native," << nativeTime << std::endl;
+    profOut << iteration_name << num_threads << ",wasm," << wasmIterationTime << std::endl;  // one CSV row per run
 }
diff --git a/tasks/compile.py b/tasks/compile.py
index be048b12e..bb3063279 100644
--- a/tasks/compile.py
+++ b/tasks/compile.py
@@ -1,4 +1,4 @@
-from os import mkdir, listdir
+from os import makedirs, listdir
 from os.path import exists, join, splitext
 from shutil import copy
 from subprocess import call
@@ -15,8 +15,7 @@
 def _copy_built_function(user, func):
     dest_folder = join(WASM_DIR, user, func)
 
-    if not exists(dest_folder):
-        mkdir(dest_folder)
+    makedirs(dest_folder, exist_ok=True)
 
     src_file = join(FUNC_BUILD_DIR, user, ".".join([func, "wasm"]))
     dest_file = join(dest_folder, "function.wasm")