Summary:
1. Do not create DFSan labels for the bytes we do not trace; this is why we were running out of labels in the first place.
2. When dumping the traces to disk, offset the label identifiers by the index of the first byte in the trace range.
3. Write the last label at the last position of the trace bit string, as that label represents the input size, not any particular byte.

Also fixed a division bug that I introduced when migrating the scripts to Python 3 (`//` is required for integral division). Without these fixes, the scripts waste too much time unsuccessfully trying to collect and process traces from long inputs.

For more context, see https://github.com/google/oss-fuzz/issues/1632#issuecomment-481761789

Reviewers: kcc

Reviewed By: kcc

Subscribers: delcypher, #sanitizers, llvm-commits

Tags: #llvm, #sanitizers

Differential Revision: https://reviews.llvm.org/D60538

llvm-svn: 358311
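To make points 2 and 3 concrete, here is a minimal Python sketch of the mapping they describe. It is purely illustrative: the real logic lives in the C++ data-flow tracer, and every name below (`label_to_position`, `first_byte`, `num_traced`, `input_size`) is invented for the example.

    def label_to_position(label, first_byte, num_traced, input_size):
      # Labels 1..num_traced correspond to the traced bytes starting at
      # first_byte, so each identifier is offset by first_byte when dumped.
      if label <= num_traced:
        return first_byte + (label - 1)
      # The final label represents the input size rather than any byte, so
      # it is written at the last position of the trace bit string.
      return input_size

The division fix is simpler: on Python 3, `/` always produces a float (`5 / 2 == 2.5`), so midpoint computations over byte ranges must use `//` (`5 // 2 == 2`) to keep indices integral.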
#!/usr/bin/env python3
#===- lib/fuzzer/scripts/collect_data_flow.py ------------------------------===#
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
#===------------------------------------------------------------------------===#
# Runs the data-flow tracer several times on the same input in order to collect
# the complete trace for all input bytes (running it on all bytes at once
# may fail if DFSan runs out of labels).
# Usage:
#
#   # Collect dataflow for one input, store it in OUTPUT (default is stdout)
#   collect_data_flow.py BINARY INPUT [OUTPUT]
#
#   # Collect dataflow for all inputs in CORPUS_DIR, store them in OUTPUT_DIR
#   collect_data_flow.py BINARY CORPUS_DIR OUTPUT_DIR
#===------------------------------------------------------------------------===#
import atexit
import hashlib
import sys
import os
import subprocess
import tempfile
import shutil

tmpdir = ""

def cleanup(d):
  print("removing: %s" % d)
  shutil.rmtree(d)

def collect_dataflow_for_corpus(self, exe, corpus_dir, output_dir):
  # `self` is the path to this script; it is re-invoked once per corpus file.
  print("Collecting dataflow for corpus: %s output_dir: %s" % (corpus_dir,
                                                               output_dir))
  assert not os.path.exists(output_dir)
  os.mkdir(output_dir)
  for root, dirs, files in os.walk(corpus_dir):
    for f in files:
      path = os.path.join(root, f)
      with open(path, 'rb') as fh:
        data = fh.read()
      # Name each trace after the SHA-1 of the input it was collected from.
      sha1 = hashlib.sha1(data).hexdigest()
      output = os.path.join(output_dir, sha1)
      subprocess.call([self, exe, path, output])
  # Record the target's function list alongside the traces; the tracer
  # prints it when invoked without arguments.
  with open(os.path.join(output_dir, "functions.txt"), "w") as functions_txt:
    subprocess.call([exe], stdout=functions_txt)

def main(argv):
  exe = argv[1]
  inp = argv[2]
  if os.path.isdir(inp):
    return collect_dataflow_for_corpus(argv[0], exe, inp, argv[3])
  size = os.path.getsize(inp)
  q = [[0, size]]
  tmpdir = tempfile.mkdtemp(prefix="libfuzzer-tmp-")
  atexit.register(cleanup, tmpdir)
  print("tmpdir: ", tmpdir)
  outputs = []
  # Bisect over byte ranges: if tracing a range fails (e.g. DFSan runs out
  # of labels), split it in half and retry both halves until every range
  # either succeeds or cannot be split further.
  while len(q):
    r = q.pop()
    print("******* Trying: ", r)
    tmpfile = os.path.join(tmpdir, str(r[0]) + "-" + str(r[1]))
    ret = subprocess.call([exe, str(r[0]), str(r[1]), inp, tmpfile])
    if ret and r[1] - r[0] >= 2:
      # `//` (integral division) is required on Python 3; `/` would yield
      # a float midpoint.
      q.append([r[0], (r[1] + r[0]) // 2])
      q.append([(r[1] + r[0]) // 2, r[1]])
    else:
      outputs.append(tmpfile)
      print("******* Success: ", r)
  f = sys.stdout
  if len(argv) >= 4:
    f = open(argv[3], "w")
  # Merge the per-range traces into a single data-flow trace.
  merge = os.path.join(os.path.dirname(argv[0]), "merge_data_flow.py")
  subprocess.call([merge] + outputs, stdout=f)

if __name__ == '__main__':
  main(sys.argv)
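For a self-contained picture of the range-splitting loop in main() above, here is a minimal sketch; `traced_ok` is a hypothetical stand-in for the tracer invocation, and the predicate in the usage line is made up:

    def split_ranges(size, traced_ok):
      # Mirrors main(): try the whole range first, split failing ranges in
      # half, and stop once a range succeeds or has fewer than 2 bytes.
      q = [[0, size]]
      done = []
      while q:
        b, e = q.pop()
        if traced_ok(b, e) or e - b < 2:
          done.append((b, e))
        else:
          mid = (b + e) // 2  # integral division, per the fix in this commit
          q.append([b, mid])
          q.append([mid, e])
      return done

    # Example: pretend tracing fails for any range longer than 4 bytes.
    print(split_ranges(10, lambda b, e: e - b <= 4))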