Files
clang-p2996/compiler-rt/lib/fuzzer/scripts/collect_data_flow.py
Max Moroz b6e6d3c740 [libFuzzer] Fix DataFlow.cpp logic when tracing long inputs.
Summary:
1. Do not create DFSan labels for the bytes which we do not trace. This is why we run out of labels in the first place.
2. When dumping the traces on the disk, make sure to offset the label identifiers by the number of the first byte in the trace range.
3. For the last label, make sure to write it at the last position of the trace bit string, as that label represents the input size, not any particular byte.

Also fixed a division bug in Python that I introduced when migrating the scripts to Python 3 (`//` is required for integer division).

Otherwise, the scripts are wasting too much time unsuccessfully trying to
collect and process traces from the long inputs. For more context, see
https://github.com/google/oss-fuzz/issues/1632#issuecomment-481761789

Reviewers: kcc

Reviewed By: kcc

Subscribers: delcypher, #sanitizers, llvm-commits

Tags: #llvm, #sanitizers

Differential Revision: https://reviews.llvm.org/D60538

llvm-svn: 358311
2019-04-12 21:00:12 +00:00

81 lines
2.7 KiB
Python
Executable File

#!/usr/bin/env python3
#===- lib/fuzzer/scripts/collect_data_flow.py ------------------------------===#
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
#===------------------------------------------------------------------------===#
# Runs the data-flow tracer several times on the same input in order to collect
# the complete trace for all input bytes (running it on all bytes at once
# may fail if DFSan runs out of labels).
# Usage:
#
# # Collect dataflow for one input, store it in OUTPUT (default is stdout)
# collect_data_flow.py BINARY INPUT [OUTPUT]
#
# # Collect dataflow for all inputs in CORPUS_DIR, store them in OUTPUT_DIR
# collect_data_flow.py BINARY CORPUS_DIR OUTPUT_DIR
#===------------------------------------------------------------------------===#
import atexit
import hashlib
import sys
import os
import subprocess
import tempfile
import shutil
# NOTE(review): module-level default; main() binds its own local `tmpdir`,
# so this global appears to stay unused — confirm before removing.
tmpdir = ""
def cleanup(path):
    """Announce and recursively delete the temporary directory *path*.

    Registered via atexit so the scratch directory created by main()
    is removed when the script exits.
    """
    print("removing: %s" % path)
    shutil.rmtree(path)
def collect_dataflow_for_corpus(self, exe, corpus_dir, output_dir):
    """Collect data flow for every input file under corpus_dir.

    Args:
      self: path to this collect_data_flow.py script; it is re-invoked as a
        subprocess once per corpus input.
      exe: the data-flow tracer binary. Invoked with no arguments (below) it
        is expected to print the instrumented-function list.
      corpus_dir: directory tree containing the input files.
      output_dir: must not yet exist; created here. One trace file is written
        per input, named by the input's SHA-1 hex digest, plus functions.txt.
    """
    print("Collecting dataflow for corpus: %s output_dir: %s" % (corpus_dir,
                                                                 output_dir))
    assert not os.path.exists(output_dir)
    os.mkdir(output_dir)
    for root, dirs, files in os.walk(corpus_dir):
        for f in files:
            path = os.path.join(root, f)
            with open(path, 'rb') as fh:
                data = fh.read()
            # Name the per-input trace after the input's content hash so
            # repeated runs overwrite rather than duplicate.
            sha1 = hashlib.sha1(data).hexdigest()
            output = os.path.join(output_dir, sha1)
            subprocess.call([self, exe, path, output])
    # Use a context manager so the handle is closed (and output flushed)
    # deterministically; the original leaked the open file object.
    with open(os.path.join(output_dir, "functions.txt"), "w") as functions_txt:
        subprocess.call([exe], stdout=functions_txt)
def main(argv):
    """Collect data flow for one input file or a whole corpus directory.

    argv[1] is the tracer binary; argv[2] is either a single input file or a
    corpus directory (in which case argv[3] is the required output directory).
    For a single input, the optional argv[3] is the output file (default:
    stdout).
    """
    exe = argv[1]
    inp = argv[2]
    if os.path.isdir(inp):
        return collect_dataflow_for_corpus(argv[0], exe, inp, argv[3])
    size = os.path.getsize(inp)
    # Work queue of half-open byte ranges [begin, end) still to be traced.
    q = [[0, size]]
    tmpdir = tempfile.mkdtemp(prefix="libfuzzer-tmp-")
    atexit.register(cleanup, tmpdir)
    print("tmpdir: ", tmpdir)
    outputs = []
    # Bisect ranges until the tracer succeeds: tracing a range that is too
    # wide can fail when DFSan runs out of labels.
    while len(q):
        r = q.pop()
        print("******* Trying: ", r)
        tmpfile = os.path.join(tmpdir, str(r[0]) + "-" + str(r[1]))
        ret = subprocess.call([exe, str(r[0]), str(r[1]), inp, tmpfile])
        if ret and r[1] - r[0] >= 2:
            # Failure on a splittable range: retry each half separately.
            q.append([r[0], (r[1] + r[0]) // 2])
            q.append([(r[1] + r[0]) // 2, r[1]])
        else:
            # NOTE(review): a failing single-byte range also lands here and
            # is reported as success — presumably deliberate best-effort.
            outputs.append(tmpfile)
            print("******* Success: ", r)
    merge = os.path.join(os.path.dirname(argv[0]), "merge_data_flow.py")
    if len(argv) >= 4:
        # Context manager closes (and flushes) the output file; the original
        # opened it and never closed it, risking lost buffered output.
        with open(argv[3], "w") as f:
            subprocess.call([merge] + outputs, stdout=f)
    else:
        subprocess.call([merge] + outputs, stdout=sys.stdout)
# Script entry point: forward the raw command line to main().
if __name__ == '__main__':
    main(sys.argv)