From 94725b9814db34a9249fa1c172be845d41dbbdc7 Mon Sep 17 00:00:00 2001 From: q9982 <980754729@qq.com> Date: Wed, 13 May 2026 17:49:54 +0800 Subject: [PATCH] Fix corpus archiving to avoid `tarfile.ReadError: unexpected end of data` during experiments when files change while the tar archive is being created. Corpus files can still change while the runner is creating a tar archive. Adding them directly from disk can therefore write a tar header for one size and then read a different amount of data, producing a truncated tar member. Stage regular files in a temporary file before writing them to the tar archive. Non-regular entries only need their tar header, so avoid reading file contents for them. --- experiment/runner.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/experiment/runner.py b/experiment/runner.py index b8c95fcca..f9308554f 100644 --- a/experiment/runner.py +++ b/experiment/runner.py @@ -23,6 +23,7 @@ import subprocess import sys import tarfile +import tempfile import threading import time import zipfile @@ -392,9 +393,10 @@ def archive_corpus(self): last_modified_time = stat_info.st_mtime if last_modified_time <= self.last_archive_time: continue # We've saved this file already. - new_archive_time = max(new_archive_time, last_modified_time) arcname = os.path.relpath(file_path, self.output_corpus) - tar.add(file_path, arcname=arcname) + if _add_corpus_file_to_archive(tar, file_path, arcname): + new_archive_time = max(new_archive_time, + last_modified_time) except (FileNotFoundError, OSError): # We will get these errors if files or directories are being # deleted from |directory| as we archive it. 
Don't bother
@@ -451,6 +453,26 @@ def get_fuzzer_module(fuzzer):
     return fuzzer_module
 
 
+def _add_corpus_file_to_archive(tar, file_path, arcname):
+    """Adds |file_path| to |tar| as |arcname|, staging regular files first so
+    size and data agree. Returns False if |tar| builds no header for it."""
+    member = tar.gettarinfo(file_path, arcname=arcname)
+    if member is None:
+        return False
+    if member.isreg():
+        with tempfile.SpooledTemporaryFile(
+                max_size=CORPUS_ELEMENT_BYTES_LIMIT) as snapshot:
+            with open(file_path, 'rb') as corpus_file:
+                shutil.copyfileobj(corpus_file, snapshot)
+            member.size = snapshot.tell()
+            snapshot.seek(0)
+            tar.addfile(member, snapshot)
+    else:
+        # Non-regular entries are fully described by their header.
+        tar.addfile(member)
+    return True
+
+
 def get_corpus_elements(corpus_dir):
     """Returns a list of absolute paths to corpus elements in |corpus_dir|."""
     corpus_dir = os.path.abspath(corpus_dir)