diff --git a/.github/workflows/bun_build.yaml b/.github/workflows/bun_build.yaml index 3b9085ae47df..0478a4b15e5c 100644 --- a/.github/workflows/bun_build.yaml +++ b/.github/workflows/bun_build.yaml @@ -41,6 +41,48 @@ jobs: if: matrix.safe == 'true' - run: sed -i 's/max_rss = 7_800_000_000/max_rss = 10_000_000_000/' zig/build.zig if: matrix.safe == 'true' + - name: Fetch mimalloc + uses: actions/checkout@v4 + with: + repository: oven-sh/mimalloc + ref: bun-dev3-v2 + path: mimalloc + - name: Splice mimalloc into bootstrap build (linux-musl) + # musl's malloc has a single global rwlock; with N parallel LLVM + # contexts (--llvm-codegen-threads=N) every `operator new` serialises + # on it. mimalloc has per-thread heaps. Compile its unity-build + # static.c (with MI_OVERRIDE so it replaces malloc/free) for the + # target after the host zig exists, then link the object into the + # final cross-compiled zig. Inserted just before the final + # `$ZIG build` (the `cd "$ROOTDIR/zig"` line) so $ZIG is available. + if: contains(matrix.target, 'linux-musl') + run: | + sed -i '/^cd "\$ROOTDIR\/zig"$/i \ + $ZIG cc -c "$ROOTDIR/mimalloc/src/static.c" \\\ + -I "$ROOTDIR/mimalloc/include" \\\ + -target $TARGET -mcpu=$MCPU -O2 -fno-builtin -DNDEBUG -Wno-date-time \\\ + -DMI_MALLOC_OVERRIDE=1 -DMI_LIBC_MUSL=1 -DMI_STATIC_LIB \\\ + -o "$ROOTDIR/out/mimalloc-$TARGET-$MCPU.o"\ + ' build + sed -i 's#-Dversion-string="$ZIG_VERSION"#-Dversion-string="$ZIG_VERSION" -Dmimalloc-obj="$ROOTDIR/out/mimalloc-$TARGET-$MCPU.o"#' build + - name: Splice mimalloc into bootstrap build (windows-gnu) + # Windows CRT routes operator new -> HeapAlloc(GetProcessHeap()), + # which is guarded by a single critical section — same parallel-emit + # serialisation as musl. malloc/free can't be statically interposed + # on Windows, but C++ operator new/delete are replaceable per the + # standard, and LLVM's hot allocations go through them. Compile the + # override TU as C++ so the global operators are emitted; mimalloc + # itself is compiled-in via #include of its unity-build static.c. + if: contains(matrix.target, 'windows-gnu') + run: | + sed -i '/^cd "\$ROOTDIR\/zig"$/i \ + $ZIG c++ -c "$ROOTDIR/zig/tools/mimalloc_new_delete_override.cpp" \\\ + -I "$ROOTDIR/mimalloc/include" -I "$ROOTDIR/mimalloc" \\\ + -target $TARGET -mcpu=$MCPU -std=c++17 -O2 -fno-builtin \\\ + -DNDEBUG -Wno-date-time -DMI_STATIC_LIB \\\ + -o "$ROOTDIR/out/mimalloc-$TARGET-$MCPU.o"\ + ' build + sed -i 's#-Dversion-string="$ZIG_VERSION"#-Dversion-string="$ZIG_VERSION" -Dmimalloc-obj="$ROOTDIR/out/mimalloc-$TARGET-$MCPU.o"#' build - run: cat build - name: Cache host toolchain uses: actions/cache@v4 diff --git a/.gitignore b/.gitignore index 7e9e15820297..5fb4854a4a33 100644 --- a/.gitignore +++ b/.gitignore @@ -20,3 +20,4 @@ zig-out/ # Although this was renamed to .zig-cache, let's leave it here for a few # releases to make it less annoying to work with multiple branches. zig-cache/ +bun-cache/ diff --git a/build.zig b/build.zig index 745d8070f83a..c8ed8ad21a72 100644 --- a/build.zig +++ b/build.zig @@ -98,6 +98,7 @@ pub fn build(b: *std.Build) !void { const skip_macos = b.option(bool, "skip-macos", "Main test suite skips targets with macos OS") orelse false; const skip_linux = b.option(bool, "skip-linux", "Main test suite skips targets with linux OS") orelse false; const skip_llvm = b.option(bool, "skip-llvm", "Main test suite skips targets that use LLVM backend") orelse false; + const llvm_codegen_threads = b.option(u32, "llvm-codegen-threads", "Number of LLVM codegen threads to use for module tests") orelse 0; const only_install_lib_files = b.option(bool, "lib-files-only", "Only install library files") orelse false; @@ -123,6 +124,19 @@ pub fn build(b: *std.Build) !void { "llvm-has-xtensa", "Whether LLVM has the experimental target xtensa enabled", ) orelse false; + const llvm_has_polly = b.option( + bool, + "llvm-has-polly", + "Whether LLVM was built with Polly and requires linking it", + ) orelse false; + const mimalloc_obj = b.option( + []const u8, + "mimalloc-obj", + "Path to a mimalloc static.c object built with MI_OVERRIDE; linked " ++ + "into the compiler so libc malloc (musl's single-lock allocator " ++ + "in static builds) is replaced. LLVM emit at high codegen-thread " ++ + "counts otherwise serialises on the malloc lock.", + ); const enable_ios_sdk = b.option(bool, "enable-ios-sdk", "Run tests requiring presence of iOS SDK and frameworks") orelse false; const enable_macos_sdk = b.option(bool, "enable-macos-sdk", "Run tests requiring presence of macOS SDK and frameworks") orelse enable_ios_sdk; const enable_symlinks_windows = b.option(bool, "enable-symlinks-windows", "Run tests requiring presence of symlinks on Windows") orelse false; @@ -202,6 +216,7 @@ pub fn build(b: *std.Build) !void { }); exe.pie = pie; exe.entitlements = entitlements; + if (mimalloc_obj) |p| exe.addObjectFile(.{ .cwd_relative = p }); const use_llvm = b.option(bool, "use-llvm", "Use the llvm backend"); exe.use_llvm = use_llvm; @@ -332,6 +347,7 @@ pub fn build(b: *std.Build) !void { .llvm_has_csky = llvm_has_csky, .llvm_has_arc = llvm_has_arc, .llvm_has_xtensa = llvm_has_xtensa, + .llvm_has_polly = llvm_has_polly, }); } if (target.result.os.tag == .windows) { @@ -461,6 +477,7 @@ pub fn build(b: *std.Build) !void { .skip_linux = skip_linux, .skip_llvm = skip_llvm, .skip_libc = skip_libc, + .llvm_codegen_threads = llvm_codegen_threads, // 3888779264 was observed on an x86_64-linux-gnu host. .max_rss = 4000000000, })); @@ -483,6 +500,7 @@ pub fn build(b: *std.Build) !void { .skip_linux = skip_linux, .skip_llvm = skip_llvm, .skip_libc = skip_libc, + .llvm_codegen_threads = llvm_codegen_threads, })); test_modules_step.dependOn(tests.addModuleTests(b, .{ @@ -545,6 +563,7 @@ pub fn build(b: *std.Build) !void { .skip_linux = skip_linux, .skip_llvm = skip_llvm, .skip_libc = skip_libc, + .llvm_codegen_threads = llvm_codegen_threads, // I observed a value of 5605064704 on the M2 CI. .max_rss = 6165571174, })); @@ -739,7 +758,7 @@ fn addCompilerMod(b: *std.Build, options: AddCompilerModOptions) *std.Build.Modu fn addCompilerStep(b: *std.Build, options: AddCompilerModOptions) *std.Build.Step.Compile { const exe = b.addExecutable(.{ .name = "zig", - .max_rss = 10_000_000_000, + .max_rss = 11_000_000_000, .root_module = addCompilerMod(b, options), }); exe.stack_size = stack_size; @@ -858,6 +877,7 @@ fn addStaticLlvmOptionsToModule(mod: *std.Build.Module, options: struct { llvm_has_csky: bool, llvm_has_arc: bool, llvm_has_xtensa: bool, + llvm_has_polly: bool, }) !void { // Adds the Zig C++ sources which both stage1 and stage2 need. // @@ -898,6 +918,10 @@ fn addStaticLlvmOptionsToModule(mod: *std.Build.Module, options: struct { mod.linkSystemLibrary(lib_name, .{}); }; + if (options.llvm_has_polly) for (llvm_libs_polly) |lib_name| { + mod.linkSystemLibrary(lib_name, .{}); + }; + mod.linkSystemLibrary("z", .{}); mod.linkSystemLibrary("zstd", .{}); @@ -1419,6 +1443,10 @@ const llvm_libs_xtensa = [_][]const u8{ "LLVMXtensaDesc", "LLVMXtensaInfo", }; +const llvm_libs_polly = [_][]const u8{ + "Polly", + "PollyISL", +}; fn generateLangRef(b: *std.Build) std.Build.LazyPath { const doctest_exe = b.addExecutable(.{ diff --git a/lib/std/Build/Step/Compile.zig b/lib/std/Build/Step/Compile.zig index fc23d2da389a..ee2c353010a1 100644 --- a/lib/std/Build/Step/Compile.zig +++ b/lib/std/Build/Step/Compile.zig @@ -159,9 +159,16 @@ dead_strip_dylibs: bool = false, /// Number of threads to use for LLVM backend code generation. /// 0 means single-threaded (default). > 1 enables parallel codegen. -/// When enabled, outputs multiple .o files: filename.0.o, filename.1.o, etc. +/// When enabled, outputs multiple object files: filename.0.o, filename.1.o, etc. +/// (or .obj on COFF targets). llvm_codegen_threads: u32 = 0, +/// Skip the relocatable -r merge of partitioned LLVM output. The shard +/// objects are emitted directly to `{emit}.{i}.o` (or `.obj` on COFF) for +/// the downstream linker to consume. Only meaningful when +/// `llvm_codegen_threads > 1`. +llvm_no_merge_shards: bool = false, + /// Skip linker step for build-obj - outputs raw LLVM object file(s). /// Saves time by avoiding parse/resolve/write cycle. no_link_obj: bool = false, @@ -893,6 +900,35 @@ pub fn getEmittedBin(compile: *Compile) LazyPath { return compile.getEmittedFileGeneric(&compile.generated_bin); } +/// Returns the per-shard object paths when `llvm_no_merge_shards` is set. +/// Shard `i` lives at `{dir}/{stem}.{i}{ext}` where `dir` is the emitted-bin +/// directory, `stem` is `out_filename` with the target's object extension +/// stripped, and `ext` is that extension (`.o` for ELF/Mach-O, `.obj` for +/// COFF). The returned slice has `llvm_codegen_threads` entries, allocated +/// from the build arena. +/// +/// Intended use: `addObject` is configured with `llvm_codegen_threads > 1` +/// and `llvm_no_merge_shards = true`; the consumer (an executable's link +/// step, or `addInstallFile`) iterates this slice instead of calling +/// `getEmittedBin()` (which points at a stub the compiler deletes). +pub fn getEmittedBinShards(compile: *Compile) []std.Build.LazyPath { + assert(compile.llvm_no_merge_shards); + assert(compile.llvm_codegen_threads > 1); + const b = compile.step.owner; + const dir = compile.getEmittedBinDirectory(); + const target = compile.rootModuleTarget(); + const obj_ext = target.ofmt.fileExt(target.cpu.arch); + const stem = if (std.mem.endsWith(u8, compile.out_filename, obj_ext)) + compile.out_filename[0 .. compile.out_filename.len - obj_ext.len] + else + compile.out_filename; + const out = b.allocator.alloc(std.Build.LazyPath, compile.llvm_codegen_threads) catch @panic("OOM"); + for (out, 0..) |*p, i| { + p.* = dir.path(b, b.fmt("{s}.{d}{s}", .{ stem, i, obj_ext })); + } + return out; +} + /// Returns the path to the generated import library. /// This function can only be called for libraries. pub fn getEmittedImplib(compile: *Compile) LazyPath { @@ -1532,6 +1568,9 @@ fn getZigArgs(compile: *Compile, fuzz: bool) ![][]const u8 { if (compile.llvm_codegen_threads > 0) { try zig_args.append(b.fmt("--llvm-codegen-threads={d}", .{compile.llvm_codegen_threads})); } + if (compile.llvm_no_merge_shards) { + try zig_args.append("--llvm-no-merge-shards"); + } if (compile.no_link_obj) { try zig_args.append("--no-link"); } diff --git a/lib/std/Thread/Condition.zig b/lib/std/Thread/Condition.zig index 91974a44b4ab..e02c185822f8 100644 --- a/lib/std/Thread/Condition.zig +++ b/lib/std/Thread/Condition.zig @@ -107,10 +107,14 @@ pub fn broadcast(self: *Condition) void { self.impl.wake(.all); } +// FutexImpl is used everywhere, including Windows. WindowsImpl wraps the +// kernel CONDITION_VARIABLE which has no userspace "no waiters" fast-path — +// every wake() is a kernel32 call. Under heavily-signalled condvars (e.g. +// the compiler's per-job work_queue_cond.signal()) this dominates wall time +// at high thread counts. FutexImpl checks `wakeable == 0` in userspace +// first; on Windows the underlying Futex maps to RtlWaitOnAddress (Win8+). const Impl = if (builtin.single_threaded) SingleThreadedImpl -else if (builtin.os.tag == .windows) - WindowsImpl else FutexImpl; diff --git a/lib/std/c.zig b/lib/std/c.zig index 331b8b0d2c39..593b42380e47 100644 --- a/lib/std/c.zig +++ b/lib/std/c.zig @@ -11371,7 +11371,7 @@ const private = struct { extern "c" fn getentropy(buffer: [*]u8, size: usize) c_int; extern "c" fn arc4random_buf(buf: [*]u8, len: usize) void; - extern "c" fn _msize(?*const anyopaque) usize; + extern "c" fn _msize(?*anyopaque) usize; extern "c" fn malloc_size(?*const anyopaque) usize; extern "c" fn malloc_usable_size(?*const anyopaque) usize; extern "c" fn posix_memalign(memptr: *?*anyopaque, alignment: usize, size: usize) c_int; diff --git a/lib/std/fs/Dir.zig b/lib/std/fs/Dir.zig index d3c0c3d21583..5cbe45132398 100644 --- a/lib/std/fs/Dir.zig +++ b/lib/std/fs/Dir.zig @@ -1309,8 +1309,12 @@ pub fn realpath(self: Dir, pathname: []const u8, out_buffer: []u8) RealPathError } if (native_os == .windows) { if (pathname.len == 1 and pathname[0] == '.') { - const ptr: *[std.fs.max_path_bytes]u8 = out_buffer[0..std.fs.max_path_bytes]; - return try std.os.getFdPath(self.fd, ptr); + var buffer: [fs.max_path_bytes]u8 = undefined; + const out_path = try std.os.getFdPath(self.fd, &buffer); + if (out_path.len > out_buffer.len) return error.NameTooLong; + const result = out_buffer[0..out_path.len]; + @memcpy(result, out_path); + return result; } const pathname_w = try windows.sliceToPrefixedFileW(self.fd, pathname); return self.realpathW(pathname_w.span(), out_buffer); diff --git a/lib/std/os/linux.zig b/lib/std/os/linux.zig index 083453d1d211..b9873359cc80 100644 --- a/lib/std/os/linux.zig +++ b/lib/std/os/linux.zig @@ -1651,7 +1651,7 @@ fn init_vdso_clock_gettime(clk: clockid_t, ts: *timespec) callconv(.c) usize { pub fn clock_getres(clk_id: clockid_t, tp: *timespec) usize { return syscall2( if (@hasField(SYS, "clock_getres")) .clock_getres else .clock_getres_time64, - @as(usize, @bitCast(@as(isize, clk_id))), + @as(usize, @bitCast(@as(isize, @intFromEnum(clk_id)))), @intFromPtr(tp), ); } @@ -1659,7 +1659,7 @@ pub fn clock_getres(clk_id: clockid_t, tp: *timespec) usize { pub fn clock_settime(clk_id: clockid_t, tp: *const timespec) usize { return syscall2( if (@hasField(SYS, "clock_settime")) .clock_settime else .clock_settime64, - @as(usize, @bitCast(@as(isize, clk_id))), + @as(usize, @bitCast(@as(isize, @intFromEnum(clk_id)))), @intFromPtr(tp), ); } diff --git a/lib/std/zig/llvm/Builder.zig b/lib/std/zig/llvm/Builder.zig index 61c4d1b44e3a..dd35ab617b12 100644 --- a/lib/std/zig/llvm/Builder.zig +++ b/lib/std/zig/llvm/Builder.zig @@ -42,6 +42,7 @@ next_unique_global_id: std.AutoHashMapUnmanaged(StrtabString, u32), aliases: std.ArrayListUnmanaged(Alias), variables: std.ArrayListUnmanaged(Variable), functions: std.ArrayListUnmanaged(Function), +comdats: std.ArrayListUnmanaged(Comdat), strtab_string_map: std.AutoArrayHashMapUnmanaged(void, void), strtab_string_indices: std.ArrayListUnmanaged(u32), @@ -2513,12 +2514,37 @@ pub const Alias = struct { }; }; +pub const Comdat = struct { + name: StrtabString, + kind: SelectionKind, + + /// Matches LLVM's bitc::ComdatSelectionKindCodes. + pub const SelectionKind = enum(u3) { + any = 1, + exactmatch = 2, + largest = 3, + nodeduplicate = 4, + samesize = 5, + }; + + pub const Index = enum(u32) { + /// Stored 1-based to match the bitcode encoding (0 = no comdat). + none = 0, + _, + + pub fn ptrConst(self: Index, builder: *const Builder) *const Comdat { + return &builder.comdats.items[@intFromEnum(self) - 1]; + } + }; +}; + pub const Variable = struct { global: Global.Index, thread_local: ThreadLocal = .default, mutability: Mutability = .global, init: Constant = .no_init, section: String = .none, + comdat: Comdat.Index = .none, alignment: Alignment = .default, pub const Index = enum(u32) { @@ -2595,6 +2621,10 @@ pub const Variable = struct { self.ptr(builder).section = section; } + pub fn setComdat(self: Index, comdat: Comdat.Index, builder: *Builder) void { + self.ptr(builder).comdat = comdat; + } + pub fn setAlignment(self: Index, alignment: Alignment, builder: *Builder) void { self.ptr(builder).alignment = alignment; } @@ -8603,6 +8633,7 @@ pub fn init(options: Options) Allocator.Error!Builder { .aliases = .{}, .variables = .{}, .functions = .{}, + .comdats = .{}, .strtab_string_map = .{}, .strtab_string_indices = .{}, @@ -8752,6 +8783,7 @@ pub fn deinit(self: *Builder) void { self.variables.deinit(self.gpa); for (self.functions.items) |*function| function.deinit(self.gpa); self.functions.deinit(self.gpa); + self.comdats.deinit(self.gpa); self.strtab_string_map.deinit(self.gpa); self.strtab_string_indices.deinit(self.gpa); @@ -9007,6 +9039,16 @@ pub fn addAliasAssumeCapacity( return alias_index; } +pub fn addComdat( + self: *Builder, + name: StrtabString, + kind: Comdat.SelectionKind, +) Allocator.Error!Comdat.Index { + assert(!name.isAnon()); + try self.comdats.append(self.gpa, .{ .name = name, .kind = kind }); + return @enumFromInt(self.comdats.items.len); +} + pub fn addVariable( self: *Builder, name: StrtabString, @@ -9564,6 +9606,14 @@ pub fn print(self: *Builder, w: *Writer) (Writer.Error || Allocator.Error)!void , .{ id.fmt(self), ty.fmt(self, .default) }); } + if (self.comdats.items.len > 0) { + if (need_newline) try w.writeByte('\n') else need_newline = true; + for (self.comdats.items) |comdat| try w.print( + \\${f} = comdat {s} + \\ + , .{ comdat.name.fmt(self, .quote_unless_valid_identifier), @tagName(comdat.kind) }); + } + if (self.variables.items.len > 0) { if (need_newline) try w.writeByte('\n') else need_newline = true; for (self.variables.items) |variable| { @@ -9572,7 +9622,7 @@ pub fn print(self: *Builder, w: *Writer) (Writer.Error || Allocator.Error)!void metadata_formatter.need_comma = true; defer metadata_formatter.need_comma = undefined; try w.print( - \\{f} ={f}{f}{f}{f}{f}{f}{f}{f} {s} {f}{f}{f}{f} + \\{f} ={f}{f}{f}{f}{f}{f}{f}{f} {s} {f}{f}{s}{f}{f} \\ , .{ variable.global.fmt(self), @@ -9589,6 +9639,7 @@ pub fn print(self: *Builder, w: *Writer) (Writer.Error || Allocator.Error)!void @tagName(variable.mutability), global.type.fmt(self, .percent), variable.init.fmt(self, .{ .space = true }), + if (variable.comdat != .none) ", comdat" else "", variable.alignment.fmt(", "), try metadata_formatter.fmt("!dbg ", global.dbg, null), }); @@ -13663,6 +13714,18 @@ pub fn toBitcode(self: *Builder, allocator: Allocator, producer: Producer) bitco defer section_map.deinit(self.gpa); try section_map.ensureUnusedCapacity(self.gpa, globals.count()); + // COMDAT records must precede any global that references them by index. + for (self.comdats.items) |comdat| { + const name_index = comdat.name.toIndex().?; + const offset = self.strtab_string_indices.items[name_index]; + const size = self.strtab_string_indices.items[name_index + 1] - offset; + try module_block.writeAbbrev(Module.Comdat{ + .strtab_offset = offset, + .strtab_size = size, + .selection_kind = comdat.kind, + }); + } + for (self.variables.items) |variable| { if (variable.global.getReplacement(self) != .none) continue; @@ -13706,6 +13769,7 @@ pub fn toBitcode(self: *Builder, allocator: Allocator, producer: Producer) bitco .unnamed_addr = global.unnamed_addr, .externally_initialized = global.externally_initialized, .dllstorageclass = global.dll_storage_class, + .comdat = @intFromEnum(variable.comdat), .preemption = global.preemption, }); } diff --git a/lib/std/zig/llvm/ir.zig b/lib/std/zig/llvm/ir.zig index 824186efb876..d2c769c20cdf 100644 --- a/lib/std/zig/llvm/ir.zig +++ b/lib/std/zig/llvm/ir.zig @@ -193,6 +193,7 @@ pub const Module = struct { Variable, Function, Alias, + Comdat, }; pub const Version = struct { @@ -211,6 +212,18 @@ pub const Module = struct { string: []const u8, }; + pub const Comdat = struct { + pub const ops = [_]AbbrevOp{ + .{ .literal = 12 }, // MODULE_CODE_COMDAT + .{ .vbr = 16 }, // strtab_offset + .{ .vbr = 16 }, // strtab_size + .{ .fixed = @bitSizeOf(Builder.Comdat.SelectionKind) }, + }; + strtab_offset: usize, + strtab_size: usize, + selection_kind: Builder.Comdat.SelectionKind, + }; + pub const Variable = struct { const AddrSpaceAndIsConst = packed struct { is_const: bool, @@ -233,7 +246,7 @@ pub const Module = struct { .{ .fixed = @bitSizeOf(Builder.UnnamedAddr) }, .{ .fixed = @bitSizeOf(Builder.ExternallyInitialized) }, .{ .fixed = @bitSizeOf(Builder.DllStorageClass) }, - .{ .literal = 0 }, // comdat + .{ .vbr = 16 }, // comdat .{ .literal = 0 }, // attributes .{ .fixed = @bitSizeOf(Builder.Preemption) }, }; @@ -250,6 +263,7 @@ pub const Module = struct { unnamed_addr: Builder.UnnamedAddr, externally_initialized: Builder.ExternallyInitialized, dllstorageclass: Builder.DllStorageClass, + comdat: u32, preemption: Builder.Preemption, }; diff --git a/src/Air.zig b/src/Air.zig index 77080386384d..ec2dc44f252e 100644 --- a/src/Air.zig +++ b/src/Air.zig @@ -2154,8 +2154,15 @@ pub fn unwrapShuffleTwo(air: *const Air, zcu: *const Zcu, inst_index: Inst.Index } pub const typesFullyResolved = types_resolved.typesFullyResolved; -pub const typeFullyResolved = types_resolved.checkType; -pub const valFullyResolved = types_resolved.checkVal; +pub const resolveTypesFully = types_resolved.resolveTypesFully; +/// `checkType`/`checkVal` only allocate when `tls_resolve_pt` is set (i.e. via +/// `resolveTypesFully`); these wrappers are for the non-resolving query path. +pub fn typeFullyResolved(ty: Type, zcu: *Zcu) bool { + return types_resolved.checkType(ty, zcu) catch unreachable; +} +pub fn valFullyResolved(val: Value, zcu: *Zcu) bool { + return types_resolved.checkVal(val, zcu) catch unreachable; +} pub const legalize = Legalize.legalize; pub const write = print.write; pub const writeInst = print.writeInst; diff --git a/src/Air/types_resolved.zig b/src/Air/types_resolved.zig index 44669b82df87..cd53a274a62f 100644 --- a/src/Air/types_resolved.zig +++ b/src/Air/types_resolved.zig @@ -1,3 +1,5 @@ +const std = @import("std"); +const Allocator = std.mem.Allocator; const Air = @import("../Air.zig"); const Zcu = @import("../Zcu.zig"); const Type = @import("../Type.zig"); @@ -7,10 +9,27 @@ const InternPool = @import("../InternPool.zig"); /// Given a body of AIR instructions, returns whether all type resolution necessary for codegen is complete. /// If `false`, then type resolution must have failed, so codegen cannot proceed. pub fn typesFullyResolved(air: Air, zcu: *Zcu) bool { - return checkBody(air, air.getMainBody(), zcu); + // `tls_resolve_pt` is null here, so `resolveFully` is never called and + // `checkBody` cannot return `error.OutOfMemory`. + return checkBody(air, air.getMainBody(), zcu) catch unreachable; } -fn checkBody(air: Air, body: []const Air.Inst.Index, zcu: *Zcu) bool { +/// Under parallel Sema, `resolve_type_fully` and `codegen_func` run +/// concurrently, so types may be mid-resolution rather than failed. Walk the +/// same AIR shape as `typesFullyResolved` but force-resolve each struct/union +/// (blocking on `claimOrWait`-gated resolution). Returns false only if +/// resolution itself errors. +pub fn resolveTypesFully(air: Air, pt: Zcu.PerThread) Allocator.Error!bool { + tls_resolve_pt = pt; + defer tls_resolve_pt = null; + return checkBody(air, air.getMainBody(), pt.zcu); +} + +/// `checkType` is reached via a long instruction walk; thread the optional +/// PerThread via tls instead of plumbing it through every switch arm. +threadlocal var tls_resolve_pt: ?Zcu.PerThread = null; + +fn checkBody(air: Air, body: []const Air.Inst.Index, zcu: *Zcu) Allocator.Error!bool { const tags = air.instructions.items(.tag); const datas = air.instructions.items(.data); @@ -20,7 +39,7 @@ fn checkBody(air: Air, body: []const Air.Inst.Index, zcu: *Zcu) bool { .inferred_alloc, .inferred_alloc_comptime => unreachable, .arg => { - if (!checkType(data.arg.ty.toType(), zcu)) return false; + if (!try checkType(data.arg.ty.toType(), zcu)) return false; }, .add, @@ -89,8 +108,8 @@ fn checkBody(air: Air, body: []const Air.Inst.Index, zcu: *Zcu) bool { .atomic_store_release, .atomic_store_seq_cst, => { - if (!checkRef(data.bin_op.lhs, zcu)) return false; - if (!checkRef(data.bin_op.rhs, zcu)) return false; + if (!try checkRef(data.bin_op.lhs, zcu)) return false; + if (!try checkRef(data.bin_op.rhs, zcu)) return false; }, .not, @@ -139,15 +158,15 @@ fn checkBody(air: Air, body: []const Air.Inst.Index, zcu: *Zcu) bool { .c_va_arg, .c_va_copy, => { - if (!checkType(data.ty_op.ty.toType(), zcu)) return false; - if (!checkRef(data.ty_op.operand, zcu)) return false; + if (!try checkType(data.ty_op.ty.toType(), zcu)) return false; + if (!try checkRef(data.ty_op.operand, zcu)) return false; }, .alloc, .ret_ptr, .c_va_start, => { - if (!checkType(data.ty, zcu)) return false; + if (!try checkType(data.ty, zcu)) return false; }, .ptr_add, @@ -161,17 +180,17 @@ fn checkBody(air: Air, body: []const Air.Inst.Index, zcu: *Zcu) bool { .ptr_elem_ptr, => { const bin = air.extraData(Air.Bin, data.ty_pl.payload).data; - if (!checkType(data.ty_pl.ty.toType(), zcu)) return false; - if (!checkRef(bin.lhs, zcu)) return false; - if (!checkRef(bin.rhs, zcu)) return false; + if (!try checkType(data.ty_pl.ty.toType(), zcu)) return false; + if (!try checkRef(bin.lhs, zcu)) return false; + if (!try checkRef(bin.rhs, zcu)) return false; }, .block, .loop, => { const extra = air.extraData(Air.Block, data.ty_pl.payload); - if (!checkType(data.ty_pl.ty.toType(), zcu)) return false; - if (!checkBody( + if (!try checkType(data.ty_pl.ty.toType(), zcu)) return false; + if (!try checkBody( air, @ptrCast(air.extra.items[extra.end..][0..extra.data.body_len]), zcu, @@ -180,8 +199,8 @@ fn checkBody(air: Air, body: []const Air.Inst.Index, zcu: *Zcu) bool { .dbg_inline_block => { const extra = air.extraData(Air.DbgInlineBlock, data.ty_pl.payload); - if (!checkType(data.ty_pl.ty.toType(), zcu)) return false; - if (!checkBody( + if (!try checkType(data.ty_pl.ty.toType(), zcu)) return false; + if (!try checkBody( air, @ptrCast(air.extra.items[extra.end..][0..extra.data.body_len]), zcu, @@ -221,51 +240,51 @@ fn checkBody(air: Air, body: []const Air.Inst.Index, zcu: *Zcu) bool { .c_va_end, .set_err_return_trace, => { - if (!checkRef(data.un_op, zcu)) return false; + if (!try checkRef(data.un_op, zcu)) return false; }, .br, .switch_dispatch => { - if (!checkRef(data.br.operand, zcu)) return false; + if (!try checkRef(data.br.operand, zcu)) return false; }, .cmp_vector, .cmp_vector_optimized, => { const extra = air.extraData(Air.VectorCmp, data.ty_pl.payload).data; - if (!checkType(data.ty_pl.ty.toType(), zcu)) return false; - if (!checkRef(extra.lhs, zcu)) return false; - if (!checkRef(extra.rhs, zcu)) return false; + if (!try checkType(data.ty_pl.ty.toType(), zcu)) return false; + if (!try checkRef(extra.lhs, zcu)) return false; + if (!try checkRef(extra.rhs, zcu)) return false; }, .reduce, .reduce_optimized, => { - if (!checkRef(data.reduce.operand, zcu)) return false; + if (!try checkRef(data.reduce.operand, zcu)) return false; }, .struct_field_ptr, .struct_field_val, => { const extra = air.extraData(Air.StructField, data.ty_pl.payload).data; - if (!checkType(data.ty_pl.ty.toType(), zcu)) return false; - if (!checkRef(extra.struct_operand, zcu)) return false; + if (!try checkType(data.ty_pl.ty.toType(), zcu)) return false; + if (!try checkRef(extra.struct_operand, zcu)) return false; }, .shuffle_one => { const unwrapped = air.unwrapShuffleOne(zcu, inst); - if (!checkType(unwrapped.result_ty, zcu)) return false; - if (!checkRef(unwrapped.operand, zcu)) return false; + if (!try checkType(unwrapped.result_ty, zcu)) return false; + if (!try checkRef(unwrapped.operand, zcu)) return false; for (unwrapped.mask) |m| switch (m.unwrap()) { .elem => {}, - .value => |val| if (!checkVal(.fromInterned(val), zcu)) return false, + .value => |val| if (!try checkVal(.fromInterned(val), zcu)) return false, }; }, .shuffle_two => { const unwrapped = air.unwrapShuffleTwo(zcu, inst); - if (!checkType(unwrapped.result_ty, zcu)) return false; - if (!checkRef(unwrapped.operand_a, zcu)) return false; - if (!checkRef(unwrapped.operand_b, zcu)) return false; + if (!try checkType(unwrapped.result_ty, zcu)) return false; + if (!try checkRef(unwrapped.operand_a, zcu)) return false; + if (!try checkRef(unwrapped.operand_b, zcu)) return false; // No values to check because there are no comptime-known values other than undef }, @@ -273,73 +292,73 @@ fn checkBody(air: Air, body: []const Air.Inst.Index, zcu: *Zcu) bool { .cmpxchg_strong, => { const extra = air.extraData(Air.Cmpxchg, data.ty_pl.payload).data; - if (!checkType(data.ty_pl.ty.toType(), zcu)) return false; - if (!checkRef(extra.ptr, zcu)) return false; - if (!checkRef(extra.expected_value, zcu)) return false; - if (!checkRef(extra.new_value, zcu)) return false; + if (!try checkType(data.ty_pl.ty.toType(), zcu)) return false; + if (!try checkRef(extra.ptr, zcu)) return false; + if (!try checkRef(extra.expected_value, zcu)) return false; + if (!try checkRef(extra.new_value, zcu)) return false; }, .aggregate_init => { const ty = data.ty_pl.ty.toType(); const elems_len: usize = @intCast(ty.arrayLen(zcu)); const elems: []const Air.Inst.Ref = @ptrCast(air.extra.items[data.ty_pl.payload..][0..elems_len]); - if (!checkType(ty, zcu)) return false; + if (!try checkType(ty, zcu)) return false; if (ty.zigTypeTag(zcu) == .@"struct") { for (elems, 0..) |elem, elem_idx| { if (ty.structFieldIsComptime(elem_idx, zcu)) continue; - if (!checkRef(elem, zcu)) return false; + if (!try checkRef(elem, zcu)) return false; } } else { for (elems) |elem| { - if (!checkRef(elem, zcu)) return false; + if (!try checkRef(elem, zcu)) return false; } } }, .union_init => { const extra = air.extraData(Air.UnionInit, data.ty_pl.payload).data; - if (!checkType(data.ty_pl.ty.toType(), zcu)) return false; - if (!checkRef(extra.init, zcu)) return false; + if (!try checkType(data.ty_pl.ty.toType(), zcu)) return false; + if (!try checkRef(extra.init, zcu)) return false; }, .field_parent_ptr => { const extra = air.extraData(Air.FieldParentPtr, data.ty_pl.payload).data; - if (!checkType(data.ty_pl.ty.toType(), zcu)) return false; - if (!checkRef(extra.field_ptr, zcu)) return false; + if (!try checkType(data.ty_pl.ty.toType(), zcu)) return false; + if (!try checkRef(extra.field_ptr, zcu)) return false; }, .atomic_load => { - if (!checkRef(data.atomic_load.ptr, zcu)) return false; + if (!try checkRef(data.atomic_load.ptr, zcu)) return false; }, .prefetch => { - if (!checkRef(data.prefetch.ptr, zcu)) return false; + if (!try checkRef(data.prefetch.ptr, zcu)) return false; }, .vector_store_elem => { const bin = air.extraData(Air.Bin, data.vector_store_elem.payload).data; - if (!checkRef(data.vector_store_elem.vector_ptr, zcu)) return false; - if (!checkRef(bin.lhs, zcu)) return false; - if (!checkRef(bin.rhs, zcu)) return false; + if (!try checkRef(data.vector_store_elem.vector_ptr, zcu)) return false; + if (!try checkRef(bin.lhs, zcu)) return false; + if (!try checkRef(bin.rhs, zcu)) return false; }, .runtime_nav_ptr => { - if (!checkType(.fromInterned(data.ty_nav.ty), zcu)) return false; + if (!try checkType(.fromInterned(data.ty_nav.ty), zcu)) return false; }, .select, .mul_add, => { const bin = air.extraData(Air.Bin, data.pl_op.payload).data; - if (!checkRef(data.pl_op.operand, zcu)) return false; - if (!checkRef(bin.lhs, zcu)) return false; - if (!checkRef(bin.rhs, zcu)) return false; + if (!try checkRef(data.pl_op.operand, zcu)) return false; + if (!try checkRef(bin.lhs, zcu)) return false; + if (!try checkRef(bin.rhs, zcu)) return false; }, .atomic_rmw => { const extra = air.extraData(Air.AtomicRmw, data.pl_op.payload).data; - if (!checkRef(data.pl_op.operand, zcu)) return false; - if (!checkRef(extra.operand, zcu)) return false; + if (!try checkRef(data.pl_op.operand, zcu)) return false; + if (!try checkRef(extra.operand, zcu)) return false; }, .call, @@ -349,21 +368,21 @@ fn checkBody(air: Air, body: []const Air.Inst.Index, zcu: *Zcu) bool { => { const extra = air.extraData(Air.Call, data.pl_op.payload); const args: []const Air.Inst.Ref = @ptrCast(air.extra.items[extra.end..][0..extra.data.args_len]); - if (!checkRef(data.pl_op.operand, zcu)) return false; - for (args) |arg| if (!checkRef(arg, zcu)) return false; + if (!try checkRef(data.pl_op.operand, zcu)) return false; + for (args) |arg| if (!try checkRef(arg, zcu)) return false; }, .dbg_var_ptr, .dbg_var_val, .dbg_arg_inline, => { - if (!checkRef(data.pl_op.operand, zcu)) return false; + if (!try checkRef(data.pl_op.operand, zcu)) return false; }, .@"try", .try_cold => { const extra = air.extraData(Air.Try, data.pl_op.payload); - if (!checkRef(data.pl_op.operand, zcu)) return false; - if (!checkBody( + if (!try checkRef(data.pl_op.operand, zcu)) return false; + if (!try checkBody( air, @ptrCast(air.extra.items[extra.end..][0..extra.data.body_len]), zcu, @@ -372,9 +391,9 @@ fn checkBody(air: Air, body: []const Air.Inst.Index, zcu: *Zcu) bool { .try_ptr, .try_ptr_cold => { const extra = air.extraData(Air.TryPtr, data.ty_pl.payload); - if (!checkType(data.ty_pl.ty.toType(), zcu)) return false; - if (!checkRef(extra.data.ptr, zcu)) return false; - if (!checkBody( + if (!try checkType(data.ty_pl.ty.toType(), zcu)) return false; + if (!try checkRef(extra.data.ptr, zcu)) return false; + if (!try checkBody( air, @ptrCast(air.extra.items[extra.end..][0..extra.data.body_len]), zcu, @@ -383,13 +402,13 @@ fn checkBody(air: Air, body: []const Air.Inst.Index, zcu: *Zcu) bool { .cond_br => { const extra = air.extraData(Air.CondBr, data.pl_op.payload); - if (!checkRef(data.pl_op.operand, zcu)) return false; - if (!checkBody( + if (!try checkRef(data.pl_op.operand, zcu)) return false; + if (!try checkBody( air, @ptrCast(air.extra.items[extra.end..][0..extra.data.then_body_len]), zcu, )) return false; - if (!checkBody( + if (!try checkBody( air, @ptrCast(air.extra.items[extra.end + extra.data.then_body_len ..][0..extra.data.else_body_len]), zcu, @@ -398,29 +417,29 @@ fn checkBody(air: Air, body: []const Air.Inst.Index, zcu: *Zcu) bool { .switch_br, .loop_switch_br => { const switch_br = air.unwrapSwitch(inst); - if (!checkRef(switch_br.operand, zcu)) return false; + if (!try checkRef(switch_br.operand, zcu)) return false; var it = switch_br.iterateCases(); while (it.next()) |case| { - for (case.items) |item| if (!checkRef(item, zcu)) return false; + for (case.items) |item| if (!try checkRef(item, zcu)) return false; for (case.ranges) |range| { - if (!checkRef(range[0], zcu)) return false; - if (!checkRef(range[1], zcu)) return false; + if (!try checkRef(range[0], zcu)) return false; + if (!try checkRef(range[1], zcu)) return false; } - if (!checkBody(air, case.body, zcu)) return false; + if (!try checkBody(air, case.body, zcu)) return false; } - if (!checkBody(air, it.elseBody(), zcu)) return false; + if (!try checkBody(air, it.elseBody(), zcu)) return false; }, .assembly => { const extra = air.extraData(Air.Asm, data.ty_pl.payload); - if (!checkType(data.ty_pl.ty.toType(), zcu)) return false; + if (!try checkType(data.ty_pl.ty.toType(), zcu)) return false; // Luckily, we only care about the inputs and outputs, so we don't have to do // the whole null-terminated string dance. const outputs_len = extra.data.flags.outputs_len; const outputs: []const Air.Inst.Ref = @ptrCast(air.extra.items[extra.end..][0..outputs_len]); const inputs: []const Air.Inst.Ref = @ptrCast(air.extra.items[extra.end + outputs_len ..][0..extra.data.inputs_len]); - for (outputs) |output| if (output != .none and !checkRef(output, zcu)) return false; - for (inputs) |input| if (input != .none and !checkRef(input, zcu)) return false; + for (outputs) |output| if (output != .none and !try checkRef(output, zcu)) return false; + for (inputs) |input| if (input != .none and !try checkRef(input, zcu)) return false; }, .trap, @@ -444,7 +463,7 @@ fn checkBody(air: Air, body: []const Air.Inst.Index, zcu: *Zcu) bool { return true; } -fn checkRef(ref: Air.Inst.Ref, zcu: *Zcu) bool { +fn checkRef(ref: Air.Inst.Ref, zcu: *Zcu) Allocator.Error!bool { const ip_index = ref.toInterned() orelse { // This operand refers back to a previous instruction. // We have already checked that instruction's type. @@ -454,11 +473,11 @@ fn checkRef(ref: Air.Inst.Ref, zcu: *Zcu) bool { return checkVal(Value.fromInterned(ip_index), zcu); } -pub fn checkVal(val: Value, zcu: *Zcu) bool { +pub fn checkVal(val: Value, zcu: *Zcu) Allocator.Error!bool { const ty = val.typeOf(zcu); - if (!checkType(ty, zcu)) return false; + if (!try checkType(ty, zcu)) return false; if (val.isUndef(zcu)) return true; - if (ty.toIntern() == .type_type and !checkType(val.toType(), zcu)) return false; + if (ty.toIntern() == .type_type and !try checkType(val.toType(), zcu)) return false; // Check for lazy values switch (zcu.intern_pool.indexToKey(val.toIntern())) { .int => |int| switch (int.storage) { @@ -471,7 +490,7 @@ pub fn checkVal(val: Value, zcu: *Zcu) bool { } } -pub fn checkType(ty: Type, zcu: *Zcu) bool { +pub fn checkType(ty: Type, zcu: *Zcu) Allocator.Error!bool { const ip = &zcu.intern_pool; if (ty.isGenericPoison()) return true; return switch (ty.zigTypeTag(zcu)) { @@ -507,12 +526,19 @@ pub fn checkType(ty: Type, zcu: *Zcu) bool { const info = zcu.typeToFunc(ty).?; for (0..info.param_types.len) |i| { const param_ty = info.param_types.get(ip)[i]; - if (!checkType(Type.fromInterned(param_ty), zcu)) return false; + if (!try checkType(Type.fromInterned(param_ty), zcu)) return false; } return checkType(Type.fromInterned(info.return_type), zcu); }, .@"struct" => switch (ip.indexToKey(ty.toIntern())) { .struct_type => { + if (tls_resolve_pt) |pt| { + ty.resolveFully(pt) catch |e| switch (e) { + error.OutOfMemory => |oom| return oom, + error.AnalysisFail => return false, + }; + return true; + } const struct_obj = zcu.typeToStruct(ty).?; return switch (struct_obj.layout) { .@"packed" => struct_obj.backingIntTypeUnordered(ip) != .none, @@ -524,12 +550,21 @@ pub fn checkType(ty: Type, zcu: *Zcu) bool { const field_is_comptime = tuple.values.get(ip)[i] != .none; if (field_is_comptime) continue; const field_ty = tuple.types.get(ip)[i]; - if (!checkType(Type.fromInterned(field_ty), zcu)) return false; + if (!try checkType(Type.fromInterned(field_ty), zcu)) return false; } return true; }, else => unreachable, }, - .@"union" => return zcu.typeToUnion(ty).?.flagsUnordered(ip).status == .fully_resolved, + .@"union" => { + if (tls_resolve_pt) |pt| { + ty.resolveFully(pt) catch |e| switch (e) { + error.OutOfMemory => |oom| return oom, + error.AnalysisFail => return false, + }; + return true; + } + return zcu.typeToUnion(ty).?.flagsUnordered(ip).status == .fully_resolved; + }, }; } diff --git a/src/Compilation.zig b/src/Compilation.zig index 4f4362459c3b..ac1871ea983b 100644 --- a/src/Compilation.zig +++ b/src/Compilation.zig @@ -43,6 +43,7 @@ const Zir = std.zig.Zir; const Air = @import("Air.zig"); const Builtin = @import("Builtin.zig"); const LlvmObject = @import("codegen/llvm.zig").Object; +const LlvmPartitionSet = @import("codegen/llvm.zig").PartitionSet; const dev = @import("dev.zig"); const DeprecatedLinearFifo = @import("deprecated.zig").LinearFifo; @@ -125,6 +126,12 @@ work_queues: [ break :len len; } ]DeprecatedLinearFifo(Job), +/// Protects `work_queues` when Sema runs on worker threads and calls `queueJob`. +work_queue_mutex: std.Thread.Mutex = .{}, +/// Signalled by `queueJob` and when `sema_pending_jobs` reaches 0, so the +/// dispatch loop in `performAllTheWork` parks instead of busy-spinning on +/// `Thread.yield()` while parallel-Sema workers are running. +work_queue_cond: std.Thread.Condition = .{}, /// These jobs are to invoke the Clang compiler to create an object file, which /// gets linked with the Compilation. @@ -265,11 +272,19 @@ link_prog_node: std.Progress.Node = std.Progress.Node.none, llvm_opt_bisect_limit: c_int, llvm_codegen_threads: u32, +llvm_shard_stats: bool, no_link_obj: bool, +/// When true, the N shard `.o` files emitted by partitioned LLVM codegen are +/// left as-is (no relocatable -r merge). They land at `{emit}.{i}.o` next to +/// the would-be merged output. The downstream linker consumes them directly. +no_merge_shards: bool, time_report: ?TimeReport, file_system_inputs: ?*std.ArrayListUnmanaged(u8), +/// Guards `file_system_inputs` appends. Called from `newEmbedFile` (sema +/// workers) and C-object workers concurrently under parallel sema. +file_system_inputs_mutex: std.Thread.Mutex = .{}, /// This is the digest of the cache for the current compilation. /// This digest will be known after update() is called. @@ -1729,7 +1744,9 @@ pub const CreateOptions = struct { linker_print_map: bool = false, llvm_opt_bisect_limit: i32 = -1, llvm_codegen_threads: u32 = 0, + llvm_shard_stats: bool = false, no_link_obj: bool = false, + llvm_no_merge_shards: bool = false, build_id: ?std.zig.BuildId = null, disable_c_depfile: bool = false, linker_z_nodelete: bool = false, @@ -2171,6 +2188,12 @@ pub fn create(gpa: Allocator, arena: Allocator, diag: *CreateDiagnostic, options cache.hash.add(options.emit_llvm_ir != .no); cache.hash.add(options.emit_llvm_bc != .no); cache.hash.add(options.emit_docs != .no); + // Sharded codegen changes the output file *set* (one merged object vs. + // N shard objects), so the count and the merge/no-link knobs must be + // part of the cache key. + cache.hash.add(options.llvm_codegen_threads); + cache.hash.add(options.llvm_no_merge_shards); + cache.hash.add(options.no_link_obj); // TODO audit this and make sure everything is in it const main_mod = options.main_mod orelse options.root_mod; @@ -2298,7 +2321,15 @@ pub fn create(gpa: Allocator, arena: Allocator, diag: *CreateDiagnostic, options .framework_dirs = options.framework_dirs, .llvm_opt_bisect_limit = options.llvm_opt_bisect_limit, .llvm_codegen_threads = options.llvm_codegen_threads, - .no_link_obj = options.no_link_obj, + .llvm_shard_stats = options.llvm_shard_stats, + // Partitioned LLVM output produces N objects which must be merged + // by the linker for a single-.o result, so the no-link shortcut + // does not apply unless `--llvm-no-merge-shards` is also set, in + // which case the N shard `.o` files are emitted directly to the + // final location and the relocatable merge is skipped entirely. + .no_link_obj = options.no_link_obj and + (options.llvm_codegen_threads <= 1 or options.llvm_no_merge_shards), + .no_merge_shards = options.llvm_no_merge_shards and options.llvm_codegen_threads > 1, .skip_linker_dependencies = options.skip_linker_dependencies, .queued_jobs = .{}, .function_sections = options.function_sections, @@ -2506,7 +2537,16 @@ pub fn create(gpa: Allocator, arena: Allocator, diag: *CreateDiagnostic, options if (use_llvm) { if (opt_zcu) |zcu| { - zcu.llvm_object = try LlvmObject.create(arena, comp); + // Multi-shard emission only supports producing N object files + // for the linker; IR/BC/asm requests for a single output would + // silently drop shards 1..N. Clamp to 1 in that case. + const single_artifact_only = options.emit_bin == .no and + (options.emit_llvm_ir != .no or options.emit_llvm_bc != .no or options.emit_asm != .no); + const n_shards: u32 = if (options.llvm_codegen_threads <= 1 or single_artifact_only) + 1 + else + options.llvm_codegen_threads; + zcu.llvm_object = try LlvmPartitionSet.create(arena, comp, n_shards); } } @@ -3129,7 +3169,13 @@ pub fn update(comp: *Compilation, main_progress_node: std.Progress.Node) UpdateE try pt.populateTestFunctions(); } + comp.phaseTimingC("update.processExports.start"); try pt.processExports(); + comp.phaseTimingC("update.processExports.done"); + } + + if (comp.llvm_shard_stats or std.process.hasNonEmptyEnvVarConstant("ZIG_JOB_STATS")) { + comp.dumpLlvmShardStats(zcu); } if (build_options.enable_debug_extensions and comp.verbose_intern_pool) { @@ -3267,9 +3313,70 @@ pub fn update(comp: *Compilation, main_progress_node: std.Progress.Node) UpdateE } } +fn dumpLlvmShardStats(comp: *Compilation, zcu: *Zcu) void { + const ip = &zcu.intern_pool; + const n: u32 = @min(if (comp.llvm_codegen_threads > 1) comp.llvm_codegen_threads else 16, 256); + var counts = [_]u32{0} ** 256; + var top_file = [_]?*Zcu.File{null} ** 256; + var top_file_count = [_]u32{0} ** 256; + + const PerFileKey = struct { file: *Zcu.File, shard: u8 }; + var per_file = std.AutoHashMap(PerFileKey, u32).init(comp.gpa); + defer per_file.deinit(); + + const total_navs = ip.navCount(); + var skipped: u32 = 0; + var i: u32 = 0; + while (i < total_navs) : (i += 1) { + const nav_index = ip.navIndexFromOrdinal(i); + const nav = ip.getNav(nav_index); + if (nav.status == .unresolved) { + skipped += 1; + continue; + } + const shard: u8 = @intCast(zcu.navShard(nav_index, n)); + counts[shard] += 1; + const file = zcu.fileByIndex(nav.srcInst(ip).resolveFile(ip)); + const gop = per_file.getOrPut(.{ .file = file, .shard = shard }) catch continue; + if (!gop.found_existing) gop.value_ptr.* = 0; + gop.value_ptr.* += 1; + if (gop.value_ptr.* > top_file_count[shard]) { + top_file_count[shard] = gop.value_ptr.*; + top_file[shard] = file; + } + } + + var min: u32 = std.math.maxInt(u32); + var max: u32 = 0; + var nonempty: u32 = 0; + for (counts[0..n]) |c| { + if (c == 0) continue; + nonempty += 1; + min = @min(min, c); + max = @max(max, c); + } + std.debug.print("llvm-shard-stats for '{s}': n={d} navs={d} skipped={d} nonempty_shards={d}\n", .{ + comp.root_name, n, total_navs - skipped, skipped, nonempty, + }); + for (counts[0..n], 0..) |c, s| { + if (c == 0) continue; + var buf: [512]u8 = undefined; + const key = if (top_file[s]) |f| f.shardKey(&buf) else ""; + std.debug.print(" shard {d:>3}: {d:>6} navs (top file '{s}' = {d})\n", .{ + s, c, key, top_file_count[s], + }); + } + if (min != std.math.maxInt(u32)) { + const ratio = @as(f64, @floatFromInt(max)) / @as(f64, @floatFromInt(min)); + std.debug.print(" max/min ratio: {d:.2} (max={d}, min={d})\n", .{ ratio, max, min }); + } +} + pub fn appendFileSystemInput(comp: *Compilation, path: Compilation.Path) Allocator.Error!void { const gpa = comp.gpa; const fsi = comp.file_system_inputs orelse return; + comp.file_system_inputs_mutex.lock(); + defer comp.file_system_inputs_mutex.unlock(); const prefixes = comp.cache_parent.prefixes(); const want_prefix_dir: Cache.Directory = switch (path.root) { @@ -3336,6 +3443,7 @@ fn flush( arena: Allocator, tid: Zcu.PerThread.Id, ) Allocator.Error!void { + comp.phaseTimingC("flush.start"); if (comp.zcu) |zcu| { if (zcu.llvm_object) |llvm_object| { const pt: Zcu.PerThread = .activate(zcu, tid); @@ -3364,19 +3472,22 @@ fn flush( }; // Generate parallel codegen output filenames if enabled - const bin_path_list: ?[]const [*:0]const u8 = if (comp.llvm_codegen_threads > 1 and base_bin_path != null) blk: { - const num_threads = comp.llvm_codegen_threads; + const bin_path_list: ?[]const [*:0]const u8 = if (llvm_object.n > 1 and base_bin_path != null) blk: { + const num_threads = llvm_object.n; const list = try arena.alloc([*:0]const u8, num_threads); const base_path_slice = std.mem.sliceTo(base_bin_path.?, 0); - // Strip .o extension if present - const base_name: []const u8 = if (std.mem.endsWith(u8, base_path_slice, ".o")) - base_path_slice[0 .. base_path_slice.len - 2] + // Strip the target's object-file extension (.o for ELF/Mach-O, + // .obj for COFF) so shards become `{stem}.{i}{ext}`. + const target = &comp.root_mod.resolved_target.result; + const obj_ext = target.ofmt.fileExt(target.cpu.arch); + const base_name: []const u8 = if (std.mem.endsWith(u8, base_path_slice, obj_ext)) + base_path_slice[0 .. base_path_slice.len - obj_ext.len] else base_path_slice; for (0..num_threads) |i| { - list[i] = (try std.fmt.allocPrintSentinel(arena, "{s}.{d}.o", .{base_name, i}, 0)).ptr; + list[i] = (try std.fmt.allocPrintSentinel(arena, "{s}.{d}{s}", .{ base_name, i, obj_ext }, 0)).ptr; } break :blk list; } else null; @@ -3414,6 +3525,7 @@ fn flush( error.LinkFailure => {}, // Already reported. error.OutOfMemory => return error.OutOfMemory, }; + comp.phaseTimingC("flush.llvm_emit_done"); } } if (comp.bin_file) |lf| { @@ -3430,7 +3542,14 @@ fn flush( error.LinkFailure => {}, // Already reported. error.OutOfMemory => return error.OutOfMemory, }; + } else if (comp.no_merge_shards) { + // Shard objects went to `{emit}.{i}.o`; the 0-byte stub the linker + // created at `{emit}` during open() will never be flushed. Remove + // it so downstream build systems globbing `{emit}.*.o` aren't + // confused by an empty object alongside the real shards. + lf.emit.root_dir.handle.deleteFile(lf.emit.sub_path) catch {}; } + comp.phaseTimingC("flush.lf_flush_done"); } if (comp.zcu) |zcu| { try link.File.C.flushEmitH(zcu); @@ -4629,10 +4748,20 @@ pub fn unableToLoadZcuFile( }); } +pub fn phaseTiming(label: []const u8) void { + if (!std.process.hasNonEmptyEnvVarConstant("ZIG_PHASE_TIMING")) return; + std.debug.print("[PHASE] {d} - {s}\n", .{ std.time.milliTimestamp(), label }); +} +fn phaseTimingC(comp: *const Compilation, label: []const u8) void { + if (!std.process.hasNonEmptyEnvVarConstant("ZIG_PHASE_TIMING")) return; + std.debug.print("[PHASE] {d} {s} {s}\n", .{ std.time.milliTimestamp(), comp.root_name, label }); +} + fn performAllTheWork( comp: *Compilation, main_progress_node: std.Progress.Node, ) JobError!void { + comp.phaseTimingC("performAllTheWork.start"); // Regardless of errors, `comp.zcu` needs to update its generation number. defer if (comp.zcu) |zcu| { zcu.generation += 1; @@ -4657,8 +4786,10 @@ fn performAllTheWork( var work_queue_wait_group: WaitGroup = .{}; defer work_queue_wait_group.wait(); + defer comp.phaseTimingC("performAllTheWork.codegen_wait_done"); comp.link_task_wait_group.reset(); defer comp.link_task_wait_group.wait(); + defer comp.phaseTimingC("performAllTheWork.work_loop_done"); // Already-queued prelink tasks comp.link_prog_node.increaseEstimatedTotalItems(comp.link_task_queue.queued_prelink.items.len); @@ -5059,13 +5190,78 @@ fn performAllTheWork( // Start the timer for the "decls" part of the pipeline (Sema, CodeGen, link). decl_work_timer = comp.startTimer(); } + comp.phaseTimingC("performAllTheWork.work_loop_start"); + if (comp.zcu) |zcu| { + // Sub-compilations (compiler_rt, ubsan_rt, etc.) and the build runner + // are small and gain nothing from parallel Sema. For `zig build`, the + // runner is `root_mod` (main_mod is the user's build.zig). + const is_build_runner = std.mem.endsWith(u8, zcu.root_mod.root_src_path, "build_runner.zig"); + zcu.parallel_sema = comp.parent_whole_cache == null and + !is_build_runner and + std.process.hasNonEmptyEnvVarConstant("ZIG_PARALLEL_SEMA"); + } + + var job_ns: [@typeInfo(Job.Tag).@"enum".fields.len]u64 = @splat(0); + var job_ct: [@typeInfo(Job.Tag).@"enum".fields.len]u64 = @splat(0); + var export_func_pass: u8 = 0; work: while (true) { - for (&comp.work_queues) |*work_queue| if (work_queue.readItem()) |job| { + const maybe_job: ?Job = job: { + comp.work_queue_mutex.lock(); + defer comp.work_queue_mutex.unlock(); + for (&comp.work_queues) |*work_queue| if (work_queue.readItem()) |job| break :job job; + break :job null; + }; + if (maybe_job) |job| { + if (comp.zcu) |zcu| if (zcu.parallel_sema and job == .analyze_func) { + // Skip dispatch if a worker already holds this unit (or it has + // since been analyzed) — re-queues from the retry path can + // produce duplicate analyze_func jobs and N-1 workers then + // condvar-wait on the one analyzer. + const a = zcu.intern_pool.funcAnalysisUnordered(job.analyze_func); + if (a.is_analyzed) { + _ = zcu.psema_skip_done.rmw(.Add, 1, .monotonic); + continue :work; + } + _ = zcu.psema_dispatched.rmw(.Add, 1, .monotonic); + _ = zcu.sema_pending_jobs.rmw(.Add, 1, .acquire); + comp.thread_pool.spawnWgId(&comp.link_task_wait_group, workerAnalyzeFunc, .{ comp, job.analyze_func }); + continue :work; + }; + const t0 = if (comp.llvm_shard_stats or std.process.hasNonEmptyEnvVarConstant("ZIG_JOB_STATS")) std.time.nanoTimestamp() else 0; try processOneJob(@intFromEnum(Zcu.PerThread.Id.main), comp, job); + if (comp.llvm_shard_stats or std.process.hasNonEmptyEnvVarConstant("ZIG_JOB_STATS")) { + job_ns[@intFromEnum(@as(Job.Tag, job))] += @intCast(std.time.nanoTimestamp() - t0); + job_ct[@intFromEnum(@as(Job.Tag, job))] += 1; + } continue :work; - }; + } if (comp.zcu) |zcu| { + if (zcu.sema_pending_jobs.load(.acquire) > 0) { + // Park until a worker enqueues new work or the last + // pending sema job finishes; busy-spinning here contended + // `work_queue_mutex` against every `queueJob` call. + comp.work_queue_mutex.lock(); + if (zcu.sema_pending_jobs.load(.acquire) > 0) { + var any: bool = false; + for (&comp.work_queues) |*q| if (q.count > 0) { + any = true; + break; + }; + if (!any) comp.work_queue_cond.wait(&comp.work_queue_mutex); + } + comp.work_queue_mutex.unlock(); + continue :work; + } + // A worker may have enqueued between our queue read and the + // counter dropping to zero; re-check the queues before exiting. + const drained = drained: { + comp.work_queue_mutex.lock(); + defer comp.work_queue_mutex.unlock(); + for (&comp.work_queues) |*q| if (q.count > 0) break :drained false; + break :drained true; + }; + if (!drained) continue :work; // If there's no work queued, check if there's anything outdated // which we need to work on, and queue it if so. if (try zcu.findOutdatedToAnalyze()) |outdated| { @@ -5080,17 +5276,57 @@ fn performAllTheWork( }); continue; } + // Final pass under parallel Sema: any exported function whose body + // analysis was dropped by a post-commit retry will not be in + // `nav_map` at processExports time. Re-queue here so the work loop + // drains it before we exit. + if (zcu.parallel_sema and export_func_pass < 3) { + export_func_pass += 1; + var any_queued = false; + for (zcu.single_exports.values()) |idx| { + any_queued = ensureExportFuncQueued(zcu, idx) or any_queued; + } + for (zcu.multi_exports.values()) |info| { + for (info.index..info.index + info.len) |i| { + any_queued = ensureExportFuncQueued(zcu, @enumFromInt(i)) or any_queued; + } + } + if (any_queued) continue; + } zcu.sema_prog_node.end(); zcu.sema_prog_node = .none; } break; } + if (comp.zcu) |zcu| { + if (std.process.hasNonEmptyEnvVarConstant("ZIG_PSEMA_STATS")) { + std.debug.print("[PSEMA] body_runs={d} yields={d} claim_waits={d} dispatched={d} skip_busy={d} skip_done={d}\n", .{ + zcu.psema_body_runs.load(.monotonic), + zcu.psema_yields.load(.monotonic), + zcu.psema_claim_waits.load(.monotonic), + zcu.psema_dispatched.load(.monotonic), + zcu.psema_skip_busy.load(.monotonic), + zcu.psema_skip_done.load(.monotonic), + }); + } + zcu.parallel_sema = false; + } + if (comp.llvm_shard_stats or std.process.hasNonEmptyEnvVarConstant("ZIG_JOB_STATS")) { + std.debug.print("=== work loop job timings (main thread) ===\n", .{}); + inline for (@typeInfo(Job.Tag).@"enum".fields, 0..) |f, i| { + if (job_ct[i] != 0) + std.debug.print(" {s:>24}: {d:>6}ms ({d} jobs)\n", .{ f.name, job_ns[i] / 1_000_000, job_ct[i] }); + } + } } const JobError = Allocator.Error; pub fn queueJob(comp: *Compilation, job: Job) !void { + comp.work_queue_mutex.lock(); + defer comp.work_queue_mutex.unlock(); try comp.work_queues[Job.stage(job)].writeItem(job); + comp.work_queue_cond.signal(); } pub fn queueJobs(comp: *Compilation, jobs: []const Job) !void { @@ -5108,7 +5344,26 @@ fn processOneJob(tid: usize, comp: *Compilation, job: Job) JobError!void { comp.link_prog_node.completeOne(); air.deinit(gpa); } - if (!air.typesFullyResolved(zcu)) { + // Under serial Sema, FIFO dispatch guarantees every + // `resolve_type_fully` queued before this body's analysis has + // completed, so `typesFullyResolved == false` means the type + // *failed*. Under parallel Sema both job kinds run concurrently — + // a struct or union may simply be mid-resolution. Dropping the + // body would leave a dangling cross-shard `__N