trinity/utils.c at master · kernelslacker/trinity · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#include <stdbool.h>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <stdint.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <signal.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include "child.h"
#include "debug.h"
#include "deferred-free.h"
#include "locks.h"
#include "objects.h"
#include "params.h"
#include "pc_format.h"
#include "pids.h"
#include "random.h"
#include "rnd.h"
#include "shm.h"
#include "stats.h"
#include "stats_ring.h"
#include "syscall.h"
#include "tables.h"
#include "trinity.h"
#include "utils.h"

/*
 * Use this allocator if you have an object a child writes to that you want
 * all other processes to see.
 *
 * Every allocation is tracked so that VM syscalls (munmap, madvise, mremap,
 * mprotect) can avoid clobbering trinity's own shared state.
 */

static struct {
	unsigned long addr;
	unsigned long size;
} shared_regions[MAX_SHARED_ALLOCS];
unsigned int nr_shared_regions;

/*
 * Bounded overflow tail for registrations that arrive once
 * shared_regions[] is full.  Exists so range_overlaps_shared() (via the
 * bitmap, which is still updated) and range_in_tracked_shared() (via the
 * linear walk extension below) keep protecting fuzzed mm syscalls from
 * clobbering the untracked region instead of silently failing open.
 *
 * Intentionally small: 256 slots is "absorb a moderately over-budget
 * fleet host long enough to fail loudly and tell the operator to raise
 * MAX_SHARED_ALLOCS or move to dynamic resize", not "a second pool to
 * keep growing into".  Exhausting the tail BUG()s in both debug and
 * release; under-protection of a writable shared mapping is the failure
 * class the whole tracker exists to prevent and is never preferable to
 * a loud abort.
 */
#define SHARED_REGIONS_OVERFLOW_TAIL 256

static struct {
	unsigned long addr;
	unsigned long size;
} shared_regions_overflow[SHARED_REGIONS_OVERFLOW_TAIL];
static unsigned int nr_shared_regions_overflow;

/*
 * Bitmap accelerator for range_overlaps_shared().  One bit per
 * SHARED_BITMAP_GRANULARITY-byte chunk of user VA; a set bit means at
 * least one byte in that chunk belongs to a registered shared region.
 *
 * The mm-syscall sanitisers (madvise/mremap/mprotect/munmap/mseal/mbind/
 * process_madvise/remap_file_pages/...) call range_overlaps_shared()
 * once per fuzzed call, often many times per child per second.  The
 * original linear scan over shared_regions[] is O(N) per query with N
 * easily reaching 100+ on a 32-child fleet (per-child childdata,
 * fd_event ring, kcov ring, plus the global reserve).  Replacing the
 * scan with this bitmap turns the hot path into one or two word loads
 * for the common single-page query.
 *
 * Granularity 2 MiB is the natural unit for the conservative
 * over-reject guarantee: any 2 MiB chunk that touches a shared region
 * gets its bit set, and a query whose footprint hits that chunk
 * rejects.  False positives are possible at chunk boundaries (a
 * non-shared page co-located in the same 2 MiB chunk as a shared
 * region rejects too), which is the SAFETY direction -- under-reject
 * would let a fuzzed mmap call clobber trinity's own shared state.
 *
 * Span 1<<47 covers the canonical x86_64 user VA on default
 * (4-level page table) kernels.  Regions registered outside the span
 * trip the BUG_ON in shared_bitmap_mark(); queries entirely outside
 * the span return false (no tracked region can live there because the
 * BUG_ON would have fired).  At 1 bit per 2 MiB, the bitmap is
 * 1<<26 bits = 8 MiB of BSS, but it is mostly zero pages: only the
 * 4 KiB pages that cover actually-set bits ever fault in, so true
 * resident growth is in the kilobytes for a typical fleet host where
 * shared regions cluster in the mmap arena near 0x7f000000....
 */
#define SHARED_BITMAP_GRANULARITY_LOG2	21UL	/* 2 MiB per bit */
#define SHARED_BITMAP_VA_LOG2		47UL	/* 128 TiB user VA span */
#define SHARED_BITMAP_VA_SPAN		(1UL << SHARED_BITMAP_VA_LOG2)
#define SHARED_BITMAP_NBITS		(SHARED_BITMAP_VA_SPAN >> SHARED_BITMAP_GRANULARITY_LOG2)
#define SHARED_BITMAP_BITS_PER_WORD	(8UL * sizeof(unsigned long))
#define SHARED_BITMAP_NWORDS		(SHARED_BITMAP_NBITS / SHARED_BITMAP_BITS_PER_WORD)

static unsigned long shared_region_bitmap[SHARED_BITMAP_NWORDS];

/*
 * Per-chunk refcount paired with shared_region_bitmap above.  Multiple
 * tracked regions may live in the same 2 MiB chunk (every alloc_shared
 * call rounds up to a chunk for bitmap purposes; nothing forbids two
 * adjacent mmaps landing in the same chunk).  The bit must stay set
 * until the LAST tracked region in the chunk is removed -- clearing it
 * on the first untrack would flip the safety invariant from
 * "over-reject" to "under-reject" for the surviving region in the
 * chunk, exactly the failure mode this whole guard exists to prevent.
 *
 * uint16_t covers the worst-case occupancy by a comfortable margin
 * (MAX_SHARED_ALLOCS + overflow tail = 4352 << 65535) and bumps BSS
 * from 8 MiB (the bitmap alone) to 8 MiB + 128 MiB.  Same lazy-faulting
 * argument as the bitmap: only chunks touched by registrations ever
 * fault their backing page in, so true resident growth stays in the
 * tens of KiB for the typical clustered fleet-host layout.
 */
static uint16_t shared_region_refcount[SHARED_BITMAP_NBITS];

static inline bool shared_bitmap_test(unsigned long bit)
{
	return (shared_region_bitmap[bit / SHARED_BITMAP_BITS_PER_WORD] >>
		(bit % SHARED_BITMAP_BITS_PER_WORD)) & 1UL;
}

static inline void shared_bitmap_set(unsigned long bit)
{
	shared_region_bitmap[bit / SHARED_BITMAP_BITS_PER_WORD] |=
		1UL << (bit % SHARED_BITMAP_BITS_PER_WORD);
}

static inline void shared_bitmap_clear(unsigned long bit)
{
	shared_region_bitmap[bit / SHARED_BITMAP_BITS_PER_WORD] &=
		~(1UL << (bit % SHARED_BITMAP_BITS_PER_WORD));
}

/*
 * Mark every 2 MiB chunk that intersects [addr, addr+size).  Called
 * from the tail of alloc_shared() and track_shared_region() so the
 * bitmap stays in sync with shared_regions[].  size==0 is a no-op
 * (matches the "empty region overlaps nothing" semantics callers rely
 * on).  An out-of-span registration BUG()s loudly: the linear-scan
 * predecessor would have caught such a region, so silently dropping it
 * here would flip the safety invariant from "over-reject" to
 * "under-reject" -- the exact failure mode this whole guard exists to
 * prevent.
 */
static void shared_bitmap_mark(unsigned long addr, unsigned long size)
{
	unsigned long end, first, last, bit;

	if (size == 0)
		return;

	if (addr >= SHARED_BITMAP_VA_SPAN ||
	    size > SHARED_BITMAP_VA_SPAN - addr) {
		outputerr("shared_bitmap_mark: region 0x%lx+0x%lx outside "
			  "1<<%lu user VA span; widen SHARED_BITMAP_VA_LOG2\n",
			  addr, size, SHARED_BITMAP_VA_LOG2);
		BUG("shared region outside bitmap span");
	}

	end = addr + size - 1;
	first = addr >> SHARED_BITMAP_GRANULARITY_LOG2;
	last = end >> SHARED_BITMAP_GRANULARITY_LOG2;

	for (bit = first; bit <= last; bit++) {
		if (shared_region_refcount[bit] == UINT16_MAX) {
			outputerr("shared_bitmap_mark: refcount overflow at "
				  "chunk %lu for region 0x%lx+0x%lx\n",
				  bit, addr, size);
			BUG("shared region refcount overflow");
		}
		shared_region_refcount[bit]++;
		shared_bitmap_set(bit);
	}
}

/*
 * Inverse of shared_bitmap_mark().  Decrements the per-chunk refcount
 * for every 2 MiB chunk the range spans and clears the bitmap bit only
 * once the chunk's last tracked region is gone.  Called from
 * untrack_shared_region() after a matching shared_regions[] slot has
 * been located, so an inconsistency (refcount==0 on a chunk the caller
 * believes it tracked) is a tree-state bug worth BUG()ing on rather
 * than silently masking -- a stuck bit with refcount==0 would falsely
 * reject every fuzzed mm syscall touching the chunk forever.  An
 * out-of-span unmark cannot occur in practice because shared_bitmap_
 * mark() BUG()s on out-of-span marks, so the symmetric guard here is
 * defence-in-depth, not a reachable path.
 */
static void shared_bitmap_unmark(unsigned long addr, unsigned long size)
{
	unsigned long end, first, last, bit;

	if (size == 0)
		return;

	if (addr >= SHARED_BITMAP_VA_SPAN ||
	    size > SHARED_BITMAP_VA_SPAN - addr) {
		outputerr("shared_bitmap_unmark: region 0x%lx+0x%lx outside "
			  "1<<%lu user VA span\n",
			  addr, size, SHARED_BITMAP_VA_LOG2);
		BUG("shared region outside bitmap span");
	}

	end = addr + size - 1;
	first = addr >> SHARED_BITMAP_GRANULARITY_LOG2;
	last = end >> SHARED_BITMAP_GRANULARITY_LOG2;

	for (bit = first; bit <= last; bit++) {
		if (shared_region_refcount[bit] == 0) {
			outputerr("shared_bitmap_unmark: refcount underflow at "
				  "chunk %lu for region 0x%lx+0x%lx\n",
				  bit, addr, size);
			BUG("shared region refcount underflow");
		}
		if (--shared_region_refcount[bit] == 0)
			shared_bitmap_clear(bit);
	}
}

/*
 * Size-bucket bitmap accelerator for range_overlaps_shared(): companion
 * to the address-keyed shared_region_bitmap above.  Bit i is set
 * whenever at least one tracked shared region currently falls into
 * size bucket i, where bucket i = floor(log2(len)) and covers regions
 * of len in [2^i, 2^(i+1)).  An empty bitmap (no tracked region of any
 * size) is the useful negative the address bitmap has to discover one
 * word at a time: one load here short-circuits the SHARED_BITMAP_NWORDS
 * word-scan over a multi-MiB query, plus the downstream byte-precise
 * walk that confirms a bitmap hit.
 *
 * Distinct concern from shared_region_bitmap above.  That bitmap
 * encodes WHERE tracked regions live (one bit per 2 MiB chunk of user
 * VA); this one encodes only WHETHER any tracked region exists in each
 * size class.  The two are wired in pairs: every register
 * (alloc_shared, track_shared_region, register_shared_overflow) calls
 * shared_bitmap_mark() AND tracked_size_mark(); every untrack (the
 * regular slot AND the overflow tail path in untrack_shared_region)
 * calls shared_bitmap_unmark() AND tracked_size_unmark().  Forgetting
 * the parallel call in a future refactor flips the size bitmap's
 * safety invariant from "empty ⇒ provably no regions" to "empty ⇒
 * silently under-reject"; shared_bitmap_self_check() asserts the
 * positive-path wiring at startup so that class of bug fails loudly.
 *
 * 64 buckets is the natural cap: a single unsigned long stores the
 * whole bitmap, and SHARED_BITMAP_VA_SPAN = 1<<47 bounds the largest
 * possible region at bucket 47 anyway -- buckets 48..63 stay zero on
 * any legitimate registration.  Per-bucket uint16_t refcount keeps the
 * bit set until the LAST region in that size class drops, mirroring
 * the shared_region_refcount discipline on the address bitmap; the
 * 4352-region worst case (MAX_SHARED_ALLOCS + SHARED_REGIONS_OVERFLOW_
 * TAIL) sits comfortably under UINT16_MAX, so a pathological run that
 * crowds every region into one bucket cannot overflow the counter.
 *
 * size==0 is a no-op for the same reason shared_bitmap_mark() no-ops
 * on size==0: the registering caller treats a zero-byte region as "no
 * region" and floor(log2(0)) is undefined, so suppressing the bump
 * here keeps the bitmaps in lockstep and avoids a spurious bucket-0
 * entry that no matching untrack would ever clear.
 */
#define TRACKED_SIZE_NBUCKETS	64
static unsigned long tracked_size_bm;
static uint16_t tracked_size_bucket_count[TRACKED_SIZE_NBUCKETS];

static inline unsigned int tracked_size_bucket(unsigned long len)
{
	return 63u - (unsigned int)__builtin_clzl(len);
}

static void tracked_size_mark(unsigned long len)
{
	unsigned int b;

	if (len == 0)
		return;

	b = tracked_size_bucket(len);
	if (b >= TRACKED_SIZE_NBUCKETS) {
		outputerr("tracked_size_mark: bucket %u out of range for len 0x%lx\n",
			  b, len);
		BUG("tracked_size bucket out of range");
	}
	if (tracked_size_bucket_count[b] == UINT16_MAX) {
		outputerr("tracked_size_mark: bucket %u refcount overflow for len 0x%lx\n",
			  b, len);
		BUG("tracked_size bucket refcount overflow");
	}
	if (tracked_size_bucket_count[b]++ == 0)
		tracked_size_bm |= 1UL << b;
}

static void tracked_size_unmark(unsigned long len)
{
	unsigned int b;

	if (len == 0)
		return;

	b = tracked_size_bucket(len);
	if (b >= TRACKED_SIZE_NBUCKETS) {
		outputerr("tracked_size_unmark: bucket %u out of range for len 0x%lx\n",
			  b, len);
		BUG("tracked_size bucket out of range");
	}
	if (tracked_size_bucket_count[b] == 0) {
		outputerr("tracked_size_unmark: bucket %u refcount underflow for len 0x%lx\n",
			  b, len);
		BUG("tracked_size bucket refcount underflow");
	}
	if (--tracked_size_bucket_count[b] == 0)
		tracked_size_bm &= ~(1UL << b);
}

/*
 * Handle a registration that arrived once shared_regions[] is full.
 *
 * The previous "warn once, then silently drop the region" policy turned
 * an over-budget host into the exact failure mode this whole tracker
 * exists to prevent: range_overlaps_shared() can no longer guard an
 * untracked writable MAP_SHARED region from a fuzzed
 * munmap/mremap/madvise/mprotect, so the next call that picks an
 * unlucky address scribbles trinity's own shared state and the
 * resulting crash looks like a kernel bug.  Silent under-protection of
 * a writable shared mapping is never preferable to a loud abort.
 *
 * New policy, per call:
 *
 *   - Always emit a LOUD outputerr() naming the caller PC (resolved via
 *     pc_to_string, same idiom as log_mprotect_failure()), the offending
 *     region, and the tail occupancy.  Per-call (not cap-once): the
 *     cap-once predecessor hid how badly the cap was over budget, which
 *     is the one piece of data needed to size a real fix.
 *
 *   - Under ASAN (the developer / debug build), BUG() immediately --
 *     overflow is a tree-state bug and we want a stack trace, not a
 *     production-shaped degradation.
 *
 *   - In release, register the region in the bounded overflow tail so
 *     the bitmap stays correct (shared_bitmap_mark already covers the
 *     range) and range_in_tracked_shared() can still match precisely.
 *     Bump shm->stats.shared_region_overflow so the over-budget state
 *     is visible in the periodic stats dump.
 *
 *   - If the overflow tail itself fills, BUG() in both debug and
 *     release.  Two layers of bounded storage is enough; a third would
 *     just be a slower path to the same silent-under-protection bug.
 */
static void register_shared_overflow(const char *who, unsigned long addr,
				     unsigned long size, void *caller)
{
	char pcbuf[128];

	outputerr("shared_regions: %s overflow: region 0x%lx+0x%lx from %s; "
		  "MAX_SHARED_ALLOCS=%d exhausted, overflow tail at %u/%d -- "
		  "raise the cap or move shared_regions[] to dynamic resize\n",
		  who, addr, size,
		  pc_to_string(caller, pcbuf, sizeof(pcbuf)),
		  MAX_SHARED_ALLOCS,
		  nr_shared_regions_overflow, SHARED_REGIONS_OVERFLOW_TAIL);

#ifdef __SANITIZE_ADDRESS__
	BUG("shared_regions[] overflow (debug build)");
#else
	if (nr_shared_regions_overflow >= SHARED_REGIONS_OVERFLOW_TAIL) {
		outputerr("shared_regions: overflow tail also exhausted "
			  "(%d slots); refusing to leave region 0x%lx+0x%lx "
			  "untracked\n",
			  SHARED_REGIONS_OVERFLOW_TAIL, addr, size);
		BUG("shared_regions overflow tail exhausted");
	}

	shared_regions_overflow[nr_shared_regions_overflow].addr = addr;
	shared_regions_overflow[nr_shared_regions_overflow].size = size;
	shared_bitmap_mark(addr, size);
	tracked_size_mark(size);
	nr_shared_regions_overflow++;

	if (shm != NULL)
		__atomic_add_fetch(&shm->stats.shared_region_overflow, 1,
				   __ATOMIC_RELAXED);
#endif
}

void * alloc_shared(size_t size)
{
	void *ret;

	ret = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_ANON | MAP_SHARED, -1, 0);
	if (ret == MAP_FAILED) {
		outputerr("mmap %zu failure\n", size);
		exit(EXIT_FAILURE);
	}
	/* poison with independently-random bytes to expose uninitialized reads. */
	{
		unsigned char *p = ret;
		size_t i;

		for (i = 0; i + sizeof(unsigned int) <= size; i += sizeof(unsigned int)) {
			unsigned int r = rnd_u32();
			memcpy(p + i, &r, sizeof(r));
		}
		for (; i < size; i++)
			p[i] = (unsigned char)rnd_u32();
	}

	if (nr_shared_regions < MAX_SHARED_ALLOCS) {
		shared_regions[nr_shared_regions].addr = (unsigned long) ret;
		shared_regions[nr_shared_regions].size = size;
		shared_bitmap_mark((unsigned long) ret, size);
		tracked_size_mark(size);
		nr_shared_regions++;
	} else {
		register_shared_overflow("alloc_shared", (unsigned long) ret,
					 size, __builtin_return_address(0));
	}

	return ret;
}

/*
 * Add an externally-mmap'd region to the shared_regions tracker so the
 * range_overlaps_shared() guards in the mm-syscall sanitisers refuse
 * fuzzed munmap/mremap/madvise/mprotect calls that target it.  Used by
 * code that mmaps via something other than alloc_shared() and still
 * needs the region protected from the fuzzer -- e.g., the per-child
 * kcov ring buffer mapped from /sys/kernel/debug/kcov.
 */
void track_shared_region(unsigned long addr, unsigned long size)
{
	if (nr_shared_regions < MAX_SHARED_ALLOCS) {
		shared_regions[nr_shared_regions].addr = addr;
		shared_regions[nr_shared_regions].size = size;
		shared_bitmap_mark(addr, size);
		tracked_size_mark(size);
		nr_shared_regions++;
	} else {
		register_shared_overflow("track_shared_region", addr, size,
					 __builtin_return_address(0));
	}
}

/*
 * Inverse of track_shared_region() / alloc_shared() registration.
 * Removes the matching shared_regions[] entry (exact addr+size match)
 * and undoes the bitmap refcount/bit it contributed, so providers that
 * munmap their region on destructor (io_uring rings, kvm vCPU run
 * pages) stop accumulating stale slots and stop holding the bitmap bit
 * set after their VA has been recycled to something unrelated.
 *
 * Slot reuse uses swap-with-last compaction: the freed slot inherits
 * the array tail, nr_shared_regions decrements.  Nothing depends on
 * shared_regions[] order beyond shared_bitmap_self_check() peeking at
 * slot 0, and that runs once at init -- well before any destructor can
 * fire -- so the order disturbance is invisible to live code paths
 * (range_overlaps_shared and range_in_tracked_shared both walk the
 * whole array).
 *
 * Walks the overflow tail too: a provider whose registration was
 * parked there is no less tracked from the caller's perspective and
 * must be unregistered the same way; otherwise the tail would only
 * ever grow.
 *
 * A miss returns silently rather than BUG()ing: a caller may
 * legitimately untrack a region whose original track call was a no-op
 * (e.g. size==0), or whose addr+size pair doesn't exactly match a
 * registration (the slot allocator is exact-match only).  Silent miss
 * is the same shape as Linux's __ClearPageReserved on a non-Reserved
 * page -- the inverse of a "best effort" registration is best effort.
 */
void untrack_shared_region(unsigned long addr, unsigned long size)
{
	unsigned int i;

	for (i = 0; i < nr_shared_regions; i++) {
		if (shared_regions[i].addr != addr ||
		    shared_regions[i].size != size)
			continue;
		shared_bitmap_unmark(addr, size);
		tracked_size_unmark(size);
		shared_regions[i] = shared_regions[nr_shared_regions - 1];
		nr_shared_regions--;
		return;
	}

	for (i = 0; i < nr_shared_regions_overflow; i++) {
		if (shared_regions_overflow[i].addr != addr ||
		    shared_regions_overflow[i].size != size)
			continue;
		shared_bitmap_unmark(addr, size);
		tracked_size_unmark(size);
		shared_regions_overflow[i] =
			shared_regions_overflow[nr_shared_regions_overflow - 1];
		nr_shared_regions_overflow--;
		return;
	}
}

bool shared_size_mul(size_t a, size_t b, size_t *out)
{
	return !__builtin_mul_overflow(a, b, out);
}

/*
 * Size-bucketed freelist for shared heap recycling.
 *
 * Eight fixed-size buckets cover the common allocation sizes.  A freed slot
 * whose aligned size falls within a bucket is pushed onto that bucket's
 * lock-free stack; the next alloc of the same size pops it instead of
 * burning new bump space.  Allocations larger than 1024 bytes bypass the
 * freelist and use the bump allocator directly (documented below).
 *
 * The freelist link lives in the slot's own first sizeof(uintptr_t) bytes.
 * This is safe because the slot is not live when the link is written: the
 * caller has just handed it back to us, and we zero the rest of the slot
 * before writing the link so that a use-after-free still surfaces as zero-
 * byte reads rather than as a stale link pointer.
 *
 * CAS ordering: RELAXED is sufficient for the same reason as the bump
 * cursor — the caller publishes the resulting object via add_object()'s
 * RELEASE store, which is the actual synchronisation point for consumers.
 *
 * ABA safety via tagged pointer.  A naive lock-free stack with a single
 * pointer head is vulnerable to the classic ABA race: a popper reads
 * old_head=X and next=*X, but before its CAS another thread pops X, pops
 * X.next, and then pushes X back.  The CAS still sees head==X and succeeds,
 * but it installs the stale "next" value, leaving head pointing at a slot
 * that has already been handed back to a caller.  Two callers then think
 * they own the same slot; the resulting double-use corrupts whichever
 * obj struct was layered over the slot and faults later in unrelated
 * code paths far from the buggy free.
 *
 * The mitigation is a 16-bit version counter packed into the high bits of
 * the head word.  Each push and pop increments the version; the CAS
 * compares the full 64-bit (version, ptr) tuple.  The A→B→A sequence above
 * now leaves the head as (X, ver+2) rather than (X, ver), so the racer's
 * CAS fails on the version mismatch and it retries with a fresh load.
 *
 * The packing exploits the canonical-form invariant of x86_64 user-space
 * virtual addresses: only bits 0-47 are significant, and bit 47 is 0 for
 * any user-space pointer (kernel pointers have bit 47 == 1 and are
 * sign-extended into the upper 16 bits).  We therefore stash the version
 * counter in bits 48-63, recover the pointer with a 48-bit mask, and need
 * no sign extension on read.  The slot's stored "next" link is just the
 * raw pointer (no version bits) — the version lives only in the head.
 *
 * The 16-bit version is finite: a perfectly-timed sequence of exactly
 * 65536 push/pop pairs in the gap between a victim's load and CAS would
 * wrap the version back to its original value and re-expose the race.
 * For a process-bounded fuzzer with sub-microsecond critical sections
 * this is astronomically improbable; if it ever proves observable the
 * head can be widened to a 128-bit (ptr, version) tuple and switched to a
 * cmpxchg16b-based DWCAS without any caller change.
 */

/*
 * The packed (ptr, version) freelist head assumes the top 16 bits of every
 * freelist pointer are zero — i.e. a 48-bit canonical userspace VA range,
 * which is the x86-64 default and is not guaranteed on arm64 (52-bit
 * possible), s390x, riscv, or x86-64 with 5-level paging enabled.  Reject
 * the build explicitly so a future port hits the wall here instead of
 * shipping a subtly-broken allocator.  Removing this guard requires either
 * a DWCAS-based 128-bit head where available or a (struct ptr, generation)
 * variant guarded by a small lock — see the comment block above the head
 * declaration.
 */
#if !defined(__x86_64__)
#error "shm freelist (ptr, version) packing assumes 48-bit userspace VA — port requires DWCAS or struct+lock variant"
#endif

#define FREELIST_PTR_MASK	((uint64_t)((1ULL << 48) - 1))
#define FREELIST_VER_SHIFT	48

static const size_t bucket_sizes[NUM_SHM_FREELIST_BUCKETS] = {
	8, 16, 32, 64, 128, 256, 512, 1024
};

/*
 * Return the index of the smallest bucket that fits an allocation of
 * already-pointer-aligned size, or -1 if size exceeds all buckets.
 */
static int freelist_bucket(size_t aligned_size)
{
	unsigned int i;

	for (i = 0; i < NUM_SHM_FREELIST_BUCKETS; i++) {
		if (aligned_size <= bucket_sizes[i])
			return (int)i;
	}
	return -1;
}

/*
 * Pop a slot from the freelist bucket whose head lives at *head.
 * Returns NULL if the freelist is empty; otherwise returns a fully-zeroed
 * slot of slot_size bytes.  ABA-safe via the version tag in the high
 * bits of the head — see the block comment above.
 */
static void *freelist_pop(uint64_t *head, size_t slot_size)
{
	uint64_t old_tagged, new_tagged;
	uintptr_t ptr, next;
	uint16_t new_ver;

	old_tagged = __atomic_load_n(head, __ATOMIC_RELAXED);
	do {
		ptr = (uintptr_t)(old_tagged & FREELIST_PTR_MASK);
		if (ptr == 0)
			return NULL;
		/* The slot's first word holds the next pointer, with no
		 * version bits — versions live only in the head. */
		next = *(uintptr_t *)ptr;
		new_ver = (uint16_t)(old_tagged >> FREELIST_VER_SHIFT) + 1;
		new_tagged = ((uint64_t)next & FREELIST_PTR_MASK) |
			     ((uint64_t)new_ver << FREELIST_VER_SHIFT);
	} while (!__atomic_compare_exchange_n(head, &old_tagged, new_tagged,
					      false,
					      __ATOMIC_RELAXED,
					      __ATOMIC_RELAXED));

	memset((void *)ptr, 0, slot_size);
	return (void *)ptr;
}

/*
 * Push a slot onto the freelist bucket whose head lives at *head.
 * The entire slot (slot_size bytes) is zeroed first so that a use-after-
 * free reads as zero rather than stale data; then the freelist link is
 * written into the slot's first word before the CAS.  The version tag in
 * the head is incremented on every successful CAS to keep poppers safe
 * from ABA — see the block comment above.
 */
static void freelist_push(uint64_t *head, void *p, size_t slot_size)
{
	uint64_t old_tagged, new_tagged;
	uint16_t new_ver;

	memset(p, 0, slot_size);
	old_tagged = __atomic_load_n(head, __ATOMIC_RELAXED);
	do {
		/* Store only the pointer half of the previous head into our
		 * slot — the version stays in the head word. */
		*(uintptr_t *)p = (uintptr_t)(old_tagged & FREELIST_PTR_MASK);
		new_ver = (uint16_t)(old_tagged >> FREELIST_VER_SHIFT) + 1;
		new_tagged = ((uint64_t)(uintptr_t)p & FREELIST_PTR_MASK) |
			     ((uint64_t)new_ver << FREELIST_VER_SHIFT);
	} while (!__atomic_compare_exchange_n(head, &old_tagged, new_tagged,
					      false,
					      __ATOMIC_RELAXED,
					      __ATOMIC_RELAXED));
}

/*
 * Shared string heap — backing store for string-shaped fields
 * (filenames, label strings, fixed-size attr buffers) hung off objs
 * that live in the shared obj heap.
 *
 * Same MAP_SHARED-before-fork argument as the obj heap: any obj
 * struct that is reachable from shm->global_objects[] must point only
 * at memory that other processes can also reach, and the only way to
 * satisfy that for allocations made after fork (the regen path) is to
 * carve out of a region that was already mapped before any child
 * forked.
 *
 * Why a sibling slab instead of reusing the obj heap:
 *
 *   The obj heap is sized for ~28k struct objects at ~150 B each; if
 *   strings shared the cursor, a regen-heavy provider could starve
 *   future obj allocations and vice versa, with no way for an
 *   exhaustion message to distinguish "out of obj slots" from "out of
 *   string slots".  Splitting the cursor (and the backing region)
 *   keeps each pool's failure mode independent and self-describing.
 *   The sizing tradeoff is also different: obj structs are uniform,
 *   strings are variable-length and dominated by short labels, so the
 *   string pool can be much smaller.
 *
 * Capacity (64 KiB):
 *
 *   The string-bearing OBJ_GLOBAL providers (file/testfile, perf,
 *   memfd) hold short payloads — file paths typically under ~100 B,
 *   memfd labels under ~32 B, perf_event_attr buffers ~120 B (kernel
 *   caps the attr at PAGE_SIZE but trinity only memcpy's a struct's
 *   worth back into the obj for replay/dump).  At ~64 B average that
 *   is ~1k entries; at ~120 B per perf entry it is ~545.  Both
 *   comfortably exceed GLOBAL_OBJ_MAX_CAPACITY (1024) for the labels
 *   case and cover hundreds of regens for the perf case.  If long
 *   fuzz runs show "alloc_shared_str: heap exhausted" the cap can be
 *   raised — bump-and-leak makes growth the only reasonable answer.
 *
 * Why two entry points (alloc_shared_str + alloc_shared_strdup):
 *
 *   The eventual callers split cleanly into two shapes:
 *
 *     - strdup-style:  init_*_fds() and the .open regen hooks have
 *       a NUL-terminated source string (filename, label) and want a
 *       pointer to a stable shm copy of it.  alloc_shared_strdup()
 *       collapses the strlen+alloc+strcpy sequence at the call site.
 *
 *     - empty-buffer:  perf's open_perf_fd() has a fixed-size attr
 *       struct and just wants raw zeroed bytes to memcpy into.
 *       alloc_shared_str(sizeof(struct perf_event_attr)) hands back
 *       exactly that, with no NUL-termination contract.
 *
 *   Both share one heap and one free; the strdup variant is a thin
 *   wrapper around the primitive, not a separate allocator.
 *
 * Free strategy:
 *
 *   Mirror of free_shared_obj — poison the slot to zeros so a
 *   use-after-free surfaces as a "" / NUL-byte read rather than a
 *   live-looking string, then recycle via the size-bucketed freelist
 *   (see freelist_push/pop above).  Callers pass the original
 *   allocation size; for strdup-style strings that is strlen(p)+1 at
 *   free time, which is correct for a still-NUL-terminated string and
 *   harmless overshoot if it isn't (the buffer was zero-initialised).
 *   Slots too large for any freelist bucket are poisoned and leaked.
 */
/*
 * 1 MiB.  Originally sized at 64 KiB for the simple-init case (memfd
 * + perf eventattr + a few testfiles), but bump-and-leak loses a
 * slot for every above-bucket free, and OBJ_LOCAL churn from
 * post_*_fd callbacks plus per-child fanout exhausts 64 KiB within
 * a few hours of a sustained fuzz run.  The freelist recycler now
 * returns slots to the pool, so long-run exhaustion is no longer
 * expected; the 1 MiB ceiling remains as headroom for above-bucket
 * allocations that still bump-and-leak.
 */
#define SHARED_STR_HEAP_SIZE (1U * 1024U * 1024U)

static char *shared_str_heap;
static size_t shared_str_heap_capacity;

static void shared_str_heap_init(void)
{
	/* Same pre-fork-mapping requirement as the obj heap: the first
	 * caller must run in the parent before any child forks, which
	 * holds for all current callers (init_*_fds via open_fds()
	 * before fork_children()).  Assert it so a future child-context
	 * caller cannot silently map a private heap behind the shared
	 * cursor, which would dangle string pointers across processes. */
	if (getpid() != mainpid) {
		outputerr("alloc_shared_str: heap init from child context "
			  "(pid %d, parent %d) -- would dangle shm pointers\n",
			  getpid(), mainpid);
		abort();
	}
	shared_str_heap_capacity = SHARED_STR_HEAP_SIZE;
	shared_str_heap = alloc_shared(shared_str_heap_capacity);
}

void * alloc_shared_str(size_t size)
{
	size_t old_used, new_used;
	void *p;
	int bucket;

	if (size == 0)
		return NULL;

	if (shared_str_heap == NULL)
		shared_str_heap_init();

	/* Round up so each allocation starts pointer-aligned.  The
	 * primitive is generic — strings don't need it, but the empty-
	 * buffer callers (perf_event_attr) do, and one rule keeps the
	 * accounting simple. */
	size = (size + sizeof(void *) - 1) & ~(sizeof(void *) - 1);

	/* Try the freelist before touching the bump cursor. */
	bucket = freelist_bucket(size);
	if (bucket >= 0) {
		p = freelist_pop(&shm->shared_str_freelist[bucket],
				 bucket_sizes[bucket]);
		if (p != NULL)
			return p;
		/* Round bump to bucket size so the first free of a bump
		 * slot doesn't overrun the next slot via freelist_push's
		 * bucket-size memset.  See the matching comment in
		 * alloc_shared_obj for the full bug-class explanation. */
		size = bucket_sizes[bucket];
	}

	/* Lock-free bump via CAS on the shm-resident cursor.  RELAXED
	 * is sufficient for the same reason as alloc_shared_obj: the
	 * caller publishes the obj (and therefore the string pointer)
	 * to consumers via add_object()'s RELEASE store on
	 * num_entries. */
	old_used = __atomic_load_n(&shm->shared_str_heap_used,
				   __ATOMIC_RELAXED);
	do {
		new_used = old_used + size;
		if (new_used > shared_str_heap_capacity) {
			outputerr("alloc_shared_str: heap exhausted "
				  "(cap %zu, used %zu, req %zu)\n",
				  shared_str_heap_capacity, old_used,
				  size);
			return NULL;
		}
	} while (!__atomic_compare_exchange_n(&shm->shared_str_heap_used,
					      &old_used, new_used,
					      false,
					      __ATOMIC_RELAXED,
					      __ATOMIC_RELAXED));

	p = shared_str_heap + old_used;
	memset(p, 0, size);
	return p;
}

char * alloc_shared_strdup(const char *src)
{
	size_t len;
	char *dst;

	if (src == NULL)
		return NULL;

	len = strlen(src) + 1;
	dst = alloc_shared_str(len);
	if (dst == NULL)
		return NULL;

	memcpy(dst, src, len);
	return dst;
}

void free_shared_str(void *p, size_t size)
{
	int bucket;

	if (p == NULL || size == 0)
		return;

	size = (size + sizeof(void *) - 1) & ~(sizeof(void *) - 1);

	bucket = freelist_bucket(size);
	if (bucket >= 0) {
		freelist_push(&shm->shared_str_freelist[bucket], p,
			      bucket_sizes[bucket]);
		return;
	}

	/* Size above all buckets — poison and leak (bump-and-leak fallback). */
	memset(p, 0, size);
}

/*
 * Render a PROT_* mask as "READ|WRITE|EXEC" into the caller-provided
 * buffer.  Empties to "NONE" for prot==PROT_NONE so the diagnostic line
 * is never silently truncated to nothing between the brackets.  Unknown
 * upper bits (PROT_GROWSDOWN, pkey bits, ...) are left to the raw
 * 0x%x rendering at the call site.
 */
static void prot_to_string(int prot, char *buf, size_t buflen)
{
	int n = 0;
	int written;

	if (buf == NULL || buflen == 0)
		return;

	buf[0] = '\0';

	/* snprintf returns the would-have-written length even when truncated.
	 * Naive `n += snprintf(buf+n, buflen-n, ...)` advances n past buflen
	 * once the buffer fills, so the next call writes outside buf.  Currently
	 * safe at buflen=32 with the three short strings but identical cliff
	 * to the stats.c stack-depth histogram cumulator.  Bound n each step. */
	if ((prot & PROT_READ) && (size_t)n < buflen) {
		written = snprintf(buf + n, buflen - (size_t)n, "READ");
		if (written > 0)
			n += written;
	}
	if ((prot & PROT_WRITE) && (size_t)n < buflen) {
		written = snprintf(buf + n, buflen - (size_t)n,
				   "%sWRITE", n ? "|" : "");
		if (written > 0)
			n += written;
	}
	if ((prot & PROT_EXEC) && (size_t)n < buflen) {
		written = snprintf(buf + n, buflen - (size_t)n,
				   "%sEXEC", n ? "|" : "");
		if (written > 0)
			n += written;
	}

	if (n == 0)
		snprintf(buf, buflen, "NONE");
}

static const char *mprotect_errstr(int err)
{
	switch (err) {
	case ENOMEM:	return "ENOMEM";
	case EACCES:	return "EACCES";
	case EINVAL:	return "EINVAL";
	case EAGAIN:	return "EAGAIN";
	case EFAULT:	return "EFAULT";
	default:	return "unknown error";
	}
}

void log_mprotect_failure(void *addr, size_t len, int prot,
			  void *caller, int err)
{
	char protbuf[32];
	char pcbuf[128];

	prot_to_string(prot, protbuf, sizeof(protbuf));
	outputerr("mprotect(addr=%p, len=%zu, prot=0x%x [%s]) failed at %s: %s\n",
		  addr, len, prot, protbuf,
		  pc_to_string(caller, pcbuf, sizeof(pcbuf)),
		  mprotect_errstr(err));
}

/*
 * Self-check: confirm range_overlaps_shared()'s bitmap accelerator
 * actually rejects the first registered region.  Catches construction
 * regressions (a future refactor that forgets to call
 * shared_bitmap_mark() at registration would otherwise fail open and
 * silently let the fuzzer clobber trinity's own shared state).  Runs
 * once -- the bitmap only grows with new registrations, so a single
 * positive assert is sufficient to prove the wiring works. */
void shared_bitmap_self_check(void)
{
	static bool checked;
	unsigned long base, bit;

	if (checked || nr_shared_regions == 0)
		return;
	checked = true;

	base = shared_regions[0].addr;
	bit = base >> SHARED_BITMAP_GRANULARITY_LOG2;
	if (!shared_bitmap_test(bit)) {
		outputerr("range_overlaps_shared bitmap missing first region "
			  "@ 0x%lx (bit %lu)\n", base, bit);
		BUG("shared region bitmap inconsistent");
	}

	/*
	 * Companion size-bucket bitmap should also reflect the first
	 * registered region: any region with non-zero size lands in some
	 * bucket, so tracked_size_bm cannot be empty here.  Catches a
	 * future refactor that wires shared_bitmap_mark() but forgets the
	 * parallel tracked_size_mark() call -- silent under-protection of
	 * the size short-circuit (always-true skip on an empty bitmap)
	 * would defeat the bypass counter on every call.
	 */
	if (shared_regions[0].size != 0 && tracked_size_bm == 0) {
		outputerr("tracked_size_bm empty despite first region size 0x%lx\n",
			  shared_regions[0].size);
		BUG("tracked_size bitmap inconsistent");
	}
}

/* Tunable: how often range_overlaps_shared() emits a -v summary line.
 * Lower = noisier, higher = blunter. */
#define RANGE_OVERLAPS_SHARED_REJECT_REPORT_INTERVAL 10000

/* Sibling tunable for the size-bucket bitmap short-circuit path.
 * Same cadence as the rejects line above so the two -v summaries
 * stay in lockstep when both fire on the same child. */
#define RANGE_OVERLAPS_SHARED_BM_SKIP_REPORT_INTERVAL 10000

/*
 * Exact byte-range overlap confirmation after a bitmap hit.  The
 * bitmap rounds tracked regions up to 2 MiB chunks, so a hit means
 * "some tracked region lives in a chunk that touches the query" --
 * not that the query and the region share a byte.  Walking the two
 * region arrays here resolves the false positives in which a valid,
 * disjoint mm-syscall range shares a 2 MiB chunk with a tracked
 * allocation; the bitmap previously rejected those unconditionally,
 * losing real munmap / mremap / madvise / mprotect / mseal / mbind
 * coverage on hosts where allocations cluster.
 *
 * Semantics match the pre-bitmap byte-precise test:
 *   - a non-empty range [addr, end) overlaps [rstart, rend) iff
 *     addr < rend && rstart < end (standard interval overlap);
 *   - an empty (len == 0) range matches iff @addr is strictly inside
 *     a region (rstart <= addr < rend), which is the only empty-range
 *     case the original test accepted.
 *
 * Zero-size tracked regions never overlap anything, matching the
 * alloc_shared() / track_shared_region() callers that treat size==0
 * as "no region" (and matching shared_bitmap_mark()'s no-op on size 0).
 *
 * Returning false here only removes false-positive rejects; it never