trinity/syscall.c at master · kernelslacker/trinity · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
 * Functions for actually doing the system calls.
 */

#include <errno.h>
#include <limits.h>
#include <stdint.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include <sys/prctl.h>
#include <sys/resource.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>

#include "arch.h"
#include "arg_coupling.h"
#include "argtype-ops.h"
#include "child.h"
#include "debug.h"
#include "deferred-free.h"
#include "fd-event.h"
#include "fd.h"
#include "kcov.h"
#include "maps.h"
#include "objects.h"
#include "params.h"
#include "pids.h"
#include "pre_crash_ring.h"
#include "random.h"
#include "results.h"
#include "sanitise.h"
#include "shm.h"
#include "signals.h"
#include "stats_ring.h"
#include "syscall.h"
#include "syscall_record.h"
#include "tables.h"
#include "taint.h"
#include "trinity.h"
#include "uid.h"
#include "utils.h"

#ifdef ARCH_IS_BIARCH
/*
 * This routine does 32 bit syscalls on 64 bit kernel.
 * 32-on-32 will just use syscall() directly from do_syscall() because do32bit flag is biarch only.
 */
static long syscall32(unsigned int call,
	unsigned long a1, unsigned long a2, unsigned long a3,
	unsigned long a4, unsigned long a5, unsigned long a6)
{
	/* NOTE(review): nothing visible here assigns the raw kernel return
	 * value to __res or consumes call/a1..a6; presumably the arch-provided
	 * DO_32_SYSCALL macro does both, which is why these names must not be
	 * renamed -- confirm against the arch headers. */
	long __res = 0;

#if defined(DO_32_SYSCALL)
	/* If we have CONFIG_IA32_EMULATION unset, we will segfault.
	 * Detect this case, and force 64-bit only.
	 */
	if (__atomic_load_n(&shm->syscalls32_succeeded, __ATOMIC_RELAXED) == false) {
		/* No 32-bit call has ever returned yet.  Once the number of
		 * attempts reaches twice the child count, assume the 32-bit
		 * entry path is unusable and switch the whole table off. */
		if (__atomic_load_n(&shm->syscalls32_attempted, __ATOMIC_RELAXED) >= (max_children * 2)) {
			unsigned int i;
			bool did_disable = false;
			unsigned int snap_attempted = 0;

			lock(&shm->syscalltable_lock);

			/* check another thread didn't already do this. */
			if (shm->nr_active_32bit_syscalls != 0) {
				/* Snapshot under the lock so the message
				 * printed after unlock reports the count that
				 * triggered the disable. */
				snap_attempted = __atomic_load_n(&shm->syscalls32_attempted, __ATOMIC_RELAXED);

				for (i = 0; i < max_nr_32bit_syscalls; i++) {
					struct syscallentry *entry = syscalls_32bit[i].entry;

					if (entry == NULL)
						continue;

					if (entry->active_number != 0)
						deactivate_syscall_nolock(i, true);
				}
				/* The per-call deactivate path has already cleared the
				 * cached validity bit when nr_active hit zero; pin it
				 * here so the auto-disable point is self-evidently
				 * coherent even if the loop above ever exits early. */
				__atomic_store_n(&shm->valid_syscall_table_32, false, __ATOMIC_RELAXED);
				did_disable = true;
			}

			unlock(&shm->syscalltable_lock);

			/* Log outside the lock.  snap_attempted is unsigned,
			 * so it is printed with %u. */
			if (did_disable)
				output(0, "Tried %u 32-bit syscalls unsuccessfully. Disabling all 32-bit syscalls.\n",
						snap_attempted);
		}

		__atomic_add_fetch(&shm->syscalls32_attempted, 1, __ATOMIC_RELAXED);
	}

	DO_32_SYSCALL

	/* Translate a raw return in the top 133 values of the unsigned range
	 * (i.e. -133..-1) into the libc convention: errno set, -1 returned. */
	if ((unsigned long)(__res) >= (unsigned long)(-133)) {
		errno = -(__res);
		__res = -1;
	}

	/* Reaching this point at all means the 32-bit entry did not kill us,
	 * so record success regardless of the syscall's own result. */
	__atomic_store_n(&shm->syscalls32_succeeded, true, __ATOMIC_RELAXED);

#else
	#error Implement 32-on-64 syscall macro for this architecture.
#endif
	return __res;
}
#else
#define syscall32(a,b,c,d,e,f,g) 0
#endif /* ARCH_IS_BIARCH */

/*
 * Maybe arm /proc/self/fail-nth so the next syscall sees an allocation
 * failure on its Nth slab/page alloc.  Returns true if we wrote a value.
 *
 * We deliberately do this *here*, after all sanitise_*() and arg-generation
 * has happened, so the fault hits the kernel's path through the syscall
 * itself rather than any of trinity's setup allocations.
 *
 * Skip on the EXTRA_FORK throwaway path (state == GOING_AWAY): the
 * grandchild inherits the fd, but the file inode refers to the opener's
 * (i.e. parent child's) task — writing through it would arm fault
 * injection on the *parent*'s next syscall, not the grandchild's.
 */
static bool maybe_inject_fault(struct childdata *child, enum syscallstate state)
{
	char nth[16];
	int depth;
	int nchars;
	ssize_t written;

	/* Stay unarmed when there is no child or no fail-nth fd, when this is
	 * not the in-process BEFORE dispatch, or when the 1-in-20 roll misses.
	 * The short-circuit order matters: the random roll is only consumed
	 * once the cheaper checks have passed. */
	if (child == NULL || child->fail_nth_fd == -1 ||
	    state != BEFORE || !ONE_IN(20))
		return false;

	/* Pick which allocation (1st..8th) should fail and write it as a
	 * decimal string to the fail-nth fd. */
	depth = RAND_RANGE(1, 8);
	nchars = snprintf(nth, sizeof(nth), "%d", depth);

	/* Only a complete write counts as armed. */
	written = write(child->fail_nth_fd, nth, (size_t)nchars);
	return written == nchars;
}

/*
 * Per-fd callback handed to for_each_fd_arg() by the watchdog eviction
 * path in __do_syscall(): queue an FD_EVENT_EVICT for @fd on the child's
 * fd event ring.  @ctx is the struct childdata the caller passed in; a
 * child without a ring is silently skipped.
 */
static void child_watchdog_evict_fd(int fd, void *ctx)
{
	struct childdata *owner = ctx;

	if (owner->fd_event_ring == NULL)
		return;

	fd_event_enqueue(owner->fd_event_ring, FD_EVENT_EVICT, fd);
}

/*
 * Dispatch one syscall described by @rec / @entry and publish its result.
 *
 * In order: publish @state and the canary on @rec, re-run the blanket
 * address scrub, snapshot the six argument slots into locals, fill
 * rec->arg_shadow[] for the slots selected by entry->arg_snapshot_mask,
 * then either
 *   - synthesize a -1/EINVAL result if validate_arg_coupling() rejects,
 *   - synthesize a -1/ENOSYS result if dry_run is set, or
 *   - arm alarm(1) when the entry has NEED_ALARM, enable kcov, maybe arm
 *     fault injection, and issue the call via syscall() or syscall32()
 *     depending on rec->do32bit.
 * After a real dispatch it accounts fault-injection stats, exits if the
 * kernel became tainted, disarms the alarm, performs the watchdog fd
 * eviction when our alarm interrupted the call, and finally publishes
 * errno_post / retval with state AFTER.
 *
 * @rec:   record to dispatch from and publish into (the comments below
 *         note it lives in shared memory).
 * @entry: table entry for the syscall (flags, masks, argtypes).
 * @state: state to publish before dispatch; do_syscall() passes BEFORE,
 *         do_extrafork() passes GOING_AWAY.
 * @kc:    kcov state, NULL-checked where this function dereferences it
 *         (do_extrafork() passes NULL).
 * @child: calling child's data; NULL-checked before every dereference.
 */
static void __do_syscall(struct syscallrecord *rec, struct syscallentry *entry,
			 enum syscallstate state,
			 struct kcov_child *kc, struct childdata *child)
{
	unsigned long ret = 0;
	unsigned long a1, a2, a3, a4, a5, a6;
	bool fault_armed = false;
	int saved_errno = 0;
	int call;
	bool needalarm;

	errno = 0;

	call = rec->nr + SYSCALL_OFFSET;
	needalarm = entry->flags & NEED_ALARM;

	srec_publish_begin(rec);
	__atomic_store_n(&rec->state, state, __ATOMIC_RELAXED);
	/* Stamp the wholesale-stomp canary just before dispatch so
	 * handle_syscall_ret() can tell whether anything overwrote
	 * the rec while the kernel had control.  One store on the hot
	 * path; the matching load is paired with the AFTER snapshot
	 * read inside the post handler. */
	rec->_canary = REC_CANARY_MAGIC;
	srec_publish_end(rec);

	/* Second blanket_address_scrub() pass, post-publish_end and
	 * pre-snapshot: closes the sibling-stomp window between the
	 * sanitise-time scrub at the tail of generate_syscall_args() and
	 * the local snapshot below.  Same range-aware predicate and same
	 * address_scrub_mask (honouring SKIP_BLANKET_SCRUB) as the first
	 * pass — only the timing moves. */
	blanket_address_scrub(entry, rec);

	/* Snapshot the argument slots before dispatch.  rec lives in
	 * shared memory and a sibling child can stomp rec->aN mid-flight
	 * (the per-arg snapshot pattern in .post handlers exists for
	 * exactly this reason).  We send the snapshots to the kernel
	 * and re-read them from the locals in the watchdog eviction
	 * block below so a sibling stomp between syscall return and the
	 * eviction read cannot redirect us to a fabricated fd value. */
	a1 = rec->a1;
	a2 = rec->a2;
	a3 = rec->a3;
	a4 = rec->a4;
	a5 = rec->a5;
	a6 = rec->a6;

	/* Populate rec->arg_shadow[] from the local a1..a6 about to be
	 * passed to the kernel, so opted-in post handlers reading via
	 * get_arg_snapshot() see exactly what the kernel saw.  Captured
	 * here -- after the second blanket_address_scrub above and from
	 * the locals (immune to a sibling stomp between BEFORE and AFTER)
	 * -- rather than at the tail of generate_syscall_args(): a sibling
	 * stomp between sanitise-time and dispatch-time used to leave the
	 * shadow holding a stale pre-stomp value while the kernel saw the
	 * stomped one, and get_arg_snapshot()'s tripwire bump for the
	 * mismatch was the only signal; the post handler still consumed
	 * the stale shadow.  Capturing from the locals collapses the
	 * window: the only stomp the shadow can now miss is one that
	 * lands after dispatch began, which IS the bug class
	 * arg_shadow_stomp is meant to surface. */
	{
		uint8_t mask = entry->arg_snapshot_mask;

		rec->arg_snapshot_mask = mask;
		/* Walk the set bits lowest-first: bit i selects argument
		 * i + 1 and is stored in arg_shadow[i]. */
		while (mask != 0) {
			unsigned int i = (unsigned int)__builtin_ctz(mask);
			unsigned long val;

			switch (i + 1) {
			case 1: val = a1; break;
			case 2: val = a2; break;
			case 3: val = a3; break;
			case 4: val = a4; break;
			case 5: val = a5; break;
			case 6: val = a6; break;
			default: val = 0; break;
			}
			rec->arg_shadow[i] = val;
			/* Clear the lowest set bit. */
			mask &= (uint8_t)(mask - 1);
		}
	}

	/* Cross-arg consistency check: catch (buf_ptr, count) pairs the
	 * kernel would reject at its earliest validation step so we
	 * don't burn a syscall round-trip and a kcov enable/disable on
	 * a call that can't exercise an interesting path.  On rejection
	 * synthesize a -1/EINVAL AFTER state so handle_syscall_ret()
	 * accounts the rejection identically to a real early-EINVAL
	 * failure (no separate stats infrastructure to maintain).  Zero
	 * the kcov trace count header manually because kcov_enable_trace
	 * (which usually owns that zeroing) never runs on the skip path
	 * and the caller's kcov_collect() would otherwise re-process the
	 * previous syscall's PCs against this slot. */
	if (validate_arg_coupling(rec) != 0) {
		post_handler_corrupt_ptr_bump(rec, NULL);
		if (kc != NULL && kc->active) {
			if (kc->mode == KCOV_MODE_PC && kc->trace_buf != NULL)
				__atomic_store_n(&kc->trace_buf[0], 0,
						 __ATOMIC_RELAXED);
			else if (kc->mode == KCOV_MODE_CMP &&
				 kc->cmp_trace_buf != NULL)
				__atomic_store_n(&kc->cmp_trace_buf[0], 0,
						 __ATOMIC_RELAXED);
		}
		srec_publish_begin(rec);
		rec->errno_post = EINVAL;
		rec->retval = (unsigned long) -1L;
		rec->validator_rejected = true;
		__atomic_store_n(&rec->state, AFTER, __ATOMIC_RELEASE);
		srec_publish_end(rec);
		return;
	}

	/*
	 * --dry-run: run the full argument-generation/sanitise pipeline
	 * (already complete by the time we reach here) and the post
	 * handlers, but never execute the syscall.  Synthesize a -1/ENOSYS
	 * AFTER state so the post path accounts it as an early failure --
	 * handle_failure() runs for coverage while the success-gated
	 * registrars (handle_success, register_returned_fd, prop_ring_push)
	 * and entry->post all short-circuit on retval == -1UL, issuing no
	 * syscall of their own.  deactivate_enosys() is skipped for dry-run
	 * at its call site so the synthetic ENOSYS does not drain the
	 * syscall table.  Zero the kcov trace header manually (kcov_enable
	 * never ran on this skip path) so the caller's kcov_collect() does
	 * not re-process the previous syscall's PCs -- mirroring the
	 * validate_arg_coupling() reject above.  Lets ASAN drive the
	 * generators on any host without firing a fuzzed syscall.
	 */
	if (dry_run) {
		if (kc != NULL && kc->active) {
			if (kc->mode == KCOV_MODE_PC && kc->trace_buf != NULL)
				__atomic_store_n(&kc->trace_buf[0], 0,
						 __ATOMIC_RELAXED);
			else if (kc->mode == KCOV_MODE_CMP &&
				 kc->cmp_trace_buf != NULL)
				__atomic_store_n(&kc->cmp_trace_buf[0], 0,
						 __ATOMIC_RELAXED);
		}
		srec_publish_begin(rec);
		rec->errno_post = ENOSYS;
		rec->retval = (unsigned long) -1L;
		__atomic_store_n(&rec->state, AFTER, __ATOMIC_RELEASE);
		srec_publish_end(rec);
		return;
	}

	/* Arm the alarm after the publish-end above.  alarm(1) used to
	 * sit above the rec->lock region, opening a window where
	 * SIGALRM could fire while the lock was held; the siglongjmp
	 * in the handler would then orphan it.  The publish brackets
	 * now stand in for the lock as the ordering anchor, but the
	 * alarm-after-publish ordering is preserved on the same
	 * grounds. */
	if (needalarm)
		(void)alarm(1);

	/* Per-child mode picked once in kcov_init_child: PC-mode children
	 * enable the PC fd (per-thread or remote) and feed edge coverage,
	 * CMP-mode children enable the cmp fd and feed comparison-operand
	 * hints.  Exactly one fd is enabled per syscall because the kernel's
	 * one-`t->kcov`-per-task rule returns -EBUSY on a second simultaneous
	 * enable; the fleet-wide PC/CMP signal split comes from the
	 * population mix instead of per-call mode toggling. */
	if (rec->do32bit == false) {
		/* Native path: plain syscall(). */
		if (kc != NULL && kc->mode == KCOV_MODE_CMP) {
			kcov_enable_cmp(kc);
		} else if (kc != NULL && kc->remote_mode) {
			kcov_enable_remote(kc, child != NULL ? child->num : 0);
		} else {
			kcov_enable_trace(kc);
		}
		fault_armed = maybe_inject_fault(child, state);
		ret = syscall(call, a1, a2, a3, a4, a5, a6);
		/* Capture errno before kcov_disable() or anything else can
		 * overwrite it. */
		saved_errno = errno;
		kcov_disable(kc);
	} else {
		/* 32-bit path: same sequence, dispatched through syscall32(). */
		if (kc != NULL && kc->mode == KCOV_MODE_CMP) {
			kcov_enable_cmp(kc);
		} else if (kc != NULL && kc->remote_mode) {
			kcov_enable_remote(kc, child != NULL ? child->num : 0);
		} else {
			kcov_enable_trace(kc);
		}
		fault_armed = maybe_inject_fault(child, state);
		ret = syscall32(call, a1, a2, a3, a4, a5, a6);
		saved_errno = errno;
		kcov_disable(kc);
	}

	/* fail-nth resets to 0 in the kernel after the syscall completes.
	 * Tally whether the armed fault actually triggered (-ENOMEM) vs
	 * went unconsumed (the syscall didn't reach an allocation we hit). */
	if (fault_armed) {
		if (child != NULL) {
			stats_ring_enqueue(child->stats_ring,
					   STATS_FIELD_FAULT_INJECTED, 0, 1);
			if (ret == (unsigned long)-1L && saved_errno == ENOMEM)
				stats_ring_enqueue(child->stats_ring,
						   STATS_FIELD_FAULT_CONSUMED,
						   0, 1);
		} else {
			parent_stats.fault_injected++;
			if (ret == (unsigned long)-1L && saved_errno == ENOMEM)
				parent_stats.fault_consumed++;
		}
	}

	/* If we became tainted, get out as fast as we can. */
	if (is_tainted() == true) {
		panic(EXIT_KERNEL_TAINTED);
		_exit(EXIT_KERNEL_TAINTED);
	}

	if (needalarm)
		(void)alarm(0);

	/* In-child watchdog eviction window.  The 1s alarm above bounds
	 * how long the kernel can hold us inside a single syscall; on
	 * fire it interrupts the syscall with EINTR and the handler in
	 * signals.c sets sigalrm_pending.  We do the fd-eviction work
	 * HERE -- after the syscall has returned and alarm(0) has
	 * disarmed, but BEFORE the lock region below publishes state =
	 * AFTER -- rather than from the signal handler (async-signal-
	 * unsafe to walk fd_event_ring there) or from the child main
	 * loop's sigalrm_pending branch (which the BEFORE -> AFTER
	 * transition would otherwise race past, leaving the eviction
	 * unreachable).  The conjunction below is the conservative
	 * "our watchdog actually fired on a blocking syscall" predicate:
	 * sigalrm_pending alone can be set by any fuzzed SIGALRM source,
	 * but the combination of our own alarm being armed, the syscall
	 * returning EINTR, and the child running a normal syscall op is
	 * specific to the watchdog path. */
	if (needalarm && sigalrm_pending &&
	    ret == (unsigned long)-1L && saved_errno == EINTR &&
	    child != NULL && child->op_type == CHILD_OP_SYSCALL) {
		/* Gate the bookkeeping on "the syscall has fd-bearing arg
		 * slots", matching the slot-set for_each_fd_arg() will walk
		 * (fd_arg_mask plus the ARG_SOCKETINFO-in-slot-0 mirror).
		 * Bump stats and reset fd_lifetime once per stuck-syscall
		 * event, regardless of how many of those args' raw values
		 * actually pass the rlimit check inside the walk. */
		uint8_t gate = entry->fd_arg_mask;
		if (entry->argtype[0] == ARG_SOCKETINFO)
			gate |= 0x01;

		if (gate != 0) {
			/* Built from the pre-dispatch locals, not rec->aN. */
			unsigned long args[6] = { a1, a2, a3, a4, a5, a6 };

			child->fd_lifetime = 0;

			stats_ring_enqueue(child->stats_ring,
					   STATS_FIELD_WATCHDOG_FD_EVICT,
					   0, 1);

			for_each_fd_arg(entry, args,
					child_watchdog_evict_fd, child);
		}

		/* Eviction handled here; clear the pending flag so the child
		 * main loop's sigalrm_pending branch sees a no-op for this
		 * SIGALRM.  The housekeeping there (alarm(0) and the same
		 * pending clear) still covers other op_type paths and races
		 * where the flag is set outside this dispatch window. */
		sigalrm_pending = 0;
	}

	/* Publish the real result; the release store on state orders the
	 * errno_post/retval writes before AFTER becomes visible. */
	srec_publish_begin(rec);
	rec->errno_post = saved_errno;
	rec->retval = ret;
	__atomic_store_n(&rec->state, AFTER, __ATOMIC_RELEASE);
	srec_publish_end(rec);
}

/* This is a special case for things like execve, which would replace our
 * child process with something unknown to us. We use a 'throwaway' process
 * to do the execve in, and let it run for a max of a second before we kill it
 */
/*
 * Run an EXTRA_FORK syscall in a throwaway grandchild.
 *
 * Forks, lets the grandchild dispatch the call via __do_syscall() with
 * state GOING_AWAY, then polls for up to ~1 second for it to terminate.
 * A grandchild that has not terminated by then (including one that merely
 * stopped) is SIGKILLed and reaped.  On every path where the grandchild
 * did not publish state AFTER, entry->post is invoked here so the
 * parent-side post_state is released.
 *
 * @rec:   record for the call; shared with the grandchild.
 * @entry: table entry for the call.
 * @child: calling child's data, passed through to __do_syscall().
 */
static void do_extrafork(struct syscallrecord *rec, struct syscallentry *entry,
			 struct childdata *child)
{
	bool reaped = false;
	pid_t extrapid;

#ifdef __SANITIZE_ADDRESS__
	/* ASAN's __asan_handle_no_return runs at the fork/exec boundary
	 * and trips a CHECK in PoisonShadow when called from this path
	 * (PlatformUnpoisonStacks receives bogus stack bounds, aborts
	 * with "AddrIsAlignedByGranularity != 0").  Downstream EAGAIN
	 * mmap failures in the grandchild's ASAN allocator follow from
	 * the same CLONE_VM-shared-address-space state.  Skip the extra
	 * fork on sanitizer builds; the regular fuzz path stays.
	 *
	 * Nothing is dispatched on this path, so this call never moves
	 * rec->state to AFTER.  Apply the same gate as the timeout path at
	 * the bottom of this function and run entry->post ourselves,
	 * otherwise the sanitise-time post_state would leak on every
	 * skipped call. */
	(void)child;
	if (__atomic_load_n(&rec->state, __ATOMIC_RELAXED) != AFTER &&
	    entry->post != NULL)
		entry->post(rec);
	return;
#endif

	extrapid = fork();
	if (extrapid == 0) {
		/* grand-child */
		char childname[]="trinity-subchild";
		prctl(PR_SET_NAME, (unsigned long) &childname);

		/*
		 * Flag ourselves so child_fault_handler() skips the fault
		 * beacon stamp on a grand-child crash.  this_child() in the
		 * grand-child returns the parent worker's childdata (cached
		 * via COW-inherited cached_pid that no one updated across
		 * this fork), so without the gate a SIGSEGV here would mis-
		 * attribute the death to the parent worker and retire it.
		 * Set before __do_syscall so any synchronous fault inside
		 * the throwaway syscall is covered.
		 */
		in_extrafork_grandchild = 1;

		__do_syscall(rec, entry, GOING_AWAY, NULL, child);
		/* if this was for eg. a successful execve, we should never get here.
		 * if it failed though... */
		_exit(EXIT_SUCCESS);
	}

	/* misc failure. */
	if (extrapid == -1) {
		/* Parent already allocated snap in sanitise; post handler will
		 * not run because state never reaches AFTER. Free snap here. */
		if (entry->post != NULL)
			entry->post(rec);
		return;
	}

	/* small pause to let grandchild do some work. */
	if (pid_alive(extrapid) == true)
		usleep(100);

	/* Bound the loop to ~1 second (1000 * 1ms) so a D-state
	 * grandchild can't stall us forever.
	 */
	for (int i = 0; i < 1000; i++) {
		int childstatus;
		pid_t pid;

		pid = waitpid(extrapid, &childstatus, WUNTRACED | WCONTINUED | WNOHANG);

		/* WUNTRACED/WCONTINUED also report stop and continue events.
		 * Only an exit or a fatal signal means the grandchild is gone
		 * and reaped; a stopped grandchild is still alive and must
		 * fall through to the SIGKILL below once the budget expires. */
		if (pid == extrapid &&
		    (WIFEXITED(childstatus) || WIFSIGNALED(childstatus))) {
			reaped = true;
			break;
		}

		/* EINTR is transient, keep retrying within the budget; any
		 * other waitpid error ends the polling early. */
		if (pid < 0 && errno != EINTR)
			break;

		usleep(1000);
	}

	/* Timed out, still stopped, or waitpid errored. Force-kill and reap
	 * to prevent zombies. */
	if (!reaped) {
		kill(extrapid, SIGKILL);
		waitpid(extrapid, NULL, 0);
	}

	/* Grandchild died before reaching __do_syscall's AFTER block, so
	 * handle_syscall_ret will skip entry->post (state != AFTER gate).
	 * The parent-side allocations referenced by rec->post_state would
	 * otherwise leak onto this worker's heap on every grandchild
	 * timeout (~254 KiB worst case for execve / execveat). Invoke
	 * entry->post here so it frees post_state.
	 *
	 * Safe because the only EXTRA_FORK syscalls with a post handler
	 * today are execve and execveat, both of which inspect
	 * rec->post_state exclusively (no dependency on rec->retval /
	 * errno_post / state). Any future EXTRA_FORK syscall whose post
	 * handler reads those fields must gate them on state == AFTER
	 * itself.
	 *
	 * No lock: grandchild was SIGKILL'd and reaped, no contender. */
	if (__atomic_load_n(&rec->state, __ATOMIC_RELAXED) != AFTER &&
	    entry->post != NULL)
		entry->post(rec);
}


/*
 * Generic .post helper: close the file descriptor the syscall returned.
 * The return value is read from rec->retval once; anything negative or
 * at/above 1 << 20 is ignored rather than passed to close().
 */
void generic_post_close_fd(struct syscallrecord *rec)
{
	const long fd_ceiling = 1L << 20;
	long returned = (long)rec->retval;

	if (returned < 0 || returned >= fd_ceiling)
		return;

	close((int)returned);
}

/*
 * Blanket retval bound for RET_FD handlers at the do_syscall layer.
 * Complements the add_object()-side check: that gate fires only on
 * RET_FD entries that declare a ret_objtype and reach the universal
 * pool-registration chokepoint.  Roughly 19 RET_FD entries instead
 * carry bespoke .post handlers that consume the returned fd without
 * ever calling add_object() -- the generic_post_close_fd users
 * (signalfd, signalfd4, fsmount, open_tree, open_tree_attr,
 * memfd_secret, pidfd_getfd), perf_event_open's close-on-fail path,
 * futex(FUTEX_FD) (which has no retval check at all), and a handful
 * of others.  Without a chokepoint at this layer a wholesale-stomped
 * or upper-bit-corrupt rec->retval whose lower bits happen to be
 * positive slips past the "(long)retval >= 0" gates these handlers
 * use and is fed straight back to the kernel as a real fd by close()
 * (or worse, lands on a file-table entry an unrelated path opened).
 *
 * 1<<20 = 1048576 matches the kernel's NR_OPEN ceiling
 * (include/uapi/linux/fs.h), the absolute upper bound RLIMIT_NOFILE
 * may be raised to on every distro we exercise.  No legitimate RET_FD
 * handler treats an out-of-range value as anything but a kernel ABI
 * violation, so the validator firing IS the bug report.
 *
 * Read rec->rettype rather than entry->rettype: fcntl(F_DUPFD /
 * F_DUPFD_CLOEXEC) and futex(FUTEX_FD) only set RET_FD on the rec at
 * sanitise time; their syscallentries advertise something else.
 *
 * On rejection, coerce rec->retval = -1UL and rec->errno_post =
 * EINVAL.  Every existing .post handler short-circuits on
 * (long)retval < 0, register_returned_fd() likewise skips the < 0
 * branch, so the coerced shape papers over the corruption for all
 * downstream consumers in one place.  Sub-attribution by syscall
 * routes through post_handler_corrupt_ptr_bump's per-handler ring
 * via the rec it's passed; the _dispatch wrapper additionally feeds
 * this site's caller PC into the per-PC ring so the dump can tell
 * blanket-validator rejections of a syscall apart from that same
 * syscall's own .post handler rejections.
 */
static bool reject_corrupt_retfd(const struct syscallentry *entry,
				 struct syscallrecord *rec)
{
	long fdval;
	bool plausible;

	/* Only RET_FD records are policed, and -1UL is the legitimate
	 * failure value that the handle_failure path deals with. */
	if (rec->rettype != RET_FD || rec->retval == -1UL)
		return false;

	/* A believable fd lies in [0, 1 << 20). */
	fdval = (long)rec->retval;
	plausible = (fdval >= 0) && (fdval < (1L << 20));
	if (plausible)
		return false;

	/* Out of range: report it, bump the corruption counter, then
	 * rewrite the record into the ordinary -1/EINVAL failure shape. */
	outputerr("retfd: rejecting out-of-bound retval=0x%lx for %s\n",
		  rec->retval, entry->name);
	post_handler_corrupt_ptr_bump_retfd(rec);
	rec->retval = -1UL;
	rec->errno_post = EINVAL;
	return true;
}

/*
 * Blanket count-bound validator for syscalls whose retval semantics are
 * exactly "bytes/items processed in [0, aN] || -1", driven by the
 * .bound_arg annotation on syscallentry.  Single dispatcher chokepoint
 * means we don't have to sprinkle the same per-syscall .post bound check
 * across every read/write/recv/send-class handler individually -- one
 * gate covers the entire helper-eligible set, and adding a new entry to
 * the set is a one-line .bound_arg = N annotation.
 *
 * Read the count from rec->aN at validator entry rather than from a
 * post_state snapshot: the validator runs before entry->post, so the
 * snap-stash pattern that defends per-syscall post handlers against
 * sibling-stomps of rec->aN is not yet in scope.  Per-syscall .post
 * handlers that already keep a snap-bounded copy (write/listmount/
 * readlink/getcwd etc.) remain in place as a defense-in-depth second
 * layer; this helper catches the symmetric set that has no .post today
 * (read/pread64/recv/sendto/...) for the same logical bug class.
 *
 * Informational only -- do NOT coerce rec->retval.  Unlike the RET_FD
 * blanket validator, an over-large count-bound retval does not seed a
 * downstream wild-write hazard: nobody passes the retval back to the
 * kernel as a buffer length or fd.  The cost of a mis-coerced retval
 * (silently dropping a legitimate large read on a machine whose ulimit
 * raises the bound past the helper's expectation) outweighs the value
 * for a Phase 2 detector.  Coercion is reserved for a follow-up phase
 * once the helper has accumulated quiet-week telemetry.
 *
 * Skip rec->retval == -1UL: failure is the legitimate error path and
 * carries no count semantics.
 */
static void enforce_count_bound(const struct syscallentry *entry,
				struct syscallrecord *rec)
{
	const int slot = entry->bound_arg;
	const char *label;
	unsigned long limit;
	unsigned long got;

	/* bound_arg of 0 means the entry carries no annotation, and any
	 * other value outside 1..6 does not name an argument slot. */
	if (slot < 1 || slot > 6)
		return;

	/* -1UL is the failure path and carries no count semantics. */
	if (rec->retval == -1UL)
		return;

	/* Fetch the bound through get_arg_snapshot(): a slot that opted into
	 * the arg_shadow mask is then compared against the dispatch-time
	 * value rather than a possibly-stomped live rec->aN, while slots
	 * that did not opt in fall back to the live value. */
	limit = get_arg_snapshot(rec, (unsigned int) slot);

	got = rec->retval;
	if (got <= limit)
		return;

	/* Prefer the argument's declared name in the message. */
	label = entry->argname[slot - 1];
	if (label == NULL)
		label = "count";

	outputerr("count-bound: %s retval=%lu exceeds %s=%lu\n",
		  entry->name, got, label, limit);
	post_handler_corrupt_ptr_bump(rec, NULL);
}

/*
 * Table-driven generic return-bound validator.  Complementary to the
 * bespoke rettype gates above (rzs_blanket_reject, reject_corrupt_retfd,
 * enforce_count_bound), this catches the residual RET_* classes whose
 * value range is well-defined by kernel ABI but had no dispatcher-level
 * check.  Entries left .active = false (RET_FD, RET_ADDRESS, the
 * unlisted indices) are skipped: RET_FD is already coerced to -1UL by
 * reject_corrupt_retfd before this runs, so an entry here would be dead;
 * RET_ADDRESS spans the full address space and has no useful generic
 * bound.  RET_ZERO_SUCCESS IS included even though rzs_blanket_reject
 * already bumps a stat counter for it -- the counter is silent, and
 * adding the entry surfaces the per-syscall offender at -v.
 *
 * Informational only -- does not coerce rec->retval.  Skips the universal
 * -1UL error path and any rettype outside [RET_ZERO_SUCCESS, RET_LAST].
 * Logged via output(1, ...) so it stays quiet at the default verbosity
 * and only fires for an operator running with -v.
 */
/* Inclusive [min, max] range accepted for one RET_* class.  Slots of
 * ret_bounds[] that are not listed below are zero-initialised, so their
 * .active is false and validate_ret_bound() skips them. */
struct ret_bound {
	long min;
	long max;
	bool active;
};

/* Indexed by rettype.
 * NOTE(review): the uid/gid ceiling of INT32_MAX is narrower than an
 * unsigned 32-bit id space -- confirm that is intended. */
static const struct ret_bound ret_bounds[RET_LAST + 1] = {
	[RET_ZERO_SUCCESS] = { .min = 0, .max = 0,         .active = true },
	[RET_KEY_SERIAL_T] = { .min = 1, .max = INT32_MAX, .active = true },
	[RET_PID_T]        = { .min = 0, .max = 4194304,   .active = true },  /* PID_MAX_LIMIT */
	[RET_PATH]         = { .min = 0, .max = PATH_MAX,  .active = true },
	[RET_NUM_BYTES]    = { .min = 0, .max = LONG_MAX,  .active = true },  /* ssize_t domain */
	[RET_GID_T]        = { .min = 0, .max = INT32_MAX, .active = true },
	[RET_UID_T]        = { .min = 0, .max = INT32_MAX, .active = true },
};

/*
 * Log (at verbosity 1) when rec->retval falls outside the range that
 * ret_bounds[] lists for rec->rettype.  Purely informational: the record
 * is never modified.
 */
static void validate_ret_bound(const struct syscallentry *entry,
			       struct syscallrecord *rec)
{
	const int kind = rec->rettype;
	const struct ret_bound *range;
	long val;

	/* Reject indices that would fall outside the table before using
	 * them. */
	if (kind <= RET_NONE || kind > RET_LAST)
		return;

	/* Inactive table slots and the universal -1UL error value are never
	 * checked. */
	range = &ret_bounds[kind];
	if (!range->active || rec->retval == -1UL)
		return;

	val = (long) rec->retval;
	if (val >= range->min && val <= range->max)
		return;

	output(1, "ret-bound: %s rettype=%d retval=%ld outside [%ld, %ld]\n",
	       entry->name, kind, val, range->min, range->max);
}

/*
 * Generic post-hook: register the fd returned by an annotated syscall
 * into its typed OBJ_LOCAL pool.  Runs after entry->post so a
 * syscall-specific handler that already registered the fd (and possibly
 * stored extra metadata like socket triplet, eventfd count, etc.)
 * stays authoritative; we only fill in what nobody else tracked.
 */
static void register_returned_fd(const struct syscallentry *entry,
				 struct syscallrecord *rec)
{
	enum objecttype type = entry->ret_objtype;
	struct object *obj;
	long ret;
	int fd;

	if (type == OBJ_NONE)
		return;

	/* Take a single snapshot of the return value and use it for every
	 * check and conversion below.  rec can be overwritten by a sibling
	 * while we run (the reason __do_syscall snapshots its arguments), so
	 * re-reading rec->retval after the sign check could hand a value to
	 * the registrars that was never validated. */
	ret = (long) __atomic_load_n(&rec->retval, __ATOMIC_RELAXED);
	if (ret < 0)
		return;

	/* Non-fd object kinds (e.g. OBJ_KEY_SERIAL) hand off to a
	 * type-specific registrar — the fd-keyed logic below assumes
	 * an OBJ_FD_* layout (set_object_fd / find_local_object_by_fd
	 * walk fd union members) and would be a no-op otherwise. */
	if (type == OBJ_KEY_SERIAL) {
		if (ret <= 0 || ret > INT32_MAX)
			return;
		register_key_serial((int32_t) ret);
		return;
	}

	if (type == OBJ_PID) {
		/* fork/vfork/clone parent-side success: a child pid in
		 * [1, PID_MAX_LIMIT=4194304].  Reject 0 (clone child branch
		 * already rerouted by the per-syscall .post handler that
		 * _exit's before reaching here, but defence-in-depth) and
		 * anything past the kernel's pid_max ceiling -- the latter
		 * is the corrupted-retval shape the per-syscall .post oracles
		 * already log via post_handler_corrupt_ptr_bump. */
		if (ret <= 0 || ret > 4194304)
			return;
		register_returned_pid((pid_t) ret);
		return;
	}

	/* Refuse values at or above 1 << 20, the fd ceiling used elsewhere
	 * in this file.  Without this bound the int conversion below would
	 * silently truncate an upper-bit-corrupt value into a small, real
	 * looking fd and register a descriptor the syscall never returned. */
	if (ret >= (1L << 20))
		return;

	fd = (int) ret;
	if (fd <= 2) {
		/* stdin/stdout/stderr are never pooled; just count the skip. */
		__atomic_add_fetch(&shm->stats.fd_runtime_skipped_stdio, 1,
				   __ATOMIC_RELAXED);
		return;
	}

	/* A syscall-specific post handler may already have registered this
	 * fd; leave that registration in place. */
	if (find_local_object_by_fd(type, fd) != NULL) {
		__atomic_add_fetch(
			&shm->stats.fd_runtime_skipped_already_registered, 1,
			__ATOMIC_RELAXED);
		return;
	}

	obj = alloc_object();
	set_object_fd(obj, type, fd);
	add_object(obj, OBJ_LOCAL, type);

	__atomic_add_fetch(&shm->stats.fd_runtime_registered, 1,
			   __ATOMIC_RELAXED);
}

/*
 * Execute one syscall on behalf of this child, either directly or via
 * the extra-fork path when the entry is flagged EXTRA_FORK, then stamp
 * the record with the child's iteration-start timestamp.
 */
void do_syscall(struct syscallrecord *rec, struct syscallentry *entry,
		struct kcov_child *kc, struct childdata *child)
{
	/* Raise the gate consulted by child_fault_handler (signals.c):
	 * while it is set, a SIGSEGV/SIGBUS/SIGILL/SIGABRT that this
	 * child sent to itself (SI_USER/SI_TKILL/SI_QUEUE, i.e. the
	 * fuzzed call was kill/tkill/tgkill/rt_sigqueueinfo/
	 * pidfd_send_signal aimed at our own pid) counts as fuzzer
	 * noise and the child exits quietly rather than writing a bug
	 * log into /tmp/. */
	in_do_syscall = 1;

	if (!(entry->flags & EXTRA_FORK)) {
		/* Usual path: issue the syscall from this process. */
		__do_syscall(rec, entry, BEFORE, kc, child);
	} else {
		do_extrafork(rec, entry, child);
	}

	in_do_syscall = 0;

	/* No fresh clock_gettime() here: child->tp was sampled at the top
	 * of random_syscall_step() and is simply copied.  The readers of
	 * rec->tp (post-mortem taint timestamp ordering, pre_crash_ring
	 * entry timestamps) need only second-level granularity for crash
	 * attribution, so a second clock read per syscall would be wasted
	 * work on the hot path. */
	rec->tp = child->tp;
}

/*
 * If the syscall doesn't exist don't bother calling it next time.
 * Some syscalls return ENOSYS depending on their arguments, we mark
 * those as IGNORE_ENOSYS and keep calling them.
 */
static void deactivate_enosys(struct syscallrecord *rec, struct syscallentry *entry, unsigned int call)
{
	const char *suffix;
	bool deactivated;

	/* Entries flagged IGNORE_ENOSYS use ENOSYS as an ordinary
	 * argument-dependent error (futex is one such case), so they
	 * are never switched off. */
	if ((entry->flags & IGNORE_ENOSYS) != 0)
		return;

	lock(&shm->syscalltable_lock);

	/* A zero active_number means somebody else got here first. */
	deactivated = (entry->active_number != 0);
	if (deactivated)
		deactivate_syscall_nolock(call, rec->do32bit);

	unlock(&shm->syscalltable_lock);

	/* Only the caller that actually flipped the entry reports it,
	 * and the report is made after the table lock is dropped. */
	if (!deactivated)
		return;

	if (rec->do32bit == true)
		suffix = ":[32BIT]";
	else
		suffix = "";

	output(0, "%s (%d%s) returned ENOSYS, marking as inactive.\n",
		entry->name,
		call + SYSCALL_OFFSET,
		suffix);
}

/*
 * Rate-limited (at most once per second per child) WARNING for canary
 * mismatches.  A wholesale stomp from a sibling syscall can land on
 * many recs in quick succession; without throttling the log floods.
 * Per-process static is fine — one storm from one child is interesting,
 * the second sample from the same child within a second adds nothing.
 */
static void canary_stomp_warn_ratelimited(const struct syscallentry *entry,
					  uint64_t observed)
{
	/* Monotonic time of the most recent warning this process emitted;
	 * function-local static, so it persists across calls. */
	static struct timespec prev_warn;
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);

	/* Emit only when the seconds field differs from the one recorded
	 * at the previous warning; otherwise stay silent. */
	if (ts.tv_sec != prev_warn.tv_sec) {
		prev_warn = ts;

		outputerr("WARNING: rec canary stomped during %s: observed=0x%lx (expected 0x%lx) -- syscallrecord wholesale-clobbered between BEFORE and AFTER\n",
			  entry->name, (unsigned long) observed,
			  (unsigned long) REC_CANARY_MAGIC);
	}
}

/*
 * Rate-limited (at most once per second per child) WARNING for stale
 * arena-pointer detections.  A single munmap storm from one sibling can
 * fire the probe on many syscalls in quick succession; mirror the
 * canary_stomp_warn_ratelimited cadence so the log stays useful rather
 * than flooding.  Per-process static; the headline counter still
 * accumulates every detection.
 */
static void arena_stale_warn_ratelimited(const struct syscallentry *entry,
					 const char *site, unsigned long v)
{
	/* Monotonic time of the most recent stale-pointer warning from
	 * this process; function-local static, so it persists across
	 * calls. */
	static struct timespec prev_warn;
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);

	/* Emit only when the seconds field differs from the one recorded
	 * at the previous warning; otherwise stay silent. */
	if (ts.tv_sec != prev_warn.tv_sec) {
		prev_warn = ts;

		outputerr("WARNING: arena_ptr_stale caught during %s [%s]: v=0x%lx (page-aligned, in arena band, no live tracker)\n",
			  entry->name, site, v);
	}
}

/*
 * Classify v as live (tracked) / stale (page-aligned arena-band shape
 * with no live tracker) / unknown (out of scope for this probe).
 * Telemetry-only -- callers bump a counter on STALE but do not coerce
 * the slot or skip the post handler.
 *
 * Ordering rationale (matches spec §3):
 *   1. is_corrupt_ptr_shape() -> UNKNOWN: defer to the existing shape
 *      gate; double-firing would double-count the structurally-broken
 *      class under both counters.
 *   2. range_in_tracked_shared() -> LIVE: linear walk of
 *      shared_regions[] + overflow, no LRU window.
 *   3. addr_in_local_runtime_map() -> LIVE: walk of OBJ_LOCAL
 *      OBJ_MMAP_{ANON,FILE,TESTFILE} pools, no LRU window.
 *   4. page-aligned AND inside the literal arena band -> STALE.
 *   5. Anything else -> UNKNOWN (a runtime CHILD_ANON above the band
 *      lands here; out of scope for the literal-band Phase 1).
 */
/* Verdict produced by arena_ptr_liveness() for a single pointer value. */
enum arena_ptr_status {
	/* range_in_tracked_shared() or addr_in_local_runtime_map()
	 * accepted the range. */
	ARENA_PTR_LIVE,
	/* Not accepted by either of those checks, yet page-aligned and
	 * inside the arena band (is_in_arena_band()). */
	ARENA_PTR_STALE,
	/* Either is_corrupt_ptr_shape() matched, or none of the other
	 * checks did; this probe has no opinion on the value. */
	ARENA_PTR_UNKNOWN,
};

static enum arena_ptr_status arena_ptr_liveness(unsigned long v, size_t need)
{
	/* Low bits that must all be clear for v to be page-aligned. */
	const unsigned long page_mask = (unsigned long) page_size - 1;

	/* Structurally-broken values are left to the shape gate. */
	if (is_corrupt_ptr_shape((const void *) v))
		return ARENA_PTR_UNKNOWN;

	/* Either tracker vouching for [v, v + need) is enough; the
	 * shared-region check is consulted first, the local map second. */
	if (range_in_tracked_shared(v, need) ||
	    addr_in_local_runtime_map(v, need))
		return ARENA_PTR_LIVE;

	/* Untracked from here on.  Only a page-aligned value that also
	 * sits inside the arena band is reported as stale. */
	if ((v & page_mask) != 0)
		return ARENA_PTR_UNKNOWN;

	return is_in_arena_band(v) ? ARENA_PTR_STALE : ARENA_PTR_UNKNOWN;
}

/*
 * Dispatcher-level liveness probe.  Walks the ARG_ADDRESS /
 * ARG_NON_NULL_ADDRESS slots and the rec->post_state tail looking for
 * page-aligned arena-band pointers that no live tracker owns -- the
 * structural shape of the bug 1279961 SEGV at handle_syscall_ret+0x24a
 * (si_addr=0x4037e000) which is_corrupt_ptr_shape() by design admits.
 *
 * Telemetry-only.  Runs AFTER the kernel has returned, so the kernel
 * has already observed whatever value sat in the slot; coercing it now
 * cannot influence the syscall and would itself be a post-dispatch
 * scribble of the shared rec -- exactly the class of bug the wider
 * arg_shadow / canary machinery is meant to surface.  On detection we
 * bump the headline counter, rate-limited warn, and return; downstream
 * consumers must take their own EXPLICIT skip path on a stale slot
 * rather than rely on this probe to coerce.
 */
static void arena_liveness_probe(struct syscallentry *entry,
				 struct syscallrecord *rec)
{
	size_t need = (size_t) page_size;
	unsigned int i;

	for_each_arg(entry, i) {
		enum argtype t = entry->argtype[i - 1];
		unsigned long slot;

		if (t != ARG_ADDRESS && t != ARG_NON_NULL_ADDRESS)
			continue;

		switch (i) {
		case 1: slot = rec->a1; break;
		case 2: slot = rec->a2; break;
		case 3: slot = rec->a3; break;
		case 4: slot = rec->a4; break;
		case 5: slot = rec->a5; break;
		case 6: slot = rec->a6; break;
		default: continue;
		}

		if (arena_ptr_liveness(slot, need) != ARENA_PTR_STALE)
			continue;

		__atomic_add_fetch(&shm->stats.arena_ptr_stale_caught_arg,
				   1, __ATOMIC_RELAXED);
		arena_stale_warn_ratelimited(entry, "arg", slot);
	}

	if (rec->post_state != 0 &&
	    arena_ptr_liveness(rec->post_state, need) == ARENA_PTR_STALE) {
		__atomic_add_fetch(&shm->stats.arena_ptr_stale_caught_post_state,
				   1, __ATOMIC_RELAXED);
		arena_stale_warn_ratelimited(entry, "post_state",
					     rec->post_state);
	}
}

void handle_syscall_ret(struct syscallrecord *rec, struct syscallentry *entry)
{
	unsigned int call = rec->nr;
	bool retfd_rejected;
	bool rzs_rejected = false;

	/* Wholesale-stomp check: if anything overwrote the rec while the
	 * kernel had control, the canary won't match.  Catches the rarer
	 * class the per-arg snapshot pattern can't shadow (bookkeeping
	 * fields, the whole struct alias-clobbered by a sibling
	 * value-result write).  Informational — the call has already
	 * returned and downstream guards (post_handler_corrupt_ptr, the