From c5427dbdaf8341202e61924a1ca0b3c8bb13cc6a Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Tue, 23 Jul 2024 20:31:38 -0400 Subject: [PATCH 1/3] caprevoke: Make the skip_fault optimization smarter If, when scanning, we reach a VA that is unmapped and corresponds to a non-resident page, and the VM object is swap-backed and has no swap blocks assigned, we skip to the next resident page. Extend this optimization to the case where the object has swap blocks assigned: the in-memory tracking structure for a swap block includes a bitmap of all the capability tags, so when skipping we can search for the closer of - the next resident page - the next page backed by a swap block that has at least one capability tag. --- sys/vm/swap_pager.c | 49 ++++++++++++++++++++++++++++++++++++++++ sys/vm/swap_pager.h | 6 +++++ sys/vm/vm_cheri_revoke.c | 26 ++++++++++----------- 3 files changed, 67 insertions(+), 14 deletions(-) diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c index 60308ba6d7ca..3127d6c55bd4 100644 --- a/sys/vm/swap_pager.c +++ b/sys/vm/swap_pager.c @@ -1101,6 +1101,16 @@ swp_pager_cheri_xfer_tags(vm_object_t dstobject, vm_pindex_t pindex, modpi = pindex % SWAP_META_PAGES; memcpy(&dstsb->t[modpi], &sb->t[srcmodpi], sizeof(sb->t[srcmodpi])); } + +static bool +swp_pager_cheri_has_tags(struct swblk *sb, int p) +{ + for (int i = 0; i < TAG_WORDS_PER_PAGE; i++) { + if (sb->t[p][i] != 0) + return (true); + } + return (false); +} #endif static bool @@ -3341,6 +3351,45 @@ swap_pager_release_writecount(vm_object_t object, vm_offset_t start, object->un_pager.swp.writemappings -= (vm_ooffset_t)end - start; VM_OBJECT_WUNLOCK(object); } + +#ifdef CHERI_CAPREVOKE +/* + * Return the smallest pindex greater than or equal to "pindex" for which at + * least one of the following applies: + * 1) there is a resident page + * 2) there is a swap block with at least one capability tag. + * During a revocation scan, there is no need to page in until that point. + */ +vm_pindex_t +swap_pager_cheri_revoke_next(vm_object_t object, vm_pindex_t pindex) +{ + vm_page_t m_next; + struct swblk *sb; + vm_pindex_t respindex, swpindex; + + VM_OBJECT_ASSERT_LOCKED(object); + + m_next = vm_page_find_least(object, pindex); + respindex = m_next != NULL ? m_next->pindex : object->size; + + for (swpindex = rounddown2(pindex, SWAP_META_PAGES);; + swpindex = sb->p + SWAP_META_PAGES) { + sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks, + swpindex); + if (sb == NULL || sb->p >= respindex) + break; + for (int i = 0; i < SWAP_META_PAGES; i++) { + if (sb->p + i >= pindex && sb->p + i < respindex && + sb->d[i] != SWAPBLK_NONE && + swp_pager_cheri_has_tags(sb, i)) + return (sb->p + i); + } + } + + return (respindex); +} +#endif + // CHERI CHANGES START // { // "updated": 20230509, diff --git a/sys/vm/swap_pager.h b/sys/vm/swap_pager.h index eeddff9073b5..eb0ef7aa7fe3 100644 --- a/sys/vm/swap_pager.h +++ b/sys/vm/swap_pager.h @@ -85,5 +85,11 @@ u_long swap_pager_swapped_pages(vm_object_t object); void swapoff_all(void); bool swap_pager_init_object(vm_object_t object, void *handle, struct ucred *cred, vm_ooffset_t size, vm_ooffset_t offset); + +#ifdef CHERI_CAPREVOKE +vm_pindex_t swap_pager_cheri_revoke_next(vm_object_t object, + vm_pindex_t pindex); +#endif + #endif /* _KERNEL */ #endif /* _VM_SWAP_PAGER_H_ */ diff --git a/sys/vm/vm_cheri_revoke.c b/sys/vm/vm_cheri_revoke.c index e1315298db39..4328f6741092 100644 --- a/sys/vm/vm_cheri_revoke.c +++ b/sys/vm/vm_cheri_revoke.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #include @@ -512,7 +513,6 @@ static bool vm_cheri_revoke_skip_fault(vm_object_t object) { return (cheri_revoke_avoid_faults && (object->flags & OBJ_SWAP) != 0 && - pctrie_is_empty(&object->un_pager.swp.swp_blks) && (object->backing_object == NULL || (object->backing_object->flags & OBJ_HASCAP) == 0)); } @@ -638,26 +638,24 @@ vm_cheri_revoke_object_at(const struct vm_cheri_revoke_cookie *crc, (void)vm_page_grab_valid(&m, obj, ipi, VM_ALLOC_NOZERO); if (m == NULL) { - /* Can we avoid calling vm_fault() when the page is not resident? */ + /* + * Can we avoid calling vm_fault() for non-resident pages, or + * for paged-out pages that do not contain capabilities? + */ if (vm_cheri_revoke_skip_fault(obj)) { - /* Look forward in the object's collection of pages */ - vm_page_t obj_next_pg = vm_page_find_least(obj, ipi); + vm_pindex_t nextpindex; + vm_offset_t lastoff; - vm_offset_t lastoff = - entry->end - entry->start + entry->offset; - - if ((obj_next_pg == NULL) || - (obj_next_pg->pindex >= OFF_TO_IDX(lastoff))) { + lastoff = entry->end - entry->start + entry->offset; + nextpindex = swap_pager_cheri_revoke_next(obj, ipi); + if (nextpindex >= OFF_TO_IDX(lastoff)) { CHERI_REVOKE_STATS_INC(crst, pages_skip_fast, (entry->end - addr) >> PAGE_SHIFT); *ooff = lastoff; } else { - KASSERT(obj_next_pg->object == obj, - ("Fast find page in bad object?")); - - *ooff = IDX_TO_OFF(obj_next_pg->pindex); CHERI_REVOKE_STATS_INC(crst, pages_skip_fast, - obj_next_pg->pindex - ipi); + nextpindex - ipi); + *ooff = IDX_TO_OFF(nextpindex); } return (VM_CHERI_REVOKE_AT_OK); } From 0577a329b90818591cddfaf5c1702903fdfc6bf5 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Mon, 5 Aug 2024 18:30:13 -0400 Subject: [PATCH 2/3] caprevoke: Skip VM objects that have OBJ_HASCAP clear This skips over vnode mappings that have PROT_READ_CAP set in max_prot and thus avoids a lot of needless scanning. Add some counters so that we can see the relative effectiveness of different checks in reducing the number of scanned virtual pages. --- sys/vm/vm_cheri_revoke.c | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/sys/vm/vm_cheri_revoke.c b/sys/vm/vm_cheri_revoke.c index 4328f6741092..402ffcd779d1 100644 --- a/sys/vm/vm_cheri_revoke.c +++ b/sys/vm/vm_cheri_revoke.c @@ -83,6 +83,16 @@ SYSCTL_COUNTER_U64(_vm_stats_cheri_revoke, OID_AUTO, restarts, CTLFLAG_RD, &cheri_revoke_restarts, "Number of VM map re-lookups"); +static COUNTER_U64_DEFINE_EARLY(cheri_skip_prot_no_readcap); +SYSCTL_COUNTER_U64(_vm_stats_cheri_revoke, OID_AUTO, skip_prot_no_readcap, CTLFLAG_RD, + &cheri_skip_prot_no_readcap, + "Virtual pages skipped in map entries without readcap permission"); + +static COUNTER_U64_DEFINE_EARLY(cheri_skip_obj_no_hascap); +SYSCTL_COUNTER_U64(_vm_stats_cheri_revoke, OID_AUTO, skip_obj_no_hascap, CTLFLAG_RD, + &cheri_skip_obj_no_hascap, + "Virtual pages skipped in VM objects with no capabilities"); + /***************************** KERNEL THREADS ***************************/ static MALLOC_DEFINE(M_REVOKE, "cheri_revoke", "cheri_revoke temporary data"); @@ -905,6 +915,7 @@ vm_cheri_revoke_map_entry(const struct vm_cheri_revoke_cookie *crc, vm_offset_t ooffset; vm_object_t obj; enum vm_cro_at res; + int flags; KASSERT(!(entry->eflags & MAP_ENTRY_IS_SUB_MAP), ("cheri_revoke SUB_MAP")); @@ -915,9 +926,33 @@ vm_cheri_revoke_map_entry(const struct vm_cheri_revoke_cookie *crc, if (!obj) goto fini; - /* Skip entire mappings that do not permit capability reads */ - if ((entry->max_protection & VM_PROT_READ_CAP) == 0) + /* + * Skip mappings outright if we know they can't bear capabilities. + * Specifically, if one of the following applies: + * 1. OBJ_NOCAP is set (which implies it is set in backing objects), as + * is the case for the quarantine bitmap, + * 2. the mapping cannot acquire PROT_READ_CAP permission for the rest + * of its existence, + * 3. OBJ_HASCAP is unset, meaning that the object and its backing + * object cannot bear capabilities, + * we can safely jump to the next map entry. + * + * The object flags are not toggled after initialization, so it is safe + * to check them without the object lock. + */ + flags = atomic_load_int(&obj->flags); + if ((flags & OBJ_NOCAP) != 0) goto fini; + if ((entry->max_protection & VM_PROT_READ_CAP) == 0) { + counter_u64_add(cheri_skip_prot_no_readcap, + atop(entry->end - *addr)); + goto fini; + } + if ((flags & OBJ_HASCAP) == 0) { + counter_u64_add(cheri_skip_obj_no_hascap, + atop(entry->end - *addr)); + goto fini; + } VM_OBJECT_WLOCK(obj); while (*addr < entry->end) { From f0dce4c00441ce7523e3675f05b73d7580a13cbd Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Mon, 5 Aug 2024 19:57:17 -0400 Subject: [PATCH 3/3] caprevoke: Bail if the target process exited during an async scan This avoids some needless work in cases where a short-lived process triggers async revocation. --- sys/vm/vm_cheri_revoke.c | 50 ++++++++++++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 12 deletions(-) diff --git a/sys/vm/vm_cheri_revoke.c b/sys/vm/vm_cheri_revoke.c index 402ffcd779d1..b2880a84d8e6 100644 --- a/sys/vm/vm_cheri_revoke.c +++ b/sys/vm/vm_cheri_revoke.c @@ -67,7 +67,8 @@ static void vm_cheri_revoke_pass_pre(vm_map_t); static void vm_cheri_revoke_pass_post(vm_map_t); -static int vm_cheri_revoke_pass_locked(const struct vm_cheri_revoke_cookie *); +static int vm_cheri_revoke_pass_locked(struct vmspace *, + const struct vm_cheri_revoke_cookie *); static SYSCTL_NODE(_vm_stats, OID_AUTO, cheri_revoke, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, @@ -93,6 +94,11 @@ SYSCTL_COUNTER_U64(_vm_stats_cheri_revoke, OID_AUTO, skip_obj_no_hascap, CTLFLAG &cheri_skip_obj_no_hascap, "Virtual pages skipped in VM objects with no capabilities"); +static COUNTER_U64_DEFINE_EARLY(cheri_last_ref_early_finish); +SYSCTL_COUNTER_U64(_vm_stats_cheri_revoke, OID_AUTO, last_ref_early_finish, CTLFLAG_RD, + &cheri_last_ref_early_finish, + "Scans finished early because the target exited"); + /***************************** KERNEL THREADS ***************************/ static MALLOC_DEFINE(M_REVOKE, "cheri_revoke", "cheri_revoke temporary data"); @@ -182,6 +188,7 @@ vm_cheri_revoke_kproc(void *arg __unused) mtx_unlock(&async_revoke_mtx); vmspace_switch_aio(arc->vm); + vmspace_free(arc->vm); /* * Advance the async state machine. @@ -192,7 +199,7 @@ vm_cheri_revoke_kproc(void *arg __unused) /* * Do the actual revocation pass. */ - error = vm_cheri_revoke_pass_locked(&arc->cookie); + error = vm_cheri_revoke_pass_locked(arc->vm, &arc->cookie); /* * A revocation pass is done. Advance the state machine again @@ -200,7 +207,6 @@ vm_cheri_revoke_kproc(void *arg __unused) */ vm_cheri_revoke_pass_async_post(map, error); - vmspace_free(arc->vm); free(arc, M_REVOKE); } @@ -907,10 +913,14 @@ vm_cheri_revoke_object_at(const struct vm_cheri_revoke_cookie *crc, * The map must be read-locked on entry and will be read-locked on exit, but * the lock may be dropped internally. The map must, therefore, also be * held across invocation. + * + * The containing vmspace may be referenced by the "vm" parameter to signal that + * the revocation sweep is asynchronous and should terminate if the caller ends + * up holding the final vmspace reference. */ static int vm_cheri_revoke_map_entry(const struct vm_cheri_revoke_cookie *crc, - vm_map_entry_t entry, vm_offset_t *addr) + struct vmspace *vm, vm_map_entry_t entry, vm_offset_t *addr) { vm_offset_t ooffset; vm_object_t obj; @@ -961,6 +971,13 @@ vm_cheri_revoke_map_entry(const struct vm_cheri_revoke_cookie *crc, vm_offset_t oaddr = *addr; #endif + /* Has the target process already exited? */ + if (vm != NULL && refcount_load(&vm->vm_refcnt) == 1) { + VM_OBJECT_WUNLOCK(obj); + counter_u64_add(cheri_last_ref_early_finish, 1); + return (KERN_NOT_RECEIVER); + } + /* Find ourselves in this object */ ooffset = *addr - entry->start + entry->offset; @@ -1033,7 +1050,8 @@ vm_cheri_revoke_pass_post(vm_map_t map) * returning. */ static int -vm_cheri_revoke_pass_locked(const struct vm_cheri_revoke_cookie *crc) +vm_cheri_revoke_pass_locked(struct vmspace *vm, + const struct vm_cheri_revoke_cookie *crc) { int res = KERN_SUCCESS; const vm_map_t map = crc->map; @@ -1056,13 +1074,21 @@ vm_cheri_revoke_pass_locked(const struct vm_cheri_revoke_cookie *crc) * XXX Somewhere around here we should be resetting * MPROT_QUARANTINE'd map entries to be usable again, yes? */ - res = vm_cheri_revoke_map_entry(crc, entry, &addr); + res = vm_cheri_revoke_map_entry(crc, vm, entry, &addr); - /* - * We might be bailing out because a page fault failed for - * catastrophic reasons (or polite ones like ptrace()). - */ - if (res != KERN_SUCCESS) { + switch (res) { + case KERN_SUCCESS: + break; + case KERN_NOT_RECEIVER: + /* The vmspace is orphaned, there is nothing to do. */ + res = KERN_SUCCESS; + goto out; + default: + /* + * We might be bailing out because a page fault failed + * for catastrophic reasons (or polite ones like + * ptrace()). + */ printf("CHERI revoke bail va=%lx res=%d\n", addr, res); goto out; } @@ -1106,7 +1132,7 @@ vm_cheri_revoke_pass(const struct vm_cheri_revoke_cookie *crc) int res; vm_cheri_revoke_pass_pre(crc->map); - res = vm_cheri_revoke_pass_locked(crc); + res = vm_cheri_revoke_pass_locked(NULL, crc); vm_cheri_revoke_pass_post(crc->map); return (res);