From d159e54a127d52a58ea81138281b2569fd499903 Mon Sep 17 00:00:00 2001 From: Katelyn Gadd Date: Thu, 4 Apr 2024 14:34:57 -0700 Subject: [PATCH 1/5] Add new [ptr, ptr] -> ptr simdhash variant for caching --- src/mono/mono/metadata/CMakeLists.txt | 3 +- src/native/containers/containers.cmake | 2 + .../containers/dn-simdhash-ptrpair-ptr.c | 39 +++++++++++++++++++ .../containers/dn-simdhash-specializations.h | 15 +++++++ src/native/containers/dn-simdhash.c | 13 +++++++ src/native/containers/dn-simdhash.h | 5 +++ 6 files changed, 76 insertions(+), 1 deletion(-) create mode 100644 src/native/containers/dn-simdhash-ptrpair-ptr.c diff --git a/src/mono/mono/metadata/CMakeLists.txt b/src/mono/mono/metadata/CMakeLists.txt index 9efd5ac0079c24..9d6010e95f3286 100644 --- a/src/mono/mono/metadata/CMakeLists.txt +++ b/src/mono/mono/metadata/CMakeLists.txt @@ -45,7 +45,8 @@ endif() set(imported_native_sources ../../../native/containers/dn-simdhash.c ../../../native/containers/dn-simdhash-string-ptr.c - ../../../native/containers/dn-simdhash-u32-ptr.c) + ../../../native/containers/dn-simdhash-u32-ptr.c + ../../../native/containers/dn-simdhash-ptrpair-ptr.c) set(metadata_common_sources appdomain.c diff --git a/src/native/containers/containers.cmake b/src/native/containers/containers.cmake index 16c41eab5619f8..4749dceea2dd9a 100644 --- a/src/native/containers/containers.cmake +++ b/src/native/containers/containers.cmake @@ -13,6 +13,8 @@ list(APPEND SHARED_CONTAINER_SOURCES # dn-simdhash-string-ptr.c # dn-simdhash-u32-ptr.c # dn-simdhash-ptr-ptr.c + # dn-simdhash-ght-compatible.c + # dn-simdhash-ptrpair-ptr.c ) list(APPEND SHARED_CONTAINER_HEADERS diff --git a/src/native/containers/dn-simdhash-ptrpair-ptr.c b/src/native/containers/dn-simdhash-ptrpair-ptr.c new file mode 100644 index 00000000000000..d377647b6636ac --- /dev/null +++ b/src/native/containers/dn-simdhash-ptrpair-ptr.c @@ -0,0 +1,39 @@ +// Licensed to the .NET Foundation under one or more agreements. 
+// The .NET Foundation licenses this file to you under the MIT license. + +#include <config.h> +#include "dn-simdhash.h" + +#include "dn-simdhash-utils.h" + +typedef struct dn_ptrpair_t { + void *first; + void *second; +} dn_ptrpair_t; + +static inline uint32_t +dn_ptrpair_t_hash (dn_ptrpair_t key) +{ + return (MurmurHash3_32_ptr(key.first, 0) ^ MurmurHash3_32_ptr(key.second, 1)); +} + +static inline uint8_t +dn_ptrpair_t_equals (dn_ptrpair_t lhs, dn_ptrpair_t rhs) +{ + return (lhs.first == rhs.first) && (lhs.second == rhs.second); +} + +#define DN_SIMDHASH_T dn_simdhash_ptrpair_ptr +#define DN_SIMDHASH_KEY_T dn_ptrpair_t +#define DN_SIMDHASH_VALUE_T void * +#define DN_SIMDHASH_KEY_HASHER(hash, key) dn_ptrpair_t_hash(key) +#define DN_SIMDHASH_KEY_EQUALS(hash, lhs, rhs) dn_ptrpair_t_equals(lhs, rhs) +#if SIZEOF_VOID_P == 8 +// 192 bytes holds 12 16-byte blocks, so 11 keys and one suffix table +#define DN_SIMDHASH_BUCKET_CAPACITY 11 +#else +// 128 bytes holds 16 8-byte blocks, so 14 keys and one suffix table +#define DN_SIMDHASH_BUCKET_CAPACITY 14 +#endif + +#include "dn-simdhash-specialization.h" diff --git a/src/native/containers/dn-simdhash-specializations.h b/src/native/containers/dn-simdhash-specializations.h index 4966c7575d19a0..9533edfc5f3d1f 100644 --- a/src/native/containers/dn-simdhash-specializations.h +++ b/src/native/containers/dn-simdhash-specializations.h @@ -59,4 +59,19 @@ typedef struct dn_simdhash_str_key dn_simdhash_str_key; #include "dn-simdhash-ght-compatible.h" + +typedef struct dn_ptrpair_t { + void *first, *second; +} dn_ptrpair_t; + +#define DN_SIMDHASH_T dn_simdhash_ptrpair_ptr +#define DN_SIMDHASH_KEY_T dn_ptrpair_t +#define DN_SIMDHASH_VALUE_T void * + +#include "dn-simdhash-specialization-declarations.h" + +#undef DN_SIMDHASH_T +#undef DN_SIMDHASH_KEY_T +#undef DN_SIMDHASH_VALUE_T + #endif diff --git a/src/native/containers/dn-simdhash.c b/src/native/containers/dn-simdhash.c index 03d4d2bf3951aa..31c5f94a0cfc81 100644 --- 
a/src/native/containers/dn-simdhash.c +++ b/src/native/containers/dn-simdhash.c @@ -140,6 +140,19 @@ dn_simdhash_count (dn_simdhash_t *hash) return hash->count; } +uint32_t +dn_simdhash_overflow_count (dn_simdhash_t *hash) +{ + assert(hash); + uint32_t result = 0; + for (uint32_t bucket_index = 0; bucket_index < hash->buffers.buckets_length; bucket_index++) { + uint8_t *suffixes = ((uint8_t *)hash->buffers.buckets) + (bucket_index * hash->meta->bucket_size_bytes); + uint8_t cascade_count = suffixes[DN_SIMDHASH_CASCADED_SLOT]; + result += cascade_count; + } + return result; +} + void dn_simdhash_ensure_capacity (dn_simdhash_t *hash, uint32_t capacity) { diff --git a/src/native/containers/dn-simdhash.h b/src/native/containers/dn-simdhash.h index da4a7914e18873..a2d6e87c9045fe 100644 --- a/src/native/containers/dn-simdhash.h +++ b/src/native/containers/dn-simdhash.h @@ -144,6 +144,11 @@ dn_simdhash_capacity (dn_simdhash_t *hash); uint32_t dn_simdhash_count (dn_simdhash_t *hash); +// Returns the estimated number of items that have overflowed out of a bucket. +// WARNING: This is expensive to calculate. +uint32_t +dn_simdhash_overflow_count (dn_simdhash_t *hash); + // Automatically resizes the table if it is too small to hold the requested number // of items. Will not shrink the table if it is already bigger. 
void From 60ae192b45bbae95d5a4af7a39057b8480f40737 Mon Sep 17 00:00:00 2001 From: Katelyn Gadd Date: Thu, 4 Apr 2024 15:00:30 -0700 Subject: [PATCH 2/5] Cache mono_class_implement_interface_slow because we perform many redundant calls to it during application startup --- src/mono/mono/metadata/class-setup-vtable.c | 7 +- src/mono/mono/metadata/class.c | 105 +++++++++++++++++++- src/native/containers/dn-simdhash.c | 3 +- 3 files changed, 107 insertions(+), 8 deletions(-) diff --git a/src/mono/mono/metadata/class-setup-vtable.c b/src/mono/mono/metadata/class-setup-vtable.c index 9a235c1ec0cced..9cbe568d81dcf5 100644 --- a/src/mono/mono/metadata/class-setup-vtable.c +++ b/src/mono/mono/metadata/class-setup-vtable.c @@ -773,6 +773,11 @@ mono_method_get_method_definition (MonoMethod *method) static gboolean verify_class_overrides (MonoClass *klass, MonoMethod **overrides, int onum) { +#ifndef DEBUG + if (klass->image == mono_defaults.corlib) + return TRUE; +#endif + int i; for (i = 0; i < onum; ++i) { @@ -1760,7 +1765,7 @@ mono_class_setup_vtable_general (MonoClass *klass, MonoMethod **overrides, int o MonoMethod *override = iface_overrides [i*2 + 1]; if (mono_class_is_gtd (override->klass)) { override = mono_class_inflate_generic_method_full_checked (override, ic, mono_class_get_context (ic), error); - } + } // there used to be code here to inflate decl if decl->is_inflated, but in https://github.com/dotnet/runtime/pull/64102#discussion_r790019545 we // think that this does not correspond to any real code. 
if (!apply_override (klass, ic, vtable, decl, override, &override_map, &override_class_map, &conflict_map)) diff --git a/src/mono/mono/metadata/class.c b/src/mono/mono/metadata/class.c index 5ae4f1981d38ac..0fcb3c6e9709f7 100644 --- a/src/mono/mono/metadata/class.c +++ b/src/mono/mono/metadata/class.c @@ -4331,12 +4331,16 @@ mono_class_is_variant_compatible_slow (MonoClass *klass, MonoClass *oklass) } return TRUE; } -/*Check if @candidate implements the interface @target*/ + static gboolean -mono_class_implement_interface_slow (MonoClass *target, MonoClass *candidate) +mono_class_implement_interface_slow_cached (MonoClass *target, MonoClass *candidate, dn_simdhash_ptrpair_ptr_t *cache); + +static gboolean +mono_class_implement_interface_slow_uncached (MonoClass *target, MonoClass *candidate, dn_simdhash_ptrpair_ptr_t *cache) { ERROR_DECL (error); int i; + gboolean is_variant = mono_class_has_variant_generic_params (target); if (is_variant && MONO_CLASS_IS_INTERFACE_INTERNAL (candidate)) { @@ -4344,6 +4348,17 @@ mono_class_implement_interface_slow (MonoClass *target, MonoClass *candidate) return TRUE; } + /* + MonoVTable *vt = m_class_is_inited (target) ? 
m_class_get_runtime_vtable (target) : NULL; + if (vt) { + g_printf ( + "vtable fast path in implement_interface_slow for '%s.%s'\n", + m_class_get_name_space (target), m_class_get_name (target) + ); + return MONO_VTABLE_IMPLEMENTS_INTERFACE (vt, m_class_get_interface_id (candidate)); + } + */ + do { if (candidate == target) return TRUE; @@ -4365,7 +4380,7 @@ mono_class_implement_interface_slow (MonoClass *target, MonoClass *candidate) return TRUE; if (is_variant && mono_class_is_variant_compatible_slow (target, iface_class)) return TRUE; - if (mono_class_implement_interface_slow (target, iface_class)) + if (mono_class_implement_interface_slow_cached (target, iface_class, cache)) return TRUE; } } @@ -4390,7 +4405,7 @@ mono_class_implement_interface_slow (MonoClass *target, MonoClass *candidate) if (is_variant && mono_class_is_variant_compatible_slow (target, candidate_interfaces [i])) return TRUE; - if (mono_class_implement_interface_slow (target, candidate_interfaces [i])) + if (mono_class_implement_interface_slow_cached (target, candidate_interfaces [i], cache)) return TRUE; } } @@ -4400,6 +4415,85 @@ mono_class_implement_interface_slow (MonoClass *target, MonoClass *candidate) return FALSE; } +#define LOG_INTERFACE_CACHE_HITS 0 + +#if LOG_INTERFACE_CACHE_HITS +static gint64 implement_interface_hits = 0, implement_interface_misses = 0; + +static void +log_hit_rate (dn_simdhash_ptrpair_ptr_t *cache) +{ + gint64 total_calls = implement_interface_hits + implement_interface_misses; + if ((total_calls % 500) != 0) + return; + double hit_rate = implement_interface_hits * 100.0 / total_calls; + g_printf ("implement_interface cache hit rate: %f (%lld total calls). 
Overflow count: %u\n", hit_rate, total_calls, dn_simdhash_overflow_count (cache)); +} +#endif + +static gboolean +mono_class_implement_interface_slow_cached (MonoClass *target, MonoClass *candidate, dn_simdhash_ptrpair_ptr_t *cache) +{ + // Skip the caching logic for exact matches + if (candidate == target) + return TRUE; + + gpointer cached_result = NULL; + gboolean result; + dn_ptrpair_t key = { target, candidate }; + if (dn_simdhash_ptrpair_ptr_try_get_value (cache, key, &cached_result)) { + // Testing shows a cache hit rate of 60% on S.R.Tests and S.T.J.Tests, + // and 40-50% for small app startup. Near-zero overflow count. +#if LOG_INTERFACE_CACHE_HITS + implement_interface_hits++; + log_hit_rate (cache); +#endif + return (cached_result != NULL); + } + result = mono_class_implement_interface_slow_uncached (target, candidate, cache); + dn_simdhash_ptrpair_ptr_try_add (cache, key, result ? GUINT_TO_POINTER(1) : NULL); +#if LOG_INTERFACE_CACHE_HITS + implement_interface_misses++; + log_hit_rate (cache); +#endif + return result; +} + +static dn_simdhash_ptrpair_ptr_t *implement_interface_scratch_cache = NULL; + +/*Check if @candidate implements the interface @target*/ +static gboolean +mono_class_implement_interface_slow (MonoClass *target, MonoClass *candidate) +{ + gpointer cas_result; + gboolean result; + dn_simdhash_ptrpair_ptr_t *cache = (dn_simdhash_ptrpair_ptr_t *)mono_atomic_xchg_ptr ((volatile gpointer *)&implement_interface_scratch_cache, NULL); + if (!cache) + // Roughly 64KB of memory usage and big enough to have fast lookups + // Smaller is viable but makes the hit rate worse + cache = dn_simdhash_ptrpair_ptr_new (2048, NULL); + else if (dn_simdhash_count (cache) >= 2250) { + // FIXME: 2250 is arbitrary (roughly 256 11-item buckets w/load factor) + // One step down reduces hit rate by approximately 2-4% + // HACK: Only clear the scratch cache once it gets too big. 
+ // The pattern is that (especially during startup), we have lots + // of mono_class_implement_interface_slow calls back to back that + // perform similar checks, so keeping the cache data around between + // sequential calls will potentially optimize them a lot. + dn_simdhash_clear (cache); + } + + result = mono_class_implement_interface_slow_cached (target, candidate, cache); + + cas_result = mono_atomic_cas_ptr ((volatile gpointer *)&implement_interface_scratch_cache, cache, NULL); + if (cas_result != NULL) { + g_printf ("freeing extra implement_interface cache\n"); + dn_simdhash_free (cache); + } + + return result; +} + /* * Check if @oklass can be assigned to @klass. * This function does the same as mono_class_is_assignable_from_internal but is safe to be used from mono_class_init_internal context. @@ -4416,8 +4510,9 @@ mono_class_is_assignable_from_slow (MonoClass *target, MonoClass *candidate) return TRUE; /*If target is not an interface there is no need to check them.*/ - if (MONO_CLASS_IS_INTERFACE_INTERNAL (target)) + if (MONO_CLASS_IS_INTERFACE_INTERNAL (target)) { return mono_class_implement_interface_slow (target, candidate); + } if (m_class_is_delegate (target) && mono_class_has_variant_generic_params (target)) return mono_class_is_variant_compatible (target, candidate, FALSE); diff --git a/src/native/containers/dn-simdhash.c b/src/native/containers/dn-simdhash.c index 31c5f94a0cfc81..d1e2b6e330b6ef 100644 --- a/src/native/containers/dn-simdhash.c +++ b/src/native/containers/dn-simdhash.c @@ -119,8 +119,7 @@ dn_simdhash_clear (dn_simdhash_t *hash) if (hash->vtable.destroy_all) hash->vtable.destroy_all(hash); hash->count = 0; - // TODO: Scan through buckets sequentially and only erase ones with data in them - // Maybe skip erasing the key slots too? 
+ // TODO: Implement a fast clear algorithm that scans buckets and only clears ones w/nonzero count memset(hash->buffers.buckets, 0, hash->buffers.buckets_length * hash->meta->bucket_size_bytes); // Skip this for performance; memset is especially slow in wasm // memset(hash->buffers.values, 0, hash->buffers.values_length * hash->meta->value_size); From a688d373ec5c48d78253b70e4d6b10765ef8d760 Mon Sep 17 00:00:00 2001 From: Katelyn Gadd Date: Tue, 16 Apr 2024 20:41:48 -0700 Subject: [PATCH 3/5] Cleanup --- src/mono/mono/metadata/class.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/mono/mono/metadata/class.c b/src/mono/mono/metadata/class.c index 0fcb3c6e9709f7..695e93815fe3ba 100644 --- a/src/mono/mono/metadata/class.c +++ b/src/mono/mono/metadata/class.c @@ -4485,11 +4485,11 @@ mono_class_implement_interface_slow (MonoClass *target, MonoClass *candidate) result = mono_class_implement_interface_slow_cached (target, candidate, cache); + // Under most circumstances we won't have multiple threads competing to run implement_interface_slow, + // so it's not worth making this thread-local and potentially keeping a cache instance around per-thread. 
cas_result = mono_atomic_cas_ptr ((volatile gpointer *)&implement_interface_scratch_cache, cache, NULL); - if (cas_result != NULL) { - g_printf ("freeing extra implement_interface cache\n"); + if (cas_result != NULL) dn_simdhash_free (cache); - } return result; } From 95f99a88803e77eb9874f0adbf92af50d1526e78 Mon Sep 17 00:00:00 2001 From: Katelyn Gadd Date: Fri, 19 Apr 2024 13:41:28 -0700 Subject: [PATCH 4/5] Better ndebug/debug --- src/mono/mono/metadata/class-setup-vtable.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/mono/mono/metadata/class-setup-vtable.c b/src/mono/mono/metadata/class-setup-vtable.c index 9cbe568d81dcf5..21b00aad630b02 100644 --- a/src/mono/mono/metadata/class-setup-vtable.c +++ b/src/mono/mono/metadata/class-setup-vtable.c @@ -773,7 +773,9 @@ mono_method_get_method_definition (MonoMethod *method) static gboolean verify_class_overrides (MonoClass *klass, MonoMethod **overrides, int onum) { -#ifndef DEBUG + // on windows and arm, we define NDEBUG for release builds + // on browser and wasi, we define DEBUG for debug builds +#if defined(NDEBUG) || !defined(DEBUG) if (klass->image == mono_defaults.corlib) return TRUE; #endif From 4418719c44cf4bf058a3838eb5e825f507bf8bec Mon Sep 17 00:00:00 2001 From: Katelyn Gadd Date: Mon, 22 Apr 2024 14:08:17 -0700 Subject: [PATCH 5/5] Address PR feedback Verify cache in checked builds --- src/mono/mono/metadata/class-setup-vtable.c | 2 +- src/mono/mono/metadata/class.c | 55 ++++++++++++--------- 2 files changed, 34 insertions(+), 23 deletions(-) diff --git a/src/mono/mono/metadata/class-setup-vtable.c b/src/mono/mono/metadata/class-setup-vtable.c index 21b00aad630b02..62afa76dde5b74 100644 --- a/src/mono/mono/metadata/class-setup-vtable.c +++ b/src/mono/mono/metadata/class-setup-vtable.c @@ -775,7 +775,7 @@ verify_class_overrides (MonoClass *klass, MonoMethod **overrides, int onum) { // on windows and arm, we define NDEBUG for release builds // on browser and wasi, we define DEBUG 
for debug builds -#if defined(NDEBUG) || !defined(DEBUG) +#ifdef ENABLE_CHECKED_BUILD if (klass->image == mono_defaults.corlib) return TRUE; #endif diff --git a/src/mono/mono/metadata/class.c b/src/mono/mono/metadata/class.c index 695e93815fe3ba..6045628bf49e7f 100644 --- a/src/mono/mono/metadata/class.c +++ b/src/mono/mono/metadata/class.c @@ -4348,17 +4348,6 @@ mono_class_implement_interface_slow_uncached (MonoClass *target, MonoClass *cand return TRUE; } - /* - MonoVTable *vt = m_class_is_inited (target) ? m_class_get_runtime_vtable (target) : NULL; - if (vt) { - g_printf ( - "vtable fast path in implement_interface_slow for '%s.%s'\n", - m_class_get_name_space (target), m_class_get_name (target) - ); - return MONO_VTABLE_IMPLEMENTS_INTERFACE (vt, m_class_get_interface_id (candidate)); - } - */ - do { if (candidate == target) return TRUE; @@ -4415,7 +4404,7 @@ mono_class_implement_interface_slow_uncached (MonoClass *target, MonoClass *cand return FALSE; } -#define LOG_INTERFACE_CACHE_HITS 0 +// #define LOG_INTERFACE_CACHE_HITS 1 #if LOG_INTERFACE_CACHE_HITS static gint64 implement_interface_hits = 0, implement_interface_misses = 0; @@ -4434,29 +4423,51 @@ log_hit_rate (dn_simdhash_ptrpair_ptr_t *cache) static gboolean mono_class_implement_interface_slow_cached (MonoClass *target, MonoClass *candidate, dn_simdhash_ptrpair_ptr_t *cache) { + gpointer cached_result = NULL; + dn_ptrpair_t key = { target, candidate }; + gboolean result = 0, cache_hit = 0; + // Skip the caching logic for exact matches if (candidate == target) return TRUE; - gpointer cached_result = NULL; - gboolean result; - dn_ptrpair_t key = { target, candidate }; - if (dn_simdhash_ptrpair_ptr_try_get_value (cache, key, &cached_result)) { + cache_hit = dn_simdhash_ptrpair_ptr_try_get_value (cache, key, &cached_result); + if (cache_hit) { // Testing shows a cache hit rate of 60% on S.R.Tests and S.T.J.Tests, // and 40-50% for small app startup. Near-zero overflow count. 
#if LOG_INTERFACE_CACHE_HITS implement_interface_hits++; log_hit_rate (cache); #endif - return (cached_result != NULL); + result = (cached_result != NULL); +#ifndef ENABLE_CHECKED_BUILD + return result; +#endif } - result = mono_class_implement_interface_slow_uncached (target, candidate, cache); - dn_simdhash_ptrpair_ptr_try_add (cache, key, result ? GUINT_TO_POINTER(1) : NULL); + + gboolean uncached_result = mono_class_implement_interface_slow_uncached (target, candidate, cache); + + if (!cache_hit) { #if LOG_INTERFACE_CACHE_HITS - implement_interface_misses++; - log_hit_rate (cache); + implement_interface_misses++; + log_hit_rate (cache); #endif - return result; + dn_simdhash_ptrpair_ptr_try_add (cache, key, uncached_result ? GUINT_TO_POINTER(1) : NULL); + } + +#ifdef ENABLE_CHECKED_BUILD + if (cache_hit) { + if (result != uncached_result) + g_print ( + "Cache mismatch for %s.%s and %s.%s: cached=%d, uncached=%d\n", + m_class_get_name_space (target), m_class_get_name (target), + m_class_get_name_space (candidate), m_class_get_name (candidate), + result, uncached_result + ); + g_assert (result == uncached_result); + } +#endif + return uncached_result; } static dn_simdhash_ptrpair_ptr_t *implement_interface_scratch_cache = NULL;