From 44a2c54c9d3956f072eff890120dde4ac178edf8 Mon Sep 17 00:00:00 2001
From: Developer-Ecosystem-Engineering <65677710+Developer-Ecosystem-Engineering@users.noreply.github.com>
Date: Mon, 11 Jul 2022 23:36:08 -0700
Subject: [PATCH] Use macOS 12 / iOS 15 perflevel sysctls for better cache
 info on Apple platforms

---
 src/arm/mach/init.c | 884 +++++++++++++++++++++++++++++++++++---------
 tools/cache-info.c  |  63 +++-
 2 files changed, 775 insertions(+), 172 deletions(-)

diff --git a/src/arm/mach/init.c b/src/arm/mach/init.c
index dbea578c..79b31b20 100644
--- a/src/arm/mach/init.c
+++ b/src/arm/mach/init.c
@@ -14,6 +14,12 @@
 #include <arm/midr.h>
 #include <cpuinfo/log.h>
 
+#define SAFE_FREE_NULL(x) \
+	do { \
+		if ((x)) free((x)); \
+		(x) = NULL; \
+	} while (0)
+
 /* Polyfill recent CPUFAMILY_ARM_* values for older SDKs */
 #ifndef CPUFAMILY_ARM_MONSOON_MISTRAL
 	#define CPUFAMILY_ARM_MONSOON_MISTRAL 0xE81E7EF6
@@ -54,6 +60,54 @@ struct cpuinfo_arm_isa cpuinfo_isa = {
 #endif
 };
 
+struct cache_array {
+	struct cpuinfo_cache *caches;
+	uint32_t count;
+};
+
+/*
+ * iOS 15 and macOS Monterey 12 added sysctls to describe configurations
+ * where not all cores are the same (number of cores, cache sizes).
+ *
+ * Each perflevel sysctl has a prefix of `hw.perflevel??.`, where ?? is the
+ * perflevel index, starting at zero. The total number of perflevels is
+ * exposed via the `hw.nperflevels` sysctl. Higher-performance perflevels
+ * have lower indexes.
+ *
+ * sysctls:
+ * - hw.nperflevels - number of different types of cores / cache configs (perflevels)
+ * - hw.perflevel??
+ *   - .physicalcpu     - number of enabled physical cores for perflevel ??
+ *   - .physicalcpu_max - number of physical cores for perflevel ??
+ *   - .logicalcpu      - number of enabled logical cores for perflevel ??
+ *   - .logicalcpu_max  - number of logical cores for perflevel ??
+ *   - .l1icachesize    - size in bytes of the L1 instruction cache for cores in perflevel ??
+ *   - .l1dcachesize    - size in bytes of the L1 data cache for cores in perflevel ??
+ *   - .l2cachesize     - size in bytes of the L2 data cache for cores in perflevel ??
+ *   - .cpusperl2       - number of cores that share an L2 cache in perflevel ??
+ *   - .l3cachesize     - size in bytes of the L3 data cache for cores in perflevel ??
+ *   - .cpusperl3       - number of cores that share an L3 cache in perflevel ??
+ *
+ * Technically, these perflevels could live in src/mach/api.h since they are
+ * supported across architectures (x86_64 and arm64), but x86_64 doesn't
+ * currently have multiple perflevels, so there is little benefit there for now.
+ */
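
For context, these keys read like any other integer sysctl. A minimal standalone sketch, not part of the patch, assuming the 32-bit value widths the patch itself uses and that perflevel index 0 exists whenever hw.nperflevels reads successfully:

	#include <stdint.h>
	#include <stdio.h>
	#include <sys/sysctl.h>

	int main(void) {
		uint32_t nperflevels = 0;
		size_t size = sizeof(nperflevels);
		/* Fails with ENOENT on OS releases that predate the perflevel sysctls */
		if (sysctlbyname("hw.nperflevels", &nperflevels, &size, NULL, 0) != 0) {
			perror("hw.nperflevels");
			return 1;
		}
		uint32_t l2size = 0;
		size = sizeof(l2size);
		if (sysctlbyname("hw.perflevel0.l2cachesize", &l2size, &size, NULL, 0) == 0) {
			printf("%u perflevels; perflevel0 L2 size: %u bytes\n",
				(unsigned) nperflevels, (unsigned) l2size);
		}
		return 0;
	}
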
+ */ +struct mach_perflevel { + uint32_t physicalcpu; + uint32_t physicalcpu_max; + uint32_t logicalcpu; + uint32_t logicalcpu_max; + uint32_t l1icachesize; + uint32_t l1dcachesize; + uint32_t l2cachesize; + uint32_t cpusperl2; + uint32_t l3cachesize; + uint32_t cpusperl3; + + uint32_t core_start; /* first core index this perflevel describes */ + uint32_t processor_start; /* first processor index this perflevel describes */ +}; + static uint32_t get_sys_info(int type_specifier, const char* name) { size_t size = 0; uint32_t result = 0; @@ -128,7 +182,7 @@ static enum cpuinfo_uarch decode_uarch(uint32_t cpu_family, uint32_t cpu_subtype #endif } -static void decode_package_name(char* package_name) { +static void decode_hw_machine_package_name(char* package_name) { size_t size; if (sysctlbyname("hw.machine", NULL, &size, NULL, 0) != 0) { cpuinfo_log_warning("sysctlbyname(\"hw.machine\") failed: %s", strerror(errno)); @@ -252,55 +306,30 @@ static void decode_package_name(char* package_name) { } } -void cpuinfo_arm_mach_init(void) { - struct cpuinfo_processor* processors = NULL; - struct cpuinfo_core* cores = NULL; - struct cpuinfo_cluster* clusters = NULL; - struct cpuinfo_package* packages = NULL; - struct cpuinfo_uarch_info* uarchs = NULL; - struct cpuinfo_cache* l1i = NULL; - struct cpuinfo_cache* l1d = NULL; - struct cpuinfo_cache* l2 = NULL; - struct cpuinfo_cache* l3 = NULL; - - struct cpuinfo_mach_topology mach_topology = cpuinfo_mach_detect_topology(); - processors = calloc(mach_topology.threads, sizeof(struct cpuinfo_processor)); - if (processors == NULL) { - cpuinfo_log_error("failed to allocate %zu bytes for descriptions of %"PRIu32" logical processors", - mach_topology.threads * sizeof(struct cpuinfo_processor), mach_topology.threads); - goto cleanup; - } - cores = calloc(mach_topology.cores, sizeof(struct cpuinfo_core)); - if (cores == NULL) { - cpuinfo_log_error("failed to allocate %zu bytes for descriptions of %"PRIu32" cores", - mach_topology.cores * sizeof(struct cpuinfo_core), mach_topology.cores); - goto cleanup; - } - packages = calloc(mach_topology.packages, sizeof(struct cpuinfo_package)); - if (packages == NULL) { - cpuinfo_log_error("failed to allocate %zu bytes for descriptions of %"PRIu32" packages", - mach_topology.packages * sizeof(struct cpuinfo_package), mach_topology.packages); - goto cleanup; +static void read_package_name(char* package_name) { + decode_hw_machine_package_name(package_name); + if (package_name[0] != '\0') { + return; } - const uint32_t threads_per_core = mach_topology.threads / mach_topology.cores; - const uint32_t threads_per_package = mach_topology.threads / mach_topology.packages; - const uint32_t cores_per_package = mach_topology.cores / mach_topology.packages; + /* Try to pull package name from machdep.cpu.brand_string */ + size_t size; + if (sysctlbyname("machdep.cpu.brand_string", NULL, &size, NULL, 0) != 0) { + cpuinfo_log_warning("sysctlbyname(\"machdep.cpu.brand_string\") failed: %s", strerror(errno)); + return; + } - for (uint32_t i = 0; i < mach_topology.packages; i++) { - packages[i] = (struct cpuinfo_package) { - .processor_start = i * threads_per_package, - .processor_count = threads_per_package, - .core_start = i * cores_per_package, - .core_count = cores_per_package, - }; - decode_package_name(packages[i].name); + char *brand_string = alloca(size); + if (sysctlbyname("machdep.cpu.brand_string", brand_string, &size, NULL, 0) != 0) { + cpuinfo_log_warning("sysctlbyname(\"machdep.cpu.brand_string\") failed: %s", strerror(errno)); + 
-	const uint32_t cpu_family = get_sys_info_by_name("hw.cpufamily");
-	const uint32_t cpu_type = get_sys_info_by_name("hw.cputype");
-	const uint32_t cpu_subtype = get_sys_info_by_name("hw.cpusubtype");
+static void detect_isa(uint32_t cpu_family, uint32_t cpu_type, uint32_t cpu_subtype) {
 	switch (cpu_type) {
 		case CPU_TYPE_ARM64:
 			cpuinfo_isa.aes = true;
@@ -365,81 +394,168 @@ void cpuinfo_arm_mach_init(void) {
 		case CPUFAMILY_ARM_FIRESTORM_ICESTORM:
 			cpuinfo_isa.dot = true;
 	}
+}
 
-	uint32_t num_clusters = 1;
-	for (uint32_t i = 0; i < mach_topology.cores; i++) {
-		cores[i] = (struct cpuinfo_core) {
-			.processor_start = i * threads_per_core,
-			.processor_count = threads_per_core,
-			.core_id = i % cores_per_package,
-			.package = packages + i / cores_per_package,
-			.vendor = cpuinfo_vendor_apple,
-			.uarch = decode_uarch(cpu_family, cpu_subtype, i, mach_topology.cores),
-		};
-		if (i != 0 && cores[i].uarch != cores[i - 1].uarch) {
-			num_clusters++;
-		}
-	}
-
-	for (uint32_t i = 0; i < mach_topology.threads; i++) {
-		const uint32_t smt_id = i % threads_per_core;
-		const uint32_t core_id = i / threads_per_core;
-		const uint32_t package_id = i / threads_per_package;
-		processors[i].smt_id = smt_id;
-		processors[i].core = &cores[core_id];
-		processors[i].package = &packages[package_id];
-	}
-
-	clusters = calloc(num_clusters, sizeof(struct cpuinfo_cluster));
-	if (clusters == NULL) {
-		cpuinfo_log_error(
-			"failed to allocate %zu bytes for descriptions of %"PRIu32" clusters",
-			num_clusters * sizeof(struct cpuinfo_cluster), num_clusters);
-		goto cleanup;
-	}
-	uarchs = calloc(num_clusters, sizeof(struct cpuinfo_uarch_info));
-	if (uarchs == NULL) {
-		cpuinfo_log_error(
-			"failed to allocate %zu bytes for descriptions of %"PRIu32" uarchs",
-			num_clusters * sizeof(enum cpuinfo_uarch), num_clusters);
-		goto cleanup;
-	}
-	uint32_t cluster_idx = UINT32_MAX;
-	for (uint32_t i = 0; i < mach_topology.cores; i++) {
-		if (i == 0 || cores[i].uarch != cores[i - 1].uarch) {
-			cluster_idx++;
-			uarchs[cluster_idx] = (struct cpuinfo_uarch_info) {
-				.uarch = cores[i].uarch,
-				.processor_count = 1,
-				.core_count = 1,
-			};
-			clusters[cluster_idx] = (struct cpuinfo_cluster) {
-				.processor_start = i * threads_per_core,
-				.processor_count = 1,
-				.core_start = i,
-				.core_count = 1,
-				.cluster_id = cluster_idx,
-				.package = cores[i].package,
-				.vendor = cores[i].vendor,
-				.uarch = cores[i].uarch,
-			};
-		}
-	}
+static char * alloc_sysctl_perflevel_string(uint32_t perflevel, const char * const perflevel_suffix) {
+	char * ret = NULL;
+	int err = asprintf(&ret, "hw.perflevel%u.%s", perflevel, perflevel_suffix);
+	if (err == -1 || ret == NULL) {
+		cpuinfo_log_error("failed to allocate memory for hw.perflevel* string");
+		return NULL;
+	}
+
+	return ret;
+}
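
asprintf(3), available on macOS without feature-test macros, sizes and allocates the output buffer in one step; the caller frees. An illustrative use with hypothetical values, not part of the patch:

	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	int main(void) {
		for (uint32_t level = 0; level < 2; level++) {
			char *name = NULL;
			if (asprintf(&name, "hw.perflevel%u.%s", level, "cpusperl2") == -1) {
				return 1;
			}
			puts(name); /* hw.perflevel0.cpusperl2, then hw.perflevel1.cpusperl2 */
			free(name);
		}
		return 0;
	}
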
+ */ + char * sysctl_physicalcpu = NULL; + char * sysctl_physicalcpu_max = NULL; + char * sysctl_logicalcpu = NULL; + char * sysctl_logicalcpu_max = NULL; + char * sysctl_l1icachesize = NULL; + char * sysctl_l1dcachesize = NULL; + char * sysctl_l2cachesize = NULL; + char * sysctl_cpusperl2 = NULL; + char * sysctl_l3cachesize = NULL; + char * sysctl_cpusperl3 = NULL; + + uint32_t core_index = 0; + uint32_t processor_index = 0; + + bool success = true; + uint32_t i = 0; + for (; icount = mach_topology.threads / threads_per_l1; + l1d->count = l1i->count; + cpuinfo_log_debug("detected %"PRIu32" L1 caches", l1i->count); } - uint32_t threads_per_l2 = 0, l2_count = 0; + uint32_t threads_per_l2 = 0; if (l2_cache_size != 0) { /* Assume L2 cache is shared between all cores */ threads_per_l2 = mach_topology.cores; - l2_count = 1; - cpuinfo_log_debug("detected %"PRIu32" L2 caches", l2_count); + l2->count = 1; + cpuinfo_log_debug("detected %"PRIu32" L2 caches", l2->count); } - uint32_t threads_per_l3 = 0, l3_count = 0; + uint32_t threads_per_l3 = 0; if (l3_cache_size != 0) { /* Assume L3 cache is shared between all cores */ threads_per_l3 = mach_topology.cores; - l3_count = 1; - cpuinfo_log_debug("detected %"PRIu32" L3 caches", l3_count); + l3->count = 1; + cpuinfo_log_debug("detected %"PRIu32" L3 caches", l3->count); } if (l1i_cache_size != 0) { - l1i = calloc(l1_count, sizeof(struct cpuinfo_cache)); - if (l1i == NULL) { + l1i->caches = calloc(l1i->count, sizeof(*(l1i->caches))); + if (l1i->caches == NULL) { cpuinfo_log_error("failed to allocate %zu bytes for descriptions of %"PRIu32" L1I caches", - l1_count * sizeof(struct cpuinfo_cache), l1_count); - goto cleanup; + l1i->count * sizeof(*(l1i->caches)), l1i->count); + return false; } - for (uint32_t c = 0; c < l1_count; c++) { - l1i[c] = (struct cpuinfo_cache) { + for (uint32_t c = 0; c < l1i->count; c++) { + l1i->caches[c] = (struct cpuinfo_cache) { .size = l1i_cache_size, .associativity = l1_cache_associativity, .sets = l1i_cache_size / (l1_cache_associativity * cacheline_size), @@ -496,20 +617,17 @@ void cpuinfo_arm_mach_init(void) { .processor_count = threads_per_l1, }; } - for (uint32_t t = 0; t < mach_topology.threads; t++) { - processors[t].cache.l1i = &l1i[t / threads_per_l1]; - } } if (l1d_cache_size != 0) { - l1d = calloc(l1_count, sizeof(struct cpuinfo_cache)); - if (l1d == NULL) { + l1d->caches = calloc(l1d->count, sizeof(*(l1d->caches))); + if (l1d->caches == NULL) { cpuinfo_log_error("failed to allocate %zu bytes for descriptions of %"PRIu32" L1D caches", - l1_count * sizeof(struct cpuinfo_cache), l1_count); - goto cleanup; + l1d->count * sizeof(*(l1d->caches)), l1d->count); + return false; } - for (uint32_t c = 0; c < l1_count; c++) { - l1d[c] = (struct cpuinfo_cache) { + for (uint32_t c = 0; c < l1d->count; c++) { + l1d->caches[c] = (struct cpuinfo_cache) { .size = l1d_cache_size, .associativity = l1_cache_associativity, .sets = l1d_cache_size / (l1_cache_associativity * cacheline_size), @@ -520,20 +638,17 @@ void cpuinfo_arm_mach_init(void) { .processor_count = threads_per_l1, }; } - for (uint32_t t = 0; t < mach_topology.threads; t++) { - processors[t].cache.l1d = &l1d[t / threads_per_l1]; - } } - if (l2_count != 0) { - l2 = calloc(l2_count, sizeof(struct cpuinfo_cache)); - if (l2 == NULL) { + if (l2->count != 0) { + l2->caches = calloc(l2->count, sizeof(*(l2->caches))); + if (l2->caches == NULL) { cpuinfo_log_error("failed to allocate %zu bytes for descriptions of %"PRIu32" L2 caches", - l2_count * sizeof(struct cpuinfo_cache), 
+static bool detect_caches_legacy(
+	const struct cpuinfo_mach_topology mach_topology,
+	struct cache_array *l1i,
+	struct cache_array *l1d,
+	struct cache_array *l2,
+	struct cache_array *l3)
+{
+	if (!l1i || !l1d || !l2 || !l3) {
+		cpuinfo_log_error("cannot detect caches. no place to store results.");
+		return false;
+	}
+
 	const uint32_t cacheline_size = get_sys_info(HW_CACHELINE, "HW_CACHELINE");
 	const uint32_t l1d_cache_size = get_sys_info(HW_L1DCACHESIZE, "HW_L1DCACHESIZE");
 	const uint32_t l1i_cache_size = get_sys_info(HW_L1ICACHESIZE, "HW_L1ICACHESIZE");
 	const uint32_t l2_cache_size = get_sys_info(HW_L2CACHESIZE, "HW_L2CACHESIZE");
 	const uint32_t l3_cache_size = get_sys_info_by_name("hw.l3cachesize");
 	const uint32_t l1_cache_associativity = 4;
 	const uint32_t l2_cache_associativity = 8;
 	const uint32_t l3_cache_associativity = 16;
 	const uint32_t cache_partitions = 1;
 	const uint32_t cache_flags = 0;
 
-	uint32_t threads_per_l1 = 0, l1_count = 0;
+	uint32_t threads_per_l1 = 0;
 	if (l1i_cache_size != 0 || l1d_cache_size != 0) {
 		/* Assume L1 caches are private to each core */
 		threads_per_l1 = 1;
-		l1_count = mach_topology.threads / threads_per_l1;
-		cpuinfo_log_debug("detected %"PRIu32" L1 caches", l1_count);
+		l1i->count = mach_topology.threads / threads_per_l1;
+		l1d->count = l1i->count;
+		cpuinfo_log_debug("detected %"PRIu32" L1 caches", l1i->count);
 	}
 
-	uint32_t threads_per_l2 = 0, l2_count = 0;
+	uint32_t threads_per_l2 = 0;
 	if (l2_cache_size != 0) {
 		/* Assume L2 cache is shared between all cores */
 		threads_per_l2 = mach_topology.cores;
-		l2_count = 1;
-		cpuinfo_log_debug("detected %"PRIu32" L2 caches", l2_count);
+		l2->count = 1;
+		cpuinfo_log_debug("detected %"PRIu32" L2 caches", l2->count);
 	}
 
-	uint32_t threads_per_l3 = 0, l3_count = 0;
+	uint32_t threads_per_l3 = 0;
 	if (l3_cache_size != 0) {
 		/* Assume L3 cache is shared between all cores */
 		threads_per_l3 = mach_topology.cores;
-		l3_count = 1;
-		cpuinfo_log_debug("detected %"PRIu32" L3 caches", l3_count);
+		l3->count = 1;
+		cpuinfo_log_debug("detected %"PRIu32" L3 caches", l3->count);
 	}
 
 	if (l1i_cache_size != 0) {
-		l1i = calloc(l1_count, sizeof(struct cpuinfo_cache));
-		if (l1i == NULL) {
+		l1i->caches = calloc(l1i->count, sizeof(*(l1i->caches)));
+		if (l1i->caches == NULL) {
 			cpuinfo_log_error("failed to allocate %zu bytes for descriptions of %"PRIu32" L1I caches",
-				l1_count * sizeof(struct cpuinfo_cache), l1_count);
-			goto cleanup;
+				l1i->count * sizeof(*(l1i->caches)), l1i->count);
+			return false;
 		}
-		for (uint32_t c = 0; c < l1_count; c++) {
-			l1i[c] = (struct cpuinfo_cache) {
+		for (uint32_t c = 0; c < l1i->count; c++) {
+			l1i->caches[c] = (struct cpuinfo_cache) {
 				.size = l1i_cache_size,
 				.associativity = l1_cache_associativity,
 				.sets = l1i_cache_size / (l1_cache_associativity * cacheline_size),
@@ -496,20 +617,17 @@ void cpuinfo_arm_mach_init(void) {
 				.processor_count = threads_per_l1,
 			};
 		}
-		for (uint32_t t = 0; t < mach_topology.threads; t++) {
-			processors[t].cache.l1i = &l1i[t / threads_per_l1];
-		}
 	}
 
 	if (l1d_cache_size != 0) {
-		l1d = calloc(l1_count, sizeof(struct cpuinfo_cache));
-		if (l1d == NULL) {
+		l1d->caches = calloc(l1d->count, sizeof(*(l1d->caches)));
+		if (l1d->caches == NULL) {
 			cpuinfo_log_error("failed to allocate %zu bytes for descriptions of %"PRIu32" L1D caches",
-				l1_count * sizeof(struct cpuinfo_cache), l1_count);
-			goto cleanup;
+				l1d->count * sizeof(*(l1d->caches)), l1d->count);
+			return false;
 		}
-		for (uint32_t c = 0; c < l1_count; c++) {
-			l1d[c] = (struct cpuinfo_cache) {
+		for (uint32_t c = 0; c < l1d->count; c++) {
+			l1d->caches[c] = (struct cpuinfo_cache) {
 				.size = l1d_cache_size,
 				.associativity = l1_cache_associativity,
 				.sets = l1d_cache_size / (l1_cache_associativity * cacheline_size),
@@ -520,20 +638,17 @@ void cpuinfo_arm_mach_init(void) {
 				.processor_count = threads_per_l1,
 			};
 		}
-		for (uint32_t t = 0; t < mach_topology.threads; t++) {
-			processors[t].cache.l1d = &l1d[t / threads_per_l1];
-		}
 	}
 
-	if (l2_count != 0) {
-		l2 = calloc(l2_count, sizeof(struct cpuinfo_cache));
-		if (l2 == NULL) {
+	if (l2->count != 0) {
+		l2->caches = calloc(l2->count, sizeof(*(l2->caches)));
+		if (l2->caches == NULL) {
 			cpuinfo_log_error("failed to allocate %zu bytes for descriptions of %"PRIu32" L2 caches",
-				l2_count * sizeof(struct cpuinfo_cache), l2_count);
-			goto cleanup;
+				l2->count * sizeof(*(l2->caches)), l2->count);
+			return false;
 		}
-		for (uint32_t c = 0; c < l2_count; c++) {
-			l2[c] = (struct cpuinfo_cache) {
+		for (uint32_t c = 0; c < l2->count; c++) {
+			l2->caches[c] = (struct cpuinfo_cache) {
 				.size = l2_cache_size,
 				.associativity = l2_cache_associativity,
 				.sets = l2_cache_size / (l2_cache_associativity * cacheline_size),
@@ -544,20 +659,17 @@ void cpuinfo_arm_mach_init(void) {
 				.processor_count = threads_per_l2,
 			};
 		}
-		for (uint32_t t = 0; t < mach_topology.threads; t++) {
-			processors[t].cache.l2 = &l2[0];
-		}
 	}
 
-	if (l3_count != 0) {
-		l3 = calloc(l3_count, sizeof(struct cpuinfo_cache));
-		if (l3 == NULL) {
+	if (l3->count != 0) {
+		l3->caches = calloc(l3->count, sizeof(*(l3->caches)));
+		if (l3->caches == NULL) {
 			cpuinfo_log_error("failed to allocate %zu bytes for descriptions of %"PRIu32" L3 caches",
-				l3_count * sizeof(struct cpuinfo_cache), l3_count);
-			goto cleanup;
+				l3->count * sizeof(*(l3->caches)), l3->count);
+			return false;
 		}
-		for (uint32_t c = 0; c < l3_count; c++) {
-			l3[c] = (struct cpuinfo_cache) {
+		for (uint32_t c = 0; c < l3->count; c++) {
+			l3->caches[c] = (struct cpuinfo_cache) {
 				.size = l3_cache_size,
 				.associativity = l3_cache_associativity,
 				.sets = l3_cache_size / (l3_cache_associativity * cacheline_size),
@@ -568,31 +680,468 @@ void cpuinfo_arm_mach_init(void) {
 				.processor_count = threads_per_l3,
 			};
 		}
-		for (uint32_t t = 0; t < mach_topology.threads; t++) {
-			processors[t].cache.l3 = &l3[0];
-		}
 	}
+
+	return true;
+}
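
A quick sanity check of the sets arithmetic used in both detection paths, size / (associativity * line_size), with hypothetical numbers (64 KiB, 4-way, 64-byte lines; the associativity constant matches the one hardcoded above):

	#include <assert.h>
	#include <stdint.h>

	int main(void) {
		const uint32_t size = 64 * 1024;  /* hypothetical L1D size in bytes */
		const uint32_t associativity = 4; /* l1_cache_associativity in the patch */
		const uint32_t line_size = 64;    /* hypothetical HW_CACHELINE value */
		const uint32_t sets = size / (associativity * line_size);
		assert(sets == 256);
		/* with partitions == 1, the fields multiply back to the total size */
		assert(sets * associativity * line_size == size);
		return 0;
	}
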
+ */ + const uint32_t l1_cache_associativity = 4; + const uint32_t l2_cache_associativity = 8; + const uint32_t l3_cache_associativity = 16; + const uint32_t cache_partitions = 1; + const uint32_t cache_flags = 0; + + l1i->count = 0; + l1d->count = 0; + l2->count = 0; + l3->count = 0; + for (uint32_t pl=0; plcount += perflevels[pl].physicalcpu_max; + } + + if (perflevels[pl].l1dcachesize != 0) { + /* One l1d cache per core */ + l1d->count += perflevels[pl].physicalcpu_max; + } + + if (perflevels[pl].cpusperl2 != 0) { + l2->count += perflevels[pl].physicalcpu_max / perflevels[pl].cpusperl2; + } + + if (perflevels[pl].cpusperl3 != 0) { + l3->count += perflevels[pl].physicalcpu_max / perflevels[pl].cpusperl3; + } + } + + if (l1i->count != 0) { + l1i->caches = calloc(l1i->count, sizeof(*(l1i->caches))); + if (l1i->caches == NULL) { + cpuinfo_log_error("failed to allocate %zu bytes for descriptions of %"PRIu32" L1I caches", + l1i->count * sizeof(*(l1i->caches)), l1i->count); + return false; + } + for (uint32_t pl=0; plcaches[c] = (struct cpuinfo_cache) { + .size = perflevels[pl].l1icachesize, + .associativity = l1_cache_associativity, + .sets = perflevels[pl].l1icachesize / (l1_cache_associativity * cacheline_size), + .partitions = cache_partitions, + .line_size = cacheline_size, + .flags = cache_flags, + .processor_start = c * threads_per_l1, + .processor_count = threads_per_l1, + }; + } + } + } + } + + if (l1d->count != 0) { + l1d->caches = calloc(l1d->count, sizeof(*(l1d->caches))); + if (l1d->caches == NULL) { + cpuinfo_log_error("failed to allocate %zu bytes for descriptions of %"PRIu32" L1D caches", + l1d->count * sizeof(*(l1d->caches)), l1d->count); + return false; + } + for (uint32_t pl=0; plcaches[c] = (struct cpuinfo_cache) { + .size = perflevels[pl].l1dcachesize, + .associativity = l1_cache_associativity, + .sets = perflevels[pl].l1dcachesize / (l1_cache_associativity * cacheline_size), + .partitions = cache_partitions, + .line_size = cacheline_size, + .flags = cache_flags, + .processor_start = c * threads_per_l1, + .processor_count = threads_per_l1, + }; + } + } + } + } + + if (l2->count != 0) { + l2->caches = calloc(l2->count, sizeof(*(l2->caches))); + if (l2->caches == NULL) { + cpuinfo_log_error("failed to allocate %zu bytes for descriptions of %"PRIu32" L2 caches", + l2->count * sizeof(*(l2->caches)), l2->count); + return false; + } + uint32_t cache_index = 0; + for (uint32_t pl=0; plcaches[cache_index] = (struct cpuinfo_cache) { + .size = perflevels[pl].l2cachesize, + .associativity = l2_cache_associativity, + .sets = perflevels[pl].l2cachesize / (l2_cache_associativity * cacheline_size), + .partitions = cache_partitions, + .line_size = cacheline_size, + .flags = cache_flags, + .processor_start = cache_index * threads_per_l2, + .processor_count = threads_per_l2, + }; + } + } + } + } + + if (l3->count != 0) { + l3->caches = calloc(l3->count, sizeof(*(l3->caches))); + if (l3->caches == NULL) { + cpuinfo_log_error("failed to allocate %zu bytes for descriptions of %"PRIu32" L3 caches", + l3->count * sizeof(*(l3->caches)), l3->count); + return false; + } + uint32_t cache_index = 0; + for (uint32_t pl=0; plcaches[cache_index] = (struct cpuinfo_cache) { + .size = perflevels[pl].l3cachesize, + .associativity = l3_cache_associativity, + .sets = perflevels[pl].l3cachesize / (l3_cache_associativity * cacheline_size), + .partitions = cache_partitions, + .line_size = cacheline_size, + .flags = cache_flags, + .processor_start = cache_index * threads_per_l3, + .processor_count = 
+void cpuinfo_arm_mach_init(void) {
+	const uint32_t cpu_family = get_sys_info_by_name("hw.cpufamily");
+	const uint32_t cpu_type = get_sys_info_by_name("hw.cputype");
+	const uint32_t cpu_subtype = get_sys_info_by_name("hw.cpusubtype");
+
+	detect_isa(cpu_family, cpu_type, cpu_subtype);
+
+	struct cpuinfo_processor* processors = NULL;
+	struct cpuinfo_core* cores = NULL;
+	struct cpuinfo_cluster* clusters = NULL;
+	struct cpuinfo_package* packages = NULL;
+	struct cpuinfo_uarch_info* uarchs = NULL;
+	struct cache_array l1i = {0};
+	struct cache_array l1d = {0};
+	struct cache_array l2 = {0};
+	struct cache_array l3 = {0};
+
+	struct cpuinfo_mach_topology mach_topology = cpuinfo_mach_detect_topology();
+
+	/*
+	 * iOS 15 and macOS Monterey 12 added sysctls for specifying different
+	 * performance levels. Probe `hw.nperflevels` to see if they're present.
+	 * If so, read and validate them.
+	 */
+	struct mach_perflevel * perflevels = NULL;
+	const uint32_t nperflevels = get_sys_info_by_name("hw.nperflevels");
+	if (nperflevels > 1) {
+		perflevels = read_perflevels(nperflevels);
+
+		if (!perflevels) {
+			cpuinfo_log_error("failed to initialize perflevels");
+			goto cleanup;
+		}
+
+		/* Double-check topology counts */
+		uint32_t cores = 0;
+		uint32_t threads = 0;
+		for (uint32_t i = 0; i < nperflevels; i++) {
+			cores += perflevels[i].physicalcpu_max;
+			threads += perflevels[i].logicalcpu_max;
+		}
+		if (cores != mach_topology.cores || threads != mach_topology.threads) {
+			cpuinfo_log_error("perflevel core/thread counts do not match mach topology");
+			goto cleanup;
+		}
+	}
+
+	processors = calloc(mach_topology.threads, sizeof(struct cpuinfo_processor));
+	if (processors == NULL) {
+		cpuinfo_log_error("failed to allocate %zu bytes for descriptions of %"PRIu32" logical processors",
+			mach_topology.threads * sizeof(struct cpuinfo_processor), mach_topology.threads);
+		goto cleanup;
+	}
+	cores = calloc(mach_topology.cores, sizeof(struct cpuinfo_core));
+	if (cores == NULL) {
+		cpuinfo_log_error("failed to allocate %zu bytes for descriptions of %"PRIu32" cores",
+			mach_topology.cores * sizeof(struct cpuinfo_core), mach_topology.cores);
+		goto cleanup;
+	}
+	packages = calloc(mach_topology.packages, sizeof(struct cpuinfo_package));
+	if (packages == NULL) {
+		cpuinfo_log_error("failed to allocate %zu bytes for descriptions of %"PRIu32" packages",
+			mach_topology.packages * sizeof(struct cpuinfo_package), mach_topology.packages);
+		goto cleanup;
+	}
+
+	const uint32_t threads_per_core = mach_topology.threads / mach_topology.cores;
+	const uint32_t threads_per_package = mach_topology.threads / mach_topology.packages;
+	const uint32_t cores_per_package = mach_topology.cores / mach_topology.packages;
+
+	for (uint32_t i = 0; i < mach_topology.packages; i++) {
+		packages[i] = (struct cpuinfo_package) {
+			.processor_start = i * threads_per_package,
+			.processor_count = threads_per_package,
+			.core_start = i * cores_per_package,
+			.core_count = cores_per_package,
+		};
+		read_package_name(packages[i].name);
+	}
+
+	uint32_t num_clusters = 1;
+	for (uint32_t i = 0; i < mach_topology.cores; i++) {
+		cores[i] = (struct cpuinfo_core) {
+			.processor_start = i * threads_per_core,
+			.processor_count = threads_per_core,
+			.core_id = i % cores_per_package,
+			.package = packages + i / cores_per_package,
+			.vendor = cpuinfo_vendor_apple,
+			.uarch = decode_uarch(cpu_family, cpu_subtype, i, mach_topology.cores),
+		};
+		if (i != 0 && cores[i].uarch != cores[i - 1].uarch) {
+			num_clusters++;
+		}
+	}
+
+	for (uint32_t i = 0; i < mach_topology.threads; i++) {
+		const uint32_t smt_id = i % threads_per_core;
+		const uint32_t core_id = i / threads_per_core;
+		const uint32_t package_id = i / threads_per_package;
+		processors[i].smt_id = smt_id;
+		processors[i].core = &cores[core_id];
+		processors[i].package = &packages[package_id];
+	}
+
+	if (nperflevels > 0 && perflevels) {
+		if (nperflevels != num_clusters) {
+			cpuinfo_log_error("perflevel count does not match cluster count (%"PRIu32" != %"PRIu32").",
+				nperflevels, num_clusters);
+			goto cleanup;
+		}
+	}
+
+	clusters = calloc(num_clusters, sizeof(*clusters));
+	if (clusters == NULL) {
+		cpuinfo_log_error(
+			"failed to allocate %zu bytes for descriptions of %"PRIu32" clusters",
+			num_clusters * sizeof(*clusters), num_clusters);
+		goto cleanup;
+	}
+	uarchs = calloc(num_clusters, sizeof(*uarchs));
+	if (uarchs == NULL) {
+		cpuinfo_log_error(
+			"failed to allocate %zu bytes for descriptions of %"PRIu32" uarchs",
+			num_clusters * sizeof(*uarchs), num_clusters);
+		goto cleanup;
+	}
+	uint32_t cluster_idx = UINT32_MAX;
+	for (uint32_t i = 0; i < mach_topology.cores; i++) {
+		if (i == 0 || cores[i].uarch != cores[i - 1].uarch) {
+			cluster_idx++;
+			uarchs[cluster_idx] = (struct cpuinfo_uarch_info) {
+				.uarch = cores[i].uarch,
+				.processor_count = 1,
+				.core_count = 1,
+			};
+			clusters[cluster_idx] = (struct cpuinfo_cluster) {
+				.processor_start = i * threads_per_core,
+				.processor_count = 1,
+				.core_start = i,
+				.core_count = 1,
+				.cluster_id = cluster_idx,
+				.package = cores[i].package,
+				.vendor = cores[i].vendor,
+				.uarch = cores[i].uarch,
+			};
+		} else {
+			uarchs[cluster_idx].processor_count++;
+			uarchs[cluster_idx].core_count++;
+			clusters[cluster_idx].processor_count++;
+			clusters[cluster_idx].core_count++;
+		}
+		cores[i].cluster = &clusters[cluster_idx];
+	}
+
+	for (uint32_t i = 0; i < mach_topology.threads; i++) {
+		const uint32_t core_id = i / threads_per_core;
+		processors[i].cluster = cores[core_id].cluster;
+	}
+
+	for (uint32_t i = 0; i < mach_topology.packages; i++) {
+		packages[i].cluster_start = 0;
+		packages[i].cluster_count = num_clusters;
+	}
+
+	/* Detect and populate caches */
+
+	/*
+	 * Prefer perflevels to detect caches. Fall back to the legacy path on
+	 * error or if perflevels are not available.
+	 */
+ */ + bool cachesDetected = false; + if (nperflevels > 0 && perflevels) { + cachesDetected = detect_caches_using_perflevels(mach_topology, perflevels, nperflevels, &l1i, &l1d, &l2, &l3); + } + + if (!cachesDetected) { + cachesDetected = detect_caches_legacy(mach_topology, &l1i, &l1d, &l2, &l3); + if (!cachesDetected) { + goto cleanup; + } + } + + /* Associate processors with caches */ + + if (l1i.caches && l1i.count > 0) { + for (uint32_t c=0; c 0) { + for (uint32_t c=0; c 0) { + for (uint32_t c=0; c 0) { + for (uint32_t c=0; c #include +#include #include #include @@ -55,26 +56,78 @@ void report_cache( } } +void report_distinct_caches( + uint32_t count, const struct cpuinfo_cache *cache, + uint32_t level, const char *label) +{ + uint32_t similar_count = 0; + uint32_t prev = 0; + for (uint32_t i=0; i 0) { + report_cache(similar_count, &cache[prev], level, label); + } +} + +void debug_print_caches(const char *label, const struct cpuinfo_cache * const cache, uint32_t count) +{ + for (uint32_t i=0; i 1 && 0 == strcmp(argv[1], "-verbose")) { + debug_print_caches("L1I", cpuinfo_get_l1i_caches(), cpuinfo_get_l1i_caches_count()); + debug_print_caches("L1D", cpuinfo_get_l1d_caches(), cpuinfo_get_l1d_caches_count()); + debug_print_caches("L2", cpuinfo_get_l2_caches(), cpuinfo_get_l2_caches_count()); + debug_print_caches("L3", cpuinfo_get_l3_caches(), cpuinfo_get_l3_caches_count()); + debug_print_caches("L4", cpuinfo_get_l4_caches(), cpuinfo_get_l4_caches_count()); + } + printf("Max cache size (upper bound): %"PRIu32" bytes\n", cpuinfo_get_max_cache_size()); if (cpuinfo_get_l1i_caches_count() != 0 && (cpuinfo_get_l1i_cache(0)->flags & CPUINFO_CACHE_UNIFIED) == 0) { - report_cache(cpuinfo_get_l1i_caches_count(), cpuinfo_get_l1i_cache(0), 1, "instruction"); + report_distinct_caches(cpuinfo_get_l1i_caches_count(), cpuinfo_get_l1i_caches(), 1, "instruction"); } if (cpuinfo_get_l1d_caches_count() != 0) { - report_cache(cpuinfo_get_l1d_caches_count(), cpuinfo_get_l1d_cache(0), 1, "data"); + report_distinct_caches(cpuinfo_get_l1d_caches_count(), cpuinfo_get_l1d_caches(), 1, "data"); } if (cpuinfo_get_l2_caches_count() != 0) { - report_cache(cpuinfo_get_l2_caches_count(), cpuinfo_get_l2_cache(0), 2, "data"); + report_distinct_caches(cpuinfo_get_l2_caches_count(), cpuinfo_get_l2_caches(), 2, "data"); } if (cpuinfo_get_l3_caches_count() != 0) { - report_cache(cpuinfo_get_l3_caches_count(), cpuinfo_get_l3_cache(0), 3, "data"); + report_distinct_caches(cpuinfo_get_l3_caches_count(), cpuinfo_get_l3_caches(), 3, "data"); } if (cpuinfo_get_l4_caches_count() != 0) { - report_cache(cpuinfo_get_l4_caches_count(), cpuinfo_get_l4_cache(0), 4, "data"); + report_distinct_caches(cpuinfo_get_l4_caches_count(), cpuinfo_get_l4_caches(), 4, "data"); } }