Skip to content

Commit

Permalink
Merge pull request #189 from opcm/09-Mar-20-push
Browse files Browse the repository at this point in the history
09 mar 20 push
  • Loading branch information
opcm authored Mar 9, 2020
2 parents f0f245e + 308bea9 commit 4e3c199
Show file tree
Hide file tree
Showing 3 changed files with 214 additions and 492 deletions.
15 changes: 8 additions & 7 deletions cpucounters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
#include <sys/sysctl.h>
#include <sys/sem.h>

// convertUnknownToInt is used in the safe sysctl call to convert an unkown size to an int
// convertUnknownToInt is used in the safe sysctl call to convert an unknown size to an int
int convertUnknownToInt(size_t size, char* value);

#endif
Expand All @@ -110,6 +110,7 @@ bool PCM::initWinRing0Lib()

if (result == FALSE)
{
CloseHandle(hOpenLibSys);
hOpenLibSys = NULL;
return false;
}
Expand Down Expand Up @@ -949,6 +950,7 @@ bool PCM::discoverSystemTopology()
}
}

num_online_cores = num_cores;

if (num_cores != GetActiveProcessorCount(ALL_PROCESSOR_GROUPS))
{
Expand Down Expand Up @@ -1202,13 +1204,14 @@ bool PCM::discoverSystemTopology()
std::cerr << topology[i].socket << " " << topology[i].os_id << " " << topology[i].core_id << std::endl;
}
#endif
if(threads_per_core == 0)
if (threads_per_core == 0)
{
for (int i = 0; i < (int)num_cores; ++i)
{
if(topology[i].socket == topology[0].socket && topology[i].core_id == topology[0].core_id)
if (topology[i].socket == topology[0].socket && topology[i].core_id == topology[0].core_id)
++threads_per_core;
}
assert(threads_per_core != 0);
}
if(num_phys_cores_per_socket == 0 && num_cores == num_online_cores) num_phys_cores_per_socket = num_cores / num_sockets / threads_per_core;
if(num_online_cores == 0) num_online_cores = num_cores;
Expand Down Expand Up @@ -1724,8 +1727,6 @@ PCM::PCM() :
L3CacheHitsNoSnoopAvailable(false),
L3CacheHitsSnoopAvailable(false),
L3CacheHitsAvailable(false),
CyclesLostDueL3CacheMissesAvailable(false),
CyclesLostDueL2CacheMissesAvailable(false),
forceRTMAbortMode(false),
mode(INVALID_MODE),
numInstancesSemaphore(NULL),
Expand Down Expand Up @@ -3110,7 +3111,7 @@ void PCM::cleanupUncorePMUs()

void PCM::resetPMU()
{
for (int i = 0; i < (int)num_cores; ++i)
for (int i = 0; i < (int)MSR.size(); ++i)
{
// disable all counters
MSR[i]->write(IA32_CR_PERF_GLOBAL_CTRL, 0);
Expand Down Expand Up @@ -6279,7 +6280,7 @@ void PCM::programPCIeCounters(const PCM::PCIeEventCode event_, const uint32 tid_

void PCM::programCbo(const uint64 * events, const uint32 opCode, const uint32 nc_, const uint32 llc_lookup_tid_filter, const uint32 loc, const uint32 rem)
{
for (int32 i = 0; (i < num_sockets) && MSR.size(); ++i)
for (size_t i = 0; (i < cboPMUs.size()) && MSR.size(); ++i)
{
uint32 refCore = socketRefCore[i];
TemporalThreadAffinity tempThreadAffinity(refCore); // speedup trick for Linux
Expand Down
107 changes: 8 additions & 99 deletions cpucounters.h
Original file line number Diff line number Diff line change
Expand Up @@ -605,8 +605,6 @@ class PCM_API PCM
bool L3CacheHitsNoSnoopAvailable;
bool L3CacheHitsSnoopAvailable;
bool L3CacheHitsAvailable;
bool CyclesLostDueL3CacheMissesAvailable;
bool CyclesLostDueL2CacheMissesAvailable;

bool forceRTMAbortMode;

Expand Down Expand Up @@ -1840,8 +1838,6 @@ class PCM_API PCM
PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L3CacheHitsNoSnoopAvailable)
PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L3CacheHitsSnoopAvailable)
PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L3CacheHitsAvailable)
PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(CyclesLostDueL3CacheMissesAvailable) // deprecated
PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(CyclesLostDueL2CacheMissesAvailable) // deprecated

#undef PCM_GEN_METRIC_AVAILABLE_FUNCTION

Expand Down Expand Up @@ -1888,10 +1884,6 @@ class BasicCounterState
template <class CounterStateType>
friend uint64 getL3CacheHits(const CounterStateType & before, const CounterStateType & after);
template <class CounterStateType>
friend double getCyclesLostDueL3CacheMisses(const CounterStateType & before, const CounterStateType & after);
template <class CounterStateType>
friend double getCyclesLostDueL2CacheMisses(const CounterStateType & before, const CounterStateType & after);
template <class CounterStateType>
friend uint64 getL3CacheOccupancy(const CounterStateType & now);
template <class CounterStateType>
friend uint64 getLocalMemoryBW(const CounterStateType & before, const CounterStateType & after);
Expand Down Expand Up @@ -2710,50 +2702,6 @@ double getActiveRelativeFrequency(const CounterStateType & before, const Counter
return -1;
}

/*! \brief Estimates how many core cycles were potentially lost due to L3 cache misses.
\param before CPU counter state before the experiment
\param after CPU counter state after the experiment
\warning Works only in the DEFAULT_EVENTS programming mode (see program() method)
\return ratio that is usually between 0 and 1 ; in some cases could be >1.0 due to a lower memory latency estimation;
-1 if PCM reports an Atom or KNL CPU, or if the unhalted core clock count did not change between the two states
*/
template <class CounterStateType>
double getCyclesLostDueL3CacheMisses(const CounterStateType & before, const CounterStateType & after) // 0.0 - 1.0
{
auto pcm = PCM::getInstance();
const int cpu_model = pcm->getCPUModel();
// The estimate is not produced on Atom and KNL: report -1 instead.
if (pcm->isAtom() || cpu_model == PCM::KNL) return -1;
int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
if (clocks != 0)
{
// Each L3 miss in the interval is weighted by the constant 180 and the sum is
// normalized by the elapsed unhalted core clocks.
// NOTE(review): 180 is presumably an assumed memory latency in core cycles - confirm.
return 180. * double(after.L3Miss - before.L3Miss) / double(clocks);
}
// No clock ticks elapsed between the two states: the ratio cannot be computed.
return -1;
}

/*! \brief Estimates how many core cycles were potentially lost due to missing L2 cache but still hitting L3 cache
\param before CPU counter state before the experiment
\param after CPU counter state after the experiment
\warning Works only in the DEFAULT_EVENTS programming mode (see program() method)
\warning Currently not supported on Intel(R) Atom(tm) processor
\return ratio that is usually between 0 and 1 ; in some cases could be >1.0 due to a lower access latency estimation;
-1 if PCM reports an Atom or KNL CPU, if Skylake events are in use, or if the unhalted core clock count
did not change between the two states
*/
template <class CounterStateType>
double getCyclesLostDueL2CacheMisses(const CounterStateType & before, const CounterStateType & after) // 0.0 - 1.0
{
auto pcm = PCM::getInstance();
// The estimate is not produced on Atom, KNL, or when Skylake events are in use: report -1 instead.
if (pcm->isAtom() || pcm->getCPUModel() == PCM::KNL || pcm->useSkylakeEvents()) return -1;
int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
if (clocks != 0)
{
double L3UnsharedHit = (double)(after.L3UnsharedHit - before.L3UnsharedHit);
double L2HitM = (double)(after.L2HitM - before.L2HitM);
// Weighted sum of the two event deltas, normalized by the elapsed unhalted core clocks.
// NOTE(review): 35 and 74 are presumably assumed per-event latencies in core cycles - confirm.
return (35. * L3UnsharedHit + 74. * L2HitM) / double(clocks);
}
// No clock ticks elapsed between the two states: the ratio cannot be computed.
return -1;
}

/*! \brief Computes L2 cache hit ratio
\param before CPU counter state before the experiment
Expand All @@ -2762,35 +2710,11 @@ double getCyclesLostDueL2CacheMisses(const CounterStateType & before, const Coun
\return value between 0 and 1
*/
template <class CounterStateType>
double getL2CacheHitRatio(const CounterStateType & before, const CounterStateType & after) // 0.0 - 1.0
double getL2CacheHitRatio(const CounterStateType& before, const CounterStateType& after) // 0.0 - 1.0
{
auto pcm = PCM::getInstance();
if (pcm->useSkylakeEvents()) {
uint64 L2Hit = after.L2Hit - before.L2Hit;
uint64 L2Ref = L2Hit + after.SKLL2Miss - before.SKLL2Miss;
if (L2Ref) {
return double(L2Hit) / double(L2Ref);
}
return 1;
}
if (pcm->isAtom() || pcm->getCPUModel() == PCM::KNL)
{
uint64 L2Miss = after.ArchLLCMiss - before.ArchLLCMiss;
uint64 L2Ref = after.ArchLLCRef - before.ArchLLCRef;
if (L2Ref) {
return 1. - (double(L2Miss) / double(L2Ref));
}
return 1;
}
uint64 L3Miss = after.L3Miss - before.L3Miss;
uint64 L3UnsharedHit = after.L3UnsharedHit - before.L3UnsharedHit;
uint64 L2HitM = after.L2HitM - before.L2HitM;
uint64 L2Hit = after.L2Hit - before.L2Hit;
uint64 hits = L2Hit;
uint64 all = L2Hit + L2HitM + L3UnsharedHit + L3Miss;
if (all) return double(hits) / double(all);

return 1;
const auto hits = getL2CacheHits(before, after);
const auto misses = getL2CacheMisses(before, after);
return double(hits) / double(hits + misses);
}

/*! \brief Computes L3 cache hit ratio
Expand All @@ -2801,26 +2725,11 @@ double getL2CacheHitRatio(const CounterStateType & before, const CounterStateTyp
\return value between 0 and 1
*/
template <class CounterStateType>
double getL3CacheHitRatio(const CounterStateType & before, const CounterStateType & after) // 0.0 - 1.0
double getL3CacheHitRatio(const CounterStateType& before, const CounterStateType& after) // 0.0 - 1.0
{
if (!PCM::getInstance()->isL3CacheHitRatioAvailable()) return -1;
if (PCM::getInstance()->useSkylakeEvents()) {
uint64 L3Hit = after.SKLL3Hit - before.SKLL3Hit;
uint64 L3Ref = L3Hit + after.L3Miss - before.L3Miss;
if (L3Ref) {
return double(L3Hit) / double(L3Ref);
}
return 1;
}

uint64 L3Miss = after.L3Miss - before.L3Miss;
uint64 L3UnsharedHit = after.L3UnsharedHit - before.L3UnsharedHit;
uint64 L2HitM = after.L2HitM - before.L2HitM;
uint64 hits = L3UnsharedHit + L2HitM;
uint64 all = L2HitM + L3UnsharedHit + L3Miss;
if (all) return double(hits) / double(all);

return 1;
const auto hits = getL3CacheHits(before, after);
const auto misses = getL3CacheMisses(before, after);
return double(hits) / double(hits + misses);
}

/*! \brief Computes number of L3 cache misses
Expand Down
Loading

0 comments on commit 4e3c199

Please sign in to comment.