#include <Windows.h>
#include <intrin.h>
#include <stdio.h>
#include <stdlib.h>
/* If your compiler's inline assembler does not recognize instructions
* such as `cpuid` and `rdtsc`, define the two macros below so that the
* opcodes are emitted as raw bytes instead.
* For example, Visual C++ 5.0 needs these two macros;
* Visual Studio 2013 does not.
*/
//#define cpuid __asm __emit 0fh __asm __emit 0a2h
//#define rdtsc __asm __emit 0fh __asm __emit 031h
#pragma intrinsic(__rdtsc)
/*
 * Times one x87 floating-point divide using the time-stamp counter.
 *
 * The CPUID/RDTSC pair is first run three times back-to-back, both to warm
 * it up and to measure its own cost (`subtime`). The divide is then
 * bracketed by the same pair and the measured overhead is subtracted.
 * Only the low 32 bits of the counter (EAX) are used.
 *
 * Prints the resulting cycle count and returns 0.
 */
int main(void)
{
    int time, subtime;
    float x = 5.0f;

    __asm
    {
        // Overhead pass 1: time an empty CPUID/RDTSC bracket.
        cpuid
        rdtsc
        mov subtime, eax        // low 32 bits of the start timestamp
        // (Code to be measured would go here.)
        cpuid
        rdtsc
        sub eax, subtime
        mov subtime, eax        // elapsed cycles for the empty bracket

        // Overhead pass 2.
        cpuid
        rdtsc
        mov subtime, eax
        cpuid
        rdtsc
        sub eax, subtime
        mov subtime, eax

        // Overhead pass 3. Only this last value of subtime is kept; it
        // represents the cost of the MOV, CPUID and RDTSC instructions.
        cpuid
        rdtsc
        mov subtime, eax
        cpuid
        rdtsc
        sub eax, subtime
        mov subtime, eax

        // Load both divide operands onto the FPU stack before timing starts.
        fld x
        fld x

        cpuid                   // serialize execution
        rdtsc                   // read the time stamp into EDX:EAX
        mov time, eax
        fdiv                    // the operation being measured
        cpuid                   // serialize execution
        rdtsc
        sub eax, time           // elapsed cycles, still only in EAX
        mov time, eax           // store it; otherwise `time` keeps the
                                // start timestamp and the result is lost

        // Discard the quotient so the x87 register stack is left empty
        // (the no-operand fdiv is expected to pop only one of the two
        // loaded operands).
        fstp st(0)
    }

    time = time - subtime;      // subtract the measurement overhead
    printf("%d\n", time);       // cycles attributed to the divide
    return 0;
}
/* Program 2: repeat the measurement several times to make the result more precise. */
/* (C source listing follows.) */
// This code will find an average number of cycles taken
// to go through a loop. There is no cache warming, so
// all cache effects are included in the cycle count.
// To use this in your own code, simply paste in the six
// marked sections into the designated locations in your code.
#include <stdio.h>
#include <Windows.h>
#include <intrin.h>
// If you use the Visual C++5.0 to compile your code, add these macros.
/* BEGIN SECTION 1 */
//#define CPUID __asm __emit 0fh __asm __emit 0a2h
//#define RDTSC __asm __emit 0fh __asm __emit 031h
/* END SECTION 1 */
// Number of timed passes made by the measurement loop in main().
#define SIZE 5
/* BEGIN SECTION 2 */
// Measures and returns the cycle overhead of one empty timing bracket;
// defined in section 6.
unsigned FindBase(); // Function Declaration.
/* END SECTION 2 */
/*
 * Measures the average number of cycles taken by the user code in the loop.
 *
 * Each of the SIZE passes reads the 64-bit time-stamp counter before and
 * after the user code, subtracts the bracket overhead returned by
 * FindBase(), and accumulates the difference. The average cycle count and
 * the average time in seconds are printed afterwards.
 *
 * argc and argv are unused. Returns 0.
 */
int main(int argc, char *argv[])
{
    int i;
    /* BEGIN SECTION 3 */
    unsigned int base = 0, iterations = 0;
    unsigned int cycles_high1 = 0, cycles_low1 = 0;
    unsigned int cycles_high2 = 0, cycles_low2 = 0;
    unsigned __int64 temp_cycles1 = 0;
    unsigned __int64 temp_cycles2 = 0;
    // Stored signed so it can be converted to a double for viewing.
    __int64 total_cycles = 0;
    double seconds = 0.0;
    // CPU clock frequency in Hz; set this to your own CPU's specification.
    // Held as a double because the value does not fit in a 32-bit
    // unsigned int and would be silently truncated there.
    double cpu_hz = 8100000000.0;

    base = FindBase();
    /* END SECTION 3 */

    for (i = 0; i < SIZE; i++)
    {
        /* BEGIN SECTION 4 */
        __asm
        {
            pushad                  // save all general-purpose registers
            cpuid                   // serialize execution
            rdtsc                   // time stamp -> EDX:EAX
            mov cycles_high1, edx
            mov cycles_low1, eax
            popad                   // restore the registers
        }
        /* END SECTION 4 */

        // TODO: User code to be measured goes in this section.
        Sleep(3000); // Sleep 3 seconds.

        /* BEGIN SECTION 5 */
        __asm
        {
            pushad
            cpuid                   // serialize execution
            rdtsc
            mov cycles_high2, edx
            mov cycles_low2, eax
            popad
        }

        // Combine the high and low halves into 64-bit cycle counts.
        temp_cycles1 = ((unsigned __int64)cycles_high1 << 32) | cycles_low1;
        temp_cycles2 = ((unsigned __int64)cycles_high2 << 32) | cycles_low2;

        // Add this pass, minus the bracket overhead, to the running total.
        total_cycles += temp_cycles2 - temp_cycles1 - base;
        iterations++;
        /* END SECTION 5 */
    }

    // The total cycle count and iteration count are now available to be
    // used as desired. Example:
    seconds = (double)total_cycles / cpu_hz;
    // Convert before dividing so the average keeps its fractional part.
    printf("Average cycles per loop: %f\n",
           (double)total_cycles / (double)iterations);
    printf("Average seconds per loop: %f\n",
           seconds / iterations);
    return 0;
}
/* BEGIN SECTION 6 */
// Measures the overhead of one empty timing bracket (the instructions
// between two consecutive RDTSC reads) and returns it in cycles.
// Only the low 32 bits of the time-stamp counter take part in the
// subtraction. Two measurements are taken and the smaller one is returned.
unsigned int FindBase()
{
unsigned int base, base_extra = 0;
// cycles_high is written by the asm below but never read in this function.
// NOTE(review): the stores look intentional, so that the bracket timed here
// contains the same instructions as the one in main() -- confirm.
unsigned int cycles_low, cycles_high;
// The following test runs the basic cycle counter to
// find the overhead associated with each cycle
// measurement. It is run multiple times simply
// because the first call to CPUID normally takes
// longer than subsequent calls.
// Typically after the second run the results are
// consistent. It is run three times just to make sure.
__asm
{
// Warm-up pass 1: start read, then an end read whose result is discarded.
pushad
cpuid
rdtsc
mov cycles_high, edx
mov cycles_low, eax
popad
pushad
cpuid
rdtsc
popad
// Warm-up pass 2: same pattern, result discarded again.
pushad
cpuid
rdtsc
mov cycles_high, edx
mov cycles_low, eax
popad
pushad
cpuid
rdtsc
popad
// Measurement 1: start read ...
pushad
cpuid
rdtsc
// cycles_high and cycles_low are overwritten on every pass;
// only the most recent start value is used.
mov cycles_high, edx
mov cycles_low, eax
popad
// ... end read; base_extra = end - start (low 32 bits).
pushad
cpuid
rdtsc
sub eax, cycles_low
mov base_extra, eax // Computation.
popad
// Measurement 2: start read ...
pushad
cpuid
rdtsc
mov cycles_high, edx
mov cycles_low, eax
popad
// ... end read; base = end - start (low 32 bits).
pushad
cpuid
rdtsc
sub eax, cycles_low
mov base, eax // Computation, again.
popad
}
// The following provides insurance for the above code,
// in the instance the final test causes a miss to
// the instruction cache: keep the smaller of the two measurements.
if (base_extra < base)
{
base = base_extra;
}
return base;
}
/* END SECTION 6 */
/* Program 3: using a small section of code and repeated measurements to eliminate cache effects. */
/* (C source listing follows.) */
// Code for testing a small, stand-alone section of code
// for repeatable results.
#include <stdio.h>
#include <Windows.h>
#include <intrin.h>
// If you compile with Visual C++ 5.0, add these two macros.
//#define CPUID __asm __emit 0fh __asm __emit 0a2h
//#define RDTSC __asm __emit 0fh __asm __emit 031h
// Runs the code under test once inside a CPUID/RDTSC bracket and returns
// the elapsed cycle count (low 32 bits of the counter difference).
unsigned int TestFunc();
/*
 * Measures the overhead of an empty CPUID/RDTSC bracket, then calls
 * TestFunc() five times and prints each result with the overhead
 * subtracted.
 *
 * argc and argv are unused. Returns 0.
 */
int main(int argc, char *argv[])
{
    enum { RUNS = 5 };
    unsigned int base = 0;
    unsigned int cyc;
    unsigned int cycles[RUNS];
    int i;

    // The following runs the basic cycle counter to find the overhead
    // associated with each cycle measurement. It is run multiple times
    // because the first call to CPUID normally takes longer than
    // subsequent calls; typically the results are consistent after the
    // second run. It is run three times just to make sure, and only the
    // last value of `base` is kept.
    // NOTE(review): unlike FindBase-style code that wraps each read in
    // pushad/popad, this block does not save registers around cpuid --
    // confirm the compiler accounts for the registers cpuid overwrites.
    __asm
    {
        cpuid
        rdtsc
        mov cyc, eax
        cpuid
        rdtsc
        sub eax, cyc
        mov base, eax

        cpuid
        rdtsc
        mov cyc, eax
        cpuid
        rdtsc
        sub eax, cyc
        mov base, eax

        cpuid
        rdtsc
        mov cyc, eax
        cpuid
        rdtsc
        sub eax, cyc
        mov base, eax
    }

    // Call the function that contains the user's code several times to
    // eliminate instruction cache effects and get repeatable results.
    // By the second or third run both data and instruction cache effects
    // should have been eliminated, and the results will be consistent.
    for (i = 0; i < RUNS; i++)
    {
        cycles[i] = TestFunc();
    }

    printf("Base : %u\n", base);
    printf("Cycle counts:\n");
    for (i = 0; i < RUNS; i++)
    {
        // A run shorter than the measured overhead wraps in unsigned
        // arithmetic; convert explicitly so it prints as a small negative
        // number and the argument type matches %d.
        printf("%d\n", (int)(cycles[i] - base));
    }
    return 0;
}
/*
 * Times a short floating-point sequence (one add, one multiply, one
 * divide) with a CPUID/RDTSC bracket.
 *
 * Returns the elapsed cycle count as the difference of the low 32 bits of
 * the time-stamp counter. The bracket's own overhead is included; the
 * caller is expected to subtract it.
 */
unsigned int TestFunc()
{
    // TEST CODE: replace the operands and the "process" section below
    // with whatever you want to measure.
    //
    // The operands are volatile so the compiler must actually perform the
    // loads, arithmetic and stores between the two time-stamp reads. With
    // plain locals the expressions use known constants and the result is
    // never read, so an optimizing build could fold or delete them.
    volatile float z, q, x, y;
    volatile float result;
    unsigned cycles;

    // Assign the values here rather than at the declarations so the data
    // has been touched (and is in the cache) before timing starts.
    cycles = 0;
    result = 0.0f;
    x = 2.0f;
    y = 100.0f;
    z = 12.0f;
    q = 5.0f;

    __asm
    {
        cpuid               // serialize execution
        rdtsc               // time stamp -> EDX:EAX
        mov cycles, eax     // keep the low 32 bits of the start value
    }

    // process
    z += y;
    q *= x;
    result = z / q;

    __asm
    {
        cpuid               // serialize execution
        rdtsc
        sub eax, cycles     // elapsed = end - start (low 32 bits)
        mov cycles, eax
    }

    return cycles;
}