/* lperfex: A workalike of the IRIX perfex(1) command for Linux systems running on Intel P6 core (PPro/PII/PIII/Celeron/Xeon) processors. Copyright (C) 1999 Ohio Supercomputer Center. This code is licensed under version 2 or later of the GNU GPL; see /usr/src/linux/COPYING for details. Author: Troy Baer Science and Technology Support Ohio Supercomputer Center troy@osc.edu http://www.osc.edu/~troy/ This code relies on Erik Hendriks' perf patch and library v0.7 for the Linux kernel. The patch is available from: ftp://beowulf.gsfc.nasa.gov/pub/software/perf-0.7.tar.gz To compile: gcc -o lperfex lperfex.c -I/usr/local/include -L/usr/local/lib -lperf -lm (Assuming your sysadmin has installed libperf.a and perf.h into /usr/local/lib and /usr/local/include, respectively.) Version History: v0.1 released 01 Oct 1999; first public release. v0.2 released 21 Dec 1999; bugfix release. v0.3 released 19 Jun 2000; added features. To do: * Support of more of the IRIX perfex command line options, especially counter multiplexing and counting inside multithreaded code (assuming these are even possible). * More and better report statistics */ #include #include #include #include #include #include #include #include /* P6 core countable events: 0: Memory references 1: L1 data cache lines loaded 2: L1 data cache lines loaded and modified 3: L1 data cache lines flushed 4: Weighed number of cycles spent waiting while a L1 data cache miss is resolved 5: Instruction fetches 6: L1 instruction cache misses 7: ITLB misses 8: Cycles spent waiting for instruction fetches and ITLB misses 9: Cycles spent waiting on the instruction decoder 10: L2 cache instruction fetches 11: L2 cache data loads 12: L2 cache data stores 13: L2 cache lines loaded 14: L2 cache lines flushed 15: L2 cache lines loaded and modified 16: L2 cache lines modified and flushed 17: L2 cache requests 18: L2 cache address strobes 19: Cycles spent waiting on the L2 data bus 20: Cycles spent waiting on data transfer from L2 cache to processor 21: Cycles spent while DRDY is asserted 22: Cycles spent while LOCK is asserted 23: Bus requests outstanding 24: Burst read transactions 25: Read-for-ownership transactions 26: Write-back transactions 27: Instruction fetch transactions 28: Invalidate transactions 29: Partial-write transactions 30: Partial transactions 31: I/O transactions 32: Deferred transactions 33: Burst transactions 34: Total number of transactions 35: Memory transactions 36: Bus clock cycles spent while the processor is receiving data 37: Bus clock cycles spent while the processor is driving the BNR pin 38: Bus clock cycles spent while the processor is driving the HIT pin 39: Bus clock cycles spent while the processor is driving the HITM pin 40: Cycles spent while the bus is snoop-stalled 41: Floating point operations retired (counter 0 only) 42: Floating point operations executed (counter 0 only) 43: Floating point exceptions handled by microcode (counter 1 only) 44: Multiply operations (counter 1 only) 45: Divide operations (counter 1 only) 46: Cycles spent doing division (counter 0 only) 47: Store buffer blocks 48: Store buffer drain cycles 49: Misaligned memory references 50: Instructions retired 51: uOps retired 52: Instructions decoded 53: Hardware interrupts received 54: Cycles spent while interrupts are disabled 55: Cycles spent while interrupts and disabled and pending 56: Branch instructions retired 57: Mispredicted branches retired 58: Taken branches retired 59: Taken mispredicted branches retired 60: Branch instructions decoded 61: Branches which miss the BTB 62: Bogus branches 63: BACLEAR assertions 64: Cycles spent during resource related stalls 65: Cycles spent during partial stalls 66: Segment register loads 67: Cycles during which the processor is not halted */ int event[68] = { PERF_DATA_MEM_REFS, PERF_DCU_LINES_IN, PERF_DCU_M_LINES_IN, PERF_DCU_M_LINES_OUT, PERF_DCU_MISS_STANDING, PERF_IFU_IFETCH, PERF_IFU_IFETCH_MISS, PERF_ITLB_MISS, PERF_IFU_MEM_STALL, PERF_ILD_STALL, PERF_L2_IFETCH, PERF_L2_LD, PERF_L2_ST, PERF_L2_LINES_IN, PERF_L2_LINES_OUT, PERF_L2_LINES_INM, PERF_L2_LINES_OUTM, PERF_L2_RQSTS, PERF_L2_ADS, PERF_L2_DBUS_BUSY, PERF_L2_DBUS_BUSY_RD, PERF_BUS_DRDY_CLOCKS, PERF_BUS_LOCK_CLOCKS, PERF_BUS_REQ_OUTSTANDING, PERF_BUS_TRAN_BRD, PERF_BUS_TRAN_RFO, PERF_BUS_TRANS_WB, PERF_BUS_TRAN_IFETCH, PERF_BUS_TRAN_INVAL, PERF_BUS_TRAN_PWR, PERF_BUS_TRAN_P, PERF_BUS_TRANS_IO, PERF_BUS_TRAN_DEF, PERF_BUS_TRAN_BURST, PERF_BUS_TRAN_ANY, PERF_BUS_TRAN_MEM, PERF_BUS_DATA_RCV, PERF_BUS_BNR_DRV, PERF_BUS_HIT_DRV, PERF_BUS_HITM_DRV, PERF_BUS_SNOOP_STALL, PERF_FLOPS, PERF_FP_COMP_OPS_EXE, PERF_FP_ASSIST, PERF_MUL, PERF_DIV, PERF_CYCLES_DIV_BUSY, PERF_LD_BLOCK, PERF_SB_DRAINS, PERF_MISALIGN_MEM_REF, PERF_INST_RETIRED, PERF_UOPS_RETIRED, PERF_INST_DECODER, PERF_HW_INT_RX, PERF_CYCLES_INST_MASKED, PERF_CYCLES_INT_PENDING_AND_MASKED, PERF_BR_INST_RETIRED, PERF_BR_MISS_PRED_RETIRED, PERF_BR_TAKEN_RETIRED, PERF_BR_MISS_PRED_TAKEN_RET, PERF_BR_INST_DECODED, PERF_BR_BTB_MISSES, PERF_BR_BOGUS, PERF_BACLEARS, PERF_RESOURCE_STALLS, PERF_PARTIAL_RAT_STALLS, PERF_SEGMENT_REG_LOADS, PERF_CPU_CLK_UNHALTED }; char label[68][81] = { "Memory references", "L1 data cache lines loaded", "L1 data cache lines loaded and modified", "L1 data cache lines flushed", "Weighed number of cycles spent waiting while a L1 data cache miss is resolved", "Instruction fetches", "L1 instruction cache misses", "ITLB misses", "Cycles spent waiting for instruction fetches and ITLB misses", "Cycles spent waiting on the instruction decoder", "L2 cache instruction fetches", "L2 cache data loads", "L2 cache data stores", "L2 cache lines loaded", "L2 cache lines flushed", "L2 cache lines loaded and modified", "L2 cache lines modified and flushed", "L2 cache requests", "L2 cache address strobes", "Cycles spent waiting on the L2 data bus", "Cycles spent waiting on data transfer from L2 cache to processor", "Cycles spent while DRDY is asserted", "Cycles spent while LOCK is asserted", "Bus requests outstanding", "Burst read transactions", "Read-for-ownership transactions", "Write-back transactions", "Instruction fetch transactions", "Invalidate transactions", "Partial-write transactions", "Partial transactions", "I/O transactions", "Deferred transactions", "Burst transactions", "Total number of transactions", "Memory transactions", "Bus clock cycles spent while the processor is receiving data", "Bus clock cycles spent while the processor is driving the BNR pin", "Bus clock cycles spent while the processor is driving the HIT pin", "Bus clock cycles spent while the processor is driving the HITM pin", "Cycles spent while the bus is snoop-stalled", "Floating point operations retired (counter 0 only)", "Floating point operations executed (counter 0 only)", "Floating point exceptions handled by microcode (counter 1 only)", "Multiply operations (counter 1 only)", "Divide operations (counter 1 only)", "Cycles spent doing division (counter 0 only)", "Store buffer blocks", "Store buffer drain cycles", "Misaligned memory references", "Instructions retired", "uOps retired", "Instructions decoded", "Hardware interrupts received", "Cycles spent while interrupts are disabled", "Cycles spent while interrupts and disabled and pending", "Branch instructions retired", "Mispredicted branches retired", "Taken branches retired", "Taken mispredicted branches retired", "Branch instructions decoded", "Branches which miss the BTB", "Bogus branches", "BACLEAR assertions", "Cycles spent during resource related stalls", "Cycles spent during partial stalls", "Segment register loads", "Cycles during which the processor is not halted" }; #define TIMEVAL_TO_DOUBLE(x) ((double)((x).tv_sec)+0.000001*(double)((x).tv_usec)) /* The following assumes a 550 MHz clock. Compile with -DCLOCKSPEED=xxx to override. */ #define CLOCKSPEED 550 int main(int argc, char *argv[], char *arge[]) { int event0=-1, event1=-1, mplex=0, mkrpt=0, i, status; FILE *extfile=stdout; double telapsed,cycles; pid_t child; struct rusage ru; unsigned long long counter[PERF_COUNTERS]; extern char *optarg; extern int optind; char hostname[80]; /* Parse command line options */ gethostname(&hostname,80); extfile=stdout; while ((i=getopt(argc,argv,"ae:kmpo:sxy"))!=-1) { switch(i) { case 'a': mplex=1; break; case 'e': mplex=0; if (event0==-1) { sscanf(optarg,"%d",&event0); } else { sscanf(optarg,"%d",&event1); } break; case 'k': fprintf(stderr,"lperfex: Kernel counting not supported\n"); break; case 'm': fprintf(stderr,"lperfex: Multithreaded counting not supported\n"); break; case 'o': extfile=fopen(optarg,"w"); break; case 's': fprintf(stderr,"lperfex: Signalled counting no supported\n"); break; case 'x': fprintf(stderr,"lperfex: Exception counting no supported\n"); break; case 'y': mkrpt=1; break; case '?': case 'h': printf("Usage: lperfex [-a | -e event0 [-e event1]] [-mp | -s] [-x] [-k] [-y] [-o file] [-c file] command [args]\n"); printf("\nEvent numbers: \n"); for (i=0;i<68;i++) printf("\t%2d: %s\n",i,label[i]); exit(-1); } } /* Set the counters up and run the child program */ perf_reset(); if ((child=fork())==0) { /* Child */ perf_reset(); if (mplex==0) { if (event0==-1) { event0=42; /* Default to counting MFLOPS */ } perf_set_config(0,event[event0]); if (event1==-1) { event1=13; /* Default to count L2 cache line loads */ } perf_set_config(1,event[event1]); } else { fprintf(stderr,"lperfex: Multiplexing of counters currently not supported\n"); exit(-2); } perf_start(); execve(argv[optind],argv[optind+1],arge); } else { /* Parent */ perf_wait(child,&status,0,&ru,&counter); perf_stop(); telapsed=TIMEVAL_TO_DOUBLE(ru.ru_utime)+TIMEVAL_TO_DOUBLE(ru.ru_stime); cycles=floor((double)CLOCKSPEED*(double)1000000*telapsed); fprintf(extfile,"%lf seconds of CPU time elapsed (%le cycles) and %lf MB of memory on %s\n\n",telapsed,cycles,((double)ru.ru_maxrss)/1024.,hostname); fprintf(extfile,"Event #\t\t\tEvent\t\t\t\t\t\t\t\t\tEvents Counted\n"); fprintf(extfile,"-------\t\t\t-----\t\t\t\t\t\t\t\t\t--------------\n"); if (event0!=-1) fprintf(extfile," %2d \t%-80s\t%14llu\n",event0,label[event0],counter[0]); if (event1!=-1) fprintf(extfile," %2d \t%-80s\t%14llu\n",event1,label[event1],counter[1]); if (mkrpt) { fprintf(extfile,"\nStatistics:\n"); fprintf(extfile,"-----------\n"); if (event[event0]==PERF_INST_DECODER || event[event0]==PERF_INST_RETIRED) fprintf(extfile,"MIPS\t\t\t%14lf\n",1.0e-6*((double)counter[0])/(double)telapsed); else if (event[event1]==PERF_INST_DECODER || event[event1]==PERF_INST_RETIRED) fprintf(extfile,"MIPS\t\t\t%14lf\n",1.0e-6*((double)counter[1])/(double)telapsed); if (event[event0]==PERF_FLOPS || event[event0]==PERF_FP_COMP_OPS_EXE) { fprintf(extfile,"MFLOPS\t\t\t%14lf\n",1.0e-6*((double)counter[0])/(double)telapsed); if (event[event1]==PERF_MUL) fprintf(extfile,"Multiplications/FP op\t%14lf\n",((double)counter[1])/((double)counter[0])); else if (event[event1]==PERF_DIV) fprintf(extfile,"Divisions/FP op\t\t%14lf\n",((double)counter[1])/((double)counter[0])); else if (event[event1]==PERF_INST_DECODER || event[event1]==PERF_INST_RETIRED) fprintf(extfile,"Instructions/FP op\t%14lf\n",((double)counter[1])/((double)counter[0])); else if (event[event1]==PERF_CPU_CLK_UNHALTED) fprintf(extfile,"Unhalted cycles/FP op\t\t%14lf\n",((double)counter[1])/((double)counter[0])); } if (event[event0]==PERF_CYCLES_DIV_BUSY && event[event1]==PERF_DIV) fprintf(extfile,"Cycles/divide op\t\t%14lf\n",((double)counter[0])/((double)counter[1])); if (event[event0]==PERF_DCU_LINES_IN) fprintf(extfile,"L2 cache -> L1 Dcache bandwidth\t%14lf MB/s\n",1.6E-5*((double)counter[0])/telapsed); else if (event[event1]==PERF_DCU_LINES_IN) fprintf(extfile,"L2 cache ->L1 Dcache bandwidth\t%14lf MB/s\n",1.6E-5*((double)counter[1])/telapsed); if (event[event0]==PERF_DCU_M_LINES_OUT) fprintf(extfile,"L1 Dcache -> L2 cache bandwidth\t%14lf MB/s\n",1.6E-5*((double)counter[0])/telapsed); else if (event[event1]==PERF_DCU_M_LINES_OUT) fprintf(extfile,"L1 Dcache -> L2 cache bandwidth\t%14lf MB/s\n",1.6E-5*((double)counter[1])/telapsed); if (event[event0]==PERF_L2_LINES_IN) fprintf(extfile,"Main memory -> L2 cache bandwidth\t%14lf MB/s\n",3.2E-5*((double)counter[0])/telapsed); else if (event[event1]==PERF_L2_LINES_IN) fprintf(extfile,"Main memory -> L2 cache bandwidth\t%14lf MB/s\n",3.2E-5*((double)counter[1])/telapsed); if (event[event0]==PERF_L2_LINES_OUT) fprintf(extfile,"L2 cache -> main memory bandwidth\t%14lf MB/s\n",3.2E-5*((double)counter[0])/telapsed); else if (event[event1]==PERF_L2_LINES_OUT) fprintf(extfile,"L2 cache -> main memory bandwidth\t%14lf MB/s\n",3.2E-5*((double)counter[1])/telapsed); if (event[event0]==PERF_DATA_MEM_REFS && event[event1]==PERF_DCU_LINES_IN) fprintf(extfile,"L1 data cache hit rate\t%14lf\n",1.-((double)counter[1])/((double)counter[0])); else if (event[event1]==PERF_DATA_MEM_REFS && event[event0]==PERF_DCU_LINES_IN) fprintf(extfile,"L1 data cache hit rate\t%14lf\n",1.-((double)counter[0])/((double)counter[1])); if (event[event0]==PERF_DATA_MEM_REFS && event[event1]==PERF_L2_LINES_IN) fprintf(extfile,"L2 cache hit rate\t\t%14lf\n",1.-((double)counter[1])/((double)counter[0])); else if (event[event1]==PERF_DATA_MEM_REFS && event[event0]==PERF_L2_LINES_IN) fprintf(extfile,"L2 cache hit rate\t\t%14lf\n",1.-((double)counter[0])/((double)counter[1])); if (event[event0]==PERF_DCU_LINES_IN && event[event1]==PERF_L2_LINES_IN) fprintf(extfile,"L1 data cache miss/L2 cache miss\t%14lf\n",((double)counter[0])/((double)counter[1])); else if (event[event1]==PERF_DCU_LINES_IN && event[event0]==PERF_L2_LINES_IN) fprintf(extfile,"L1 data cache miss/L2 cache miss\t%14lf\n",((double)counter[1])/((double)counter[0])); if (event[event0]==PERF_CPU_CLK_UNHALTED) fprintf(extfile,"Fraction of cycles spent unhalted\t%14lf\n",((double)counter[0]/cycles)); else if (event[event1]==PERF_CPU_CLK_UNHALTED) fprintf(extfile,"Fraction of cycles spent unhalted\t%14lf\n",((double)counter[1]/cycles)); if (event[event0]==PERF_DCU_MISS_STANDING) fprintf(extfile,"Fraction of cycles spent waiting on L1 cache\t%14lf\n",((double)counter[0])/cycles); else if (event[event1]==PERF_DCU_MISS_STANDING) fprintf(extfile,"Fraction of cycles spent waiting on L1 cache\t%14lf\n",((double)counter[1])/cycles); if (event[event0]==PERF_L2_DBUS_BUSY) fprintf(extfile,"Fraction of cycles spent waiting on L2 data bus\t%14lf\n",((double)counter[0])/cycles); else if (event[event1]==PERF_L2_DBUS_BUSY) fprintf(extfile,"Fraction of cycles spent waiting on L2 data bus\t%14lf\n",((double)counter[1])/cycles); if (event[event0]==PERF_L2_DBUS_BUSY_RD) fprintf(extfile,"Fraction of cycles spent waiting on L2 data transfers\t%14lf\n",((double)counter[0])/cycles); else if (event[event1]==PERF_L2_DBUS_BUSY_RD) fprintf(extfile,"Fraction of cycles spent waiting on L2 data transfers\t%14lf\n",((double)counter[1])/cycles); if (event[event0]==PERF_RESOURCE_STALLS) fprintf(extfile,"Fraction of cycles spent on resource stalls\t%14lf\n",((double)counter[0])/cycles); else if (event[event1]==PERF_RESOURCE_STALLS) fprintf(extfile,"Fraction of cycles spent on resource stalls\t%14lf\n",((double)counter[1])/cycles); if (event[event0]==PERF_PARTIAL_RAT_STALLS) fprintf(extfile,"Fraction of cycles spent on partial stalls\t%14lf\n",((double)counter[0])/cycles); else if (event[event1]==PERF_PARTIAL_RAT_STALLS) fprintf(extfile,"Fraction of cycles spent on partial stalls\t%14lf\n",((double)counter[1])/cycles); } } }