/* lperfex: A workalike of the IRIX perfex(1) command for Linux systems running on Intel P6 core (PPro/PII/PIII/Celeron/Xeon) processors. Copyright (C) 1999 Ohio Supercomputer Center. This code is licensed under version 2 or later of the GNU GPL; see /usr/src/linux/COPYING for details. Author: Troy Baer Science and Technology Support Ohio Supercomputer Center troy@osc.edu http://www.osc.edu/~troy/ This code relies on Erik Hendriks' perf patch and library v0.7 for the Linux kernel. The patch is available from: ftp://beowulf.gsfc.nasa.gov/pub/software/perf-0.7.tar.gz To compile: gcc -o lperfex lperfex.c -I/usr/local/include -L/usr/local/lib -lperf (Assuming your sysadmin has installed libperf.a and perf.h into /usr/local/lib and /usr/local/include, respectively.) Version History: v0.1 released 01 Oct 1999; first public release. v0.2 fixed %lu bug and other small bugs To do: * Support of more of the IRIX perfex command line options, especially counter multiplexing and counting inside multithreaded code (assuming these are even possible). * More and better report statistics (memory->L2 bandwidth is currently computed in a *very* stupid way) */ #include #include #include #include #include #include #include #include #include #include /* P6 core countable events: 0: Memory references 1: L1 data cache lines loaded 2: L1 data cache lines loaded and modified 3: L1 data cache lines flushed 4: Weighed number of cycles spent waiting while a L1 data cache miss is resolved 5: Instruction fetches 6: L1 instruction cache misses 7: ITLB misses 8: Cycles spent waiting for instruction fetches and ITLB misses 9: Cycles spent waiting on the instruction decoder 10: L2 cache instruction fetches 11: L2 cache data loads 12: L2 cache data stores 13: L2 cache lines loaded 14: L2 cache lines flushed 15: L2 cache lines loaded and modified 16: L2 cache lines modified and flushed 17: L2 cache requests 18: L2 cache address strobes 19: Cycles spent waiting on the L2 data bus 20: Cycles spent waiting on data transfer from L2 cache to processor 21: Cycles spent while DRDY is asserted 22: Cycles spent while LOCK is asserted 23: Bus requests outstanding 24: Burst read transactions 25: Read-for-ownership transactions 26: Write-back transactions 27: Instruction fetch transactions 28: Invalidate transactions 29: Partial-write transactions 30: Partial transactions 31: I/O transactions 32: Deferred transactions 33: Burst transactions 34: Total number of transactions 35: Memory transactions 36: Bus clock cycles spent while the processor is receiving data 37: Bus clock cycles spent while the processor is driving the BNR pin 38: Bus clock cycles spent while the processor is driving the HIT pin 39: Bus clock cycles spent while the processor is driving the HITM pin 40: Cycles spent while the bus is snoop-stalled 41: Floating point operations retired (counter 0 only) 42: Floating point operations executed (counter 0 only) 43: Floating point exceptions handled by microcode (counter 1 only) 44: Multiply operations (counter 1 only) 45: Divide operations (counter 1 only) 46: Cycles spent doing division (counter 0 only) 47: Store buffer blocks 48: Store buffer drain cycles 49: Misaligned memory references 50: Instructions retired 51: uOps retired 52: Instructions decoded 53: Hardware interrupts received 54: Cycles spent while interrupts are disabled 55: Cycles spent while interrupts and disabled and pending 56: Branch instructions retired 57: Mispredicted branches retired 58: Taken branches retired 59: Taken mispredicted branches retired 60: Branch instructions decoded 61: Branches which miss the BTB 62: Bogus branches 63: BACLEAR assertions 64: Cycles spent during resource related stalls 65: Cycles spent during partial stalls 66: Segment register loads 67: Cycles during which the processor is not halted */ int event[68] = { PERF_DATA_MEM_REFS, PERF_DCU_LINES_IN, PERF_DCU_M_LINES_IN, PERF_DCU_M_LINES_OUT, PERF_DCU_MISS_STANDING, PERF_IFU_IFETCH, PERF_IFU_IFETCH_MISS, PERF_ITLB_MISS, PERF_IFU_MEM_STALL, PERF_ILD_STALL, PERF_L2_IFETCH, PERF_L2_LD, PERF_L2_ST, PERF_L2_LINES_IN, PERF_L2_LINES_OUT, PERF_L2_LINES_INM, PERF_L2_LINES_OUTM, PERF_L2_RQSTS, PERF_L2_ADS, PERF_L2_DBUS_BUSY, PERF_L2_DBUS_BUSY_RD, PERF_BUS_DRDY_CLOCKS, PERF_BUS_LOCK_CLOCKS, PERF_BUS_REQ_OUTSTANDING, PERF_BUS_TRAN_BRD, PERF_BUS_TRAN_RFO, PERF_BUS_TRANS_WB, PERF_BUS_TRAN_IFETCH, PERF_BUS_TRAN_INVAL, PERF_BUS_TRAN_PWR, PERF_BUS_TRAN_P, PERF_BUS_TRANS_IO, PERF_BUS_TRAN_DEF, PERF_BUS_TRAN_BURST, PERF_BUS_TRAN_ANY, PERF_BUS_TRAN_MEM, PERF_BUS_DATA_RCV, PERF_BUS_BNR_DRV, PERF_BUS_HIT_DRV, PERF_BUS_HITM_DRV, PERF_BUS_SNOOP_STALL, PERF_FLOPS, PERF_FP_COMP_OPS_EXE, PERF_FP_ASSIST, PERF_MUL, PERF_DIV, PERF_CYCLES_DIV_BUSY, PERF_LD_BLOCK, PERF_SB_DRAINS, PERF_MISALIGN_MEM_REF, PERF_INST_RETIRED, PERF_UOPS_RETIRED, PERF_INST_DECODER, PERF_HW_INT_RX, PERF_CYCLES_INST_MASKED, PERF_CYCLES_INT_PENDING_AND_MASKED, PERF_BR_INST_RETIRED, PERF_BR_MISS_PRED_RETIRED, PERF_BR_TAKEN_RETIRED, PERF_BR_MISS_PRED_TAKEN_RET, PERF_BR_INST_DECODED, PERF_BR_BTB_MISSES, PERF_BR_BOGUS, PERF_BACLEARS, PERF_RESOURCE_STALLS, PERF_PARTIAL_RAT_STALLS, PERF_SEGMENT_REG_LOADS, PERF_CPU_CLK_UNHALTED }; char label[68][81] = { "Memory references", "L1 data cache lines loaded", "L1 data cache lines loaded and modified", "L1 data cache lines flushed", "Weighed number of cycles spent waiting while a L1 data cache miss is resolved", "Instruction fetches", "L1 instruction cache misses", "ITLB misses", "Cycles spent waiting for instruction fetches and ITLB misses", "Cycles spent waiting on the instruction decoder", "L2 cache instruction fetches", "L2 cache data loads", "L2 cache data stores", "L2 cache lines loaded", "L2 cache lines flushed", "L2 cache lines loaded and modified", "L2 cache lines modified and flushed", "L2 cache requests", "L2 cache address strobes", "Cycles spent waiting on the L2 data bus", "Cycles spent waiting on data transfer from L2 cache to processor", "Cycles spent while DRDY is asserted", "Cycles spent while LOCK is asserted", "Bus requests outstanding", "Burst read transactions", "Read-for-ownership transactions", "Write-back transactions", "Instruction fetch transactions", "Invalidate transactions", "Partial-write transactions", "Partial transactions", "I/O transactions", "Deferred transactions", "Burst transactions", "Total number of transactions", "Memory transactions", "Bus clock cycles spent while the processor is receiving data", "Bus clock cycles spent while the processor is driving the BNR pin", "Bus clock cycles spent while the processor is driving the HIT pin", "Bus clock cycles spent while the processor is driving the HITM pin", "Cycles spent while the bus is snoop-stalled", "Floating point operations retired (counter 0 only)", "Floating point operations executed (counter 0 only)", "Floating point exceptions handled by microcode (counter 1 only)", "Multiply operations (counter 1 only)", "Divide operations (counter 1 only)", "Cycles spent doing division (counter 0 only)", "Store buffer blocks", "Store buffer drain cycles", "Misaligned memory references", "Instructions retired", "uOps retired", "Instructions decoded", "Hardware interrupts received", "Cycles spent while interrupts are disabled", "Cycles spent while interrupts and disabled and pending", "Branch instructions retired", "Mispredicted branches retired", "Taken branches retired", "Taken mispredicted branches retired", "Branch instructions decoded", "Branches which miss the BTB", "Bogus branches", "BACLEAR assertions", "Cycles spent during resource related stalls", "Cycles spent during partial stalls", "Segment register loads", "Cycles during which the processor is not halted" }; #define TIMEVAL_TO_FLOAT(x) ((float)((x).tv_sec)+0.000001*(float)((x).tv_usec)) int main(int argc, char *argv[], char *environment[]) { int event0=-1, event1=-1, mplex=0, mkrpt=0, i, status, rval; FILE *extfile=stdout; float telapsed; pid_t child; struct rusage ru; unsigned long long counter[PERF_COUNTERS]; extern char *optarg; extern int optind; char hostname[80]; /* Parse command line options */ gethostname(hostname,80); extfile=stdout; while ((i=getopt(argc,argv,"ae:kmpo:sxy"))!=-1) { switch(i) { case 'a': mplex=1; break; case 'e': mplex=0; if (event0==-1) { sscanf(optarg,"%d",&event0); } else { sscanf(optarg,"%d",&event1); } break; case 'k': fprintf(stderr,"lperfex: Kernel counting not supported\n"); break; case 'm': fprintf(stderr,"lperfex: Multithreaded counting not supported\n"); break; case 'o': extfile=fopen(optarg,"w"); break; case 's': fprintf(stderr,"lperfex: Signalled counting no supported\n"); break; case 'x': fprintf(stderr,"lperfex: Exception counting no supported\n"); break; case 'y': mkrpt=1; break; case '?': case 'h': printf("Usage: lperfex [-a | -e event0 [-e event1]] [-mp | -s] [-x] [-k] [-y] [-o file] [-c file] command [args]\n"); printf("\nEvent numbers: \n"); for (i=0;i<68;i++) printf("\t%2d: %s\n",i,label[i]); exit(-1); } } if (event0==-1) { event0=42; /* Default to counting MFLOPS */ } if (event1==-1) { event1=13; /* Default to count L2 cache line loads */ } /* Set the counters up and run the child program */ perf_reset(); if ((child=fork())==0) { /* Child */ perf_reset(); if (mplex==0) { perf_set_config(0,event[event0]); perf_set_config(1,event[event1]); } else { fprintf(stderr,"lperfex: Multiplexing of counters currently not supported\n"); exit(-2); } perf_start(); execvp(argv[optind],argv+optind); } else { /* Parent */ rval=perf_wait(child,&status,(int)(0),&ru,counter); if ( ( rval != child ) || ( status != 0 ) ) { fprintf(stderr,"Error waiting for process to terminate; perf_wait returned %d, child=%d, status=%d\n",rval,child,status); kill(child,SIGTERM); exit(-3); } perf_stop(); telapsed=TIMEVAL_TO_FLOAT(ru.ru_utime)+TIMEVAL_TO_FLOAT(ru.ru_stime); fprintf(extfile,"%f seconds of CPU time elapsed and %f MB of memory on %s\n\n",telapsed,((float)ru.ru_maxrss)/1024.,hostname); fprintf(extfile,"Event #\t\t\tEvent\t\t\t\t\t\t\t\t\tEvents Counted\n"); fprintf(extfile,"-------\t\t\t-----\t\t\t\t\t\t\t\t\t--------------\n"); if (event0!=-1) fprintf(extfile," %2d \t%-80s\t%14llu\n",event0,label[event0],counter[0]); if (event1!=-1) fprintf(extfile," %2d \t%-80s\t%14llu\n",event1,label[event1],counter[1]); if (mkrpt) { fprintf(extfile,"\nStatistics:\n"); fprintf(extfile,"-----------\n"); if (event[event0]==PERF_INST_DECODER || event[event0]==PERF_INST_RETIRED) fprintf(extfile,"MIPS\t\t\t%14lf\n",1.0e-6*((double)counter[0])/(double)telapsed); else if (event[event1]==PERF_INST_DECODER || event[event1]==PERF_INST_RETIRED) fprintf(extfile,"MIPS\t\t\t%14lf\n",1.0e-6*((double)counter[1])/(double)telapsed); if (event[event0]==PERF_FLOPS || event[event0]==PERF_FP_COMP_OPS_EXE) { fprintf(extfile,"MFLOPS\t\t\t%14lf\n",1.0e-6*((double)counter[0])/(double)telapsed); if (event[event1]==PERF_MUL) fprintf(extfile,"Multiplications/FP op\t%14lf\n",((double)counter[1])/((double)counter[0])); else if (event[event1]==PERF_DIV) fprintf(extfile,"Divisions/FP op\t\t%14lf\n",((double)counter[1])/((double)counter[0])); else if (event[event1]==PERF_INST_DECODER || event[event1]==PERF_INST_RETIRED) fprintf(extfile,"Instructions/FP op\t%14lf\n",((double)counter[1])/((double)counter[0])); else if (event[event1]==PERF_CPU_CLK_UNHALTED) fprintf(extfile,"Cycles/FP op\t\t%14lf\n",((double)counter[1])/((double)counter[0])); } if (event[event0]==PERF_CYCLES_DIV_BUSY && event[event1]==PERF_DIV) fprintf(extfile,"Cycles/divide op\t\t%14lf\n",((double)counter[0])/((double)counter[1])); if (event[event0]==PERF_DCU_LINES_IN) fprintf(extfile,"L2->L1 memory bandwidth\t%14lf MB/s\n",1.6E-5*((double)counter[0])/telapsed); else if (event[event1]==PERF_DCU_LINES_IN) fprintf(extfile,"L2->L1 memory bandwidth\t%14lf MB/s\n",1.6E-5*((double)counter[1])/telapsed); if (event[event0]==PERF_L2_LINES_IN) fprintf(extfile,"main memory->L2 bandwidth\t%14lf MB/s\n",3.2E-5*((double)counter[0])/telapsed); else if (event[event1]==PERF_L2_LINES_IN) fprintf(extfile,"main memory->L2 bandwidth\t%14lf MB/s\n",3.2E-5*((double)counter[1])/telapsed); if (event[event0]==PERF_DATA_MEM_REFS && event[event1]==PERF_DCU_LINES_IN) fprintf(extfile,"L1 data cache hit rate\t%14lf\n",1.-((double)counter[1])/((double)counter[0])); else if (event[event1]==PERF_DATA_MEM_REFS && event[event0]==PERF_DCU_LINES_IN) fprintf(extfile,"L1 data cache hit rate\t%14lf\n",1.-((double)counter[0])/((double)counter[1])); if (event[event0]==PERF_DATA_MEM_REFS && event[event1]==PERF_L2_LINES_IN) fprintf(extfile,"L2 cache hit rate\t\t%14lf\n",1.-((double)counter[1])/((double)counter[0])); else if (event[event1]==PERF_DATA_MEM_REFS && event[event0]==PERF_L2_LINES_IN) fprintf(extfile,"L2 cache hit rate\t\t%14lf\n",1.-((double)counter[0])/((double)counter[1])); if (event[event0]==PERF_DCU_LINES_IN && event[event1]==PERF_L2_LINES_IN) fprintf(extfile,"L1 data cache miss/L2 cache miss\t%14lf\n",((double)counter[0])/((double)counter[1])); else if (event[event1]==PERF_DCU_LINES_IN && event[event0]==PERF_L2_LINES_IN) fprintf(extfile,"L1 data cache miss/L2 cache miss\t%14lf\n",((double)counter[1])/((double)counter[0])); } } exit(0); }