15
Universidad de Las Palmas de Gran Canaria. Estudio de utilización efectiva de procesadores vectoriales. Módulo adherido al simulador. Laura Autón García. Tutores: Francisca Quintana Domínguez, Roger Espasa Sans. Las Palmas de Gran Canaria, 17 de abril de 2014

Estudio de utilización efectiva de procesadores vectoriales · Universidad de Las Palmas de Gran Canaria. Estudio de utilización efectiva de procesadores vectoriales. Módulo adherido

  • Upload
    dinhthu

  • View
    216

  • Download
    0

Embed Size (px)

Citation preview

Universidad de Las Palmas de Gran Canaria

Estudio de utilización efectiva de procesadores vectoriales

Módulo adherido al simulador

Laura Autón García

Tutores: Francisca Quintana Domínguez, Roger Espasa Sans

Las Palmas de Gran Canaria, 17 de abril de 2014

Apéndice A

Módulo CORE

1 #ifndef CORE_H

2 #define CORE_H

34 #include "common.h"

5 #include <map >

6 #include <bitset >

78 extern PIN_MUTEX printLock;

910 #ifdef DEBUG_DEP

11 #define LOCK_PRINT(X) \

12 PIN_MutexLock( &printLock ); \

13 cout << X; \

14 PIN_MutexUnlock( &printLock );

15 #else

16 #define LOCK_PRINT(X)

17 #endif

1819 #ifndef KNC_TLB_LVLS

20 #define KNC_TLB_LVLS 3

21 #endif

2223 #ifndef KNC_CACHE_LVLS

24 #define KNC_CACHE_LVLS 3

25 #endif

// ========================================================================
// Global footprint with common info for all threads/application
// ========================================================================

// Instruction categories. The enumerators are used as bit positions in
// INS_FOOT_PRINT_t::insType, so INS_TYPE_NUM must remain the last entry:
// it is the number of categories and sizes that bitset.
typedef enum
{
    INS_TYPE_NONVPU,
    INS_TYPE_V_VECTOR,
    INS_TYPE_V_SCALAR,
    INS_TYPE_MEM,
    INS_TYPE_NUM
} INS_TYPE_t;

3940 typedef struct

41 {

42 bitset <INS_TYPE_NUM > insType;

43 UINT32 latency;

44 UINT32 insSize;

45 UINT32 srcReg [6];

46 UINT32 dstReg [6];

47 string disassemble;

48 UINT64 routine;

49 }INS_FOOT_PRINT_t;

50

1

APENDICE A. MODULO CORE 2

51 typedef map <UINT64 , INS_FOOT_PRINT_t > FOOT_PRINT;

52 extern FOOT_PRINT Footprint;

5354 // ========================================================================

55 // Basic Block state: Tracks basic block detailed instruction breakdown

56 // ========================================================================

5758 typedef struct BBL_STATE_t

59 {

60 // Last level accesed to get the required data

61 INT32 tlbLevelHit;

62 INT32 cacheLevelHit;

6364 // Breakdown of cycles accumulated

65 UINT32 breakdownTLB[KNC_TLB_LVLS ];

66 UINT32 breakdownCACHE[KNC_CACHE_LVLS ];

6768 // Sumatory of breakdowns

69 UINT32 cycles;

7071 BBL_STATE_t(INT32 tlbHit , INT32 cacheHit):

72 tlbLevelHit(tlbHit),

73 cacheLevelHit(cacheHit),

74 breakdownTLB (),

75 breakdownCACHE (),

76 cycles (0){}

7778 }BBL_STATE_t;

7980 typedef std::pair <UINT64 , UINT32 > BBL_ENTRY_KEY;

81 typedef map <BBL_ENTRY_KEY , BBL_STATE_t > BBL_STATE;

8283 // ========================================================================

84 // Register File: Tracks register accesses

85 // ========================================================================

8687 typedef struct

88 {

89 // Cycle in which data in register will be available

90 COUNTER cycle;

9192 // Las instruction that wrote the register

93 BBL_ENTRY_KEY PC;

9495 // Breakdown of cycles if load

96 UINT32 breakdownTLB[KNC_TLB_LVLS ];

97 UINT32 breakdownCACHE[KNC_CACHE_LVLS ];

9899 }REG_FILE_STATE_t;

100101 typedef map <UINT32 , REG_FILE_STATE_t > REG_FILE;

102103 // ========================================================================

104 // State: lastest state of simulation

105 // ========================================================================

106107 typedef struct STATE

108 {

109 REG_FILE *regFile;

110 BBL_STATE *bbl;

111112 // Memory access breakdown of last instruction of each thread

113 struct MEMORY_STATE{

114 UINT32 breakdownTLB[KNC_TLB_LVLS ];

115 UINT32 breakdownCACHE[KNC_CACHE_LVLS ];

116 MEMORY_STATE (): breakdownTLB (), breakdownCACHE (){};

117 }* memory;

118119 // SHARED

120 COUNTER issue;

APENDICE A. MODULO CORE 3

121 COUNTER wBackMemory;

122123 // Information of last instruction that used pipeline

124 struct LAST_INS{

125 UINT32 tid;

126 BBL_ENTRY_KEY key;

127 LAST_INS(UINT32 tid=0, BBL_ENTRY_KEY key=make_pair (0,0)): tid(tid), key(key)←↩{};

128 }lastInstruction;

129130 STATE(): issue (0), wBackMemory (0);

131132 }STATE;

// ========================================================================
// Stats: accumulated stats of simulation. Both by ins and global
// ========================================================================

// Stall categories. The enumerators index the breakdownSTALLS arrays, so
// INS_STALL_NUM must remain the last entry (it is the array size).
typedef enum
{
    INS_STALL_ISSUE,
    INS_STALL_NONVPU,
    INS_STALL_V_SCALAR,
    INS_STALL_V_VECTOR,
    INS_STALL_NUM
} INS_STALL_t;

// Selects which breakdown array CORE::InsertBreakdownStats updates.
typedef enum
{
    STALLS_ENTRY,
    TLB_ENTRY,
    CACHE_ENTRY
} BREAKDOWN_t;

153154 typedef struct STATS_INS_s

155 {

156 // Bytes loaded from load instructions

157 UINT32 bytesLoaded;

158159 // Sumatory of breakdown

160 UINT32 cycles;

161162 // Breakdown of stalls accumulated

163 COUNTER breakdownSTALLS[INS_STALL_NUM ];

164165 // Breakdown of cycles accumulated

166 COUNTER breakdownTLB[KNC_TLB_LVLS ];

167 COUNTER breakdownCACHE[KNC_CACHE_LVLS ];

168169 STATS_INS_s (): bytesLoaded (0), cycles (0),

170 breakdownSTALLS (),

171 breakdownTLB (),

172 breakdownCACHE (){}

173174 }STATS_INS_s;

175176 typedef map <UINT64 , STATS_INS_s > STATS_INS_t;

177178 typedef struct STATS_GLB_t

179 {

180 // Sumatory breakdown

181 COUNTER cycles;

182183 // Breakdown of stalls accumulated

184 COUNTER breakdownSTALLS[INS_STALL_NUM ];

185186 // Breakdown of cycles accumulated

187 COUNTER breakdownTLB[KNC_TLB_LVLS ];

188 COUNTER breakdownCACHE[KNC_CACHE_LVLS ];

189

APENDICE A. MODULO CORE 4

190 STATS_GLB_t (): cycles (0),

191 breakdownSTALLS (),

192 breakdownTLB (),

193 breakdownCACHE (){}

194195 }STATS_GLB_t;

196197 typedef struct

198 {

199 STATS_INS_t *stats_ins;

200 STATS_GLB_t *stats_glb;

201 } STATS;

202203 // ========================================================================

204 // CORE: Class with the context structures for every core

205 // ========================================================================

206207 class CORE{

208 PIN_MUTEX pipelineLock;

209210 STATE state;

211 STATS stats;

212213 // How many threads

214 UINT32 nThreads;

215 UINT32 coreID;

216217 // Pointers to latency information

218 UINT32 *latencyTLB;

219 UINT32 *latencyCACHE;

220221 void InsertInPipeline(

222 UINT32 tid ,

223 BBL_STATE :: iterator ins );

224225 UINT32 GetCacheLatency( BBL_ENTRY_KEY key );

226227 void DistributeCycles(

228 UINT32 tid ,

229 UINT64 storeLIP ,

230 COUNTER cycles ,

231 BBL_ENTRY_KEY culprit ,

232 bool regStall ,

233 bool memStall ,

234 BBL_ENTRY_KEY currentIP = make_pair (0,0),

235 INT32 regDependency = -1);

236237 void InsertBreakdownStats(

238 UINT32 tid ,

239 STATS_INS_t ::iterator ,

240 UINT32 cycles ,

241 BREAKDOWN_t breakdown ,

242 UINT32 index);

243244 inline UINT32 AdaptThreadID( UINT32 tid )

245 {

246 return tid % nThreads;

247 }

248249 public:

250251 CORE(UINT32 coreID , UINT32 nThreads){

252 // State fields

253 state.regFile = new REG_FILE[ nThreads ];

254 state.bbl = new BBL_STATE[ nThreads ];

255 state.memory = new STATE :: MEMORY_STATE[ nThreads ];

256257 // Stats fields

258 state.stats_ins = new STATS_INS_t[ nThreads ];

259 stats.stats_glb = new STATS_GLB_t[ nThreads ];

APENDICE A. MODULO CORE 5

260261 this ->nThreads = nThreads;

262 }

263264 ~CORE(){

265 // Delete Stats stuff and State stuff

266 delete [] state.regFile;

267 delete [] state.memory;

268269 for (UINT32 t = 0; t < nThreads; t++)

270 DestroyBBLData(t);

271272 delete [] state.bbl;

273 delete [] stats.stats_ins;

274 delete [] stats.stats_glb;

275 }

276277 // Funtions that operate on whole CORE

278 void SetMemorySetup(

279 UINT32 numLevelsTLB ,

280 UINT32 numLevelsCACHE ,

281 UINT32 *latencyTLB ,

282 UINT32 *latencyCACHE);

283284 // Functions that operate on STATE

285 void CreateBBLEntry(

286 UINT32 tid ,

287 UINT64 lip ,

288 INT32 tlbLevelHit ,

289 INT32 cacheLevelHit);

290291 void DestroyBBLData( UINT32 tid );

292293 void DestroyStats( UINT32 tid );

294295 void Pipeline( UINT32 tid , BBInfo *bbl );

296297 COUNTER GetGlobalCycles( UINT32 tid );

298299 string PrintGlobalStats( UINT32 tid );

300301 // Functions that operate on STATS

302 void SetBytesLoaded(

303 UINT32 tid ,

304 UINT64 lip ,

305 UINT32 size);

306307 UINT32 GetBytesLoaded( UINT32 tid , UINT64 lip );

308309 };

310311 extern CORE *CoreArray[MAX_EXPERIMENTS ][ MAX_NUM_THREADS ];

312313 inline UINT32 GetCoreID( UINT32 tid , UINT32 ShiftAmount )

314 {

315 UINT32 coreID = (tid >> ShiftAmount);

316 return coreID;

317 }

318319 // ========================================================================

320321 void SaveBBL( UINT32 ThreadID , BBInfo* BB)

322 {

323 ThreadStats[ThreadID ].BB = ThreadStats[ThreadID ]. prevBB;

324 if (bbinfo != NULL) ThreadStats[ThreadID ]. prevBB = bbinfo;

325 }

326327 void Pipeline ( UINT32 ThreadID )

328 {

329 if (dependencyControl && ThreadStats[ThreadID ].BB != NULL

APENDICE A. MODULO CORE 6

330 {

331 for (UINT32 exp = 0; exp < MAX_EXPERIMENTS; exp++ )

332 {

333 UINT32 coreID = 0;

334335 if (MAX_NUM_THREADS > 1 )

336 {

337 coreID = GetCoreID( ThreadID , ShiftAmount[exp ][0] );

338 }

339340 CORE *corePtr = CoreArray[exp][ coreID ];

341 corePtr ->Pipeline( ThreadID , THreadStats[threadID ].BB );

342 corePtr ->DestroyBBLData( ThreadID );

343 }

344 }

345 }

346347 void Instruction(INS ins)

348 {

349 if (Footprint.find(INS_Address(ins)) == Footprint.end())

350 {

351 INS_FOOT_PRINT_t instruction = {}

352 instruction.routine = RTN_Address(INS_Rtn(ins)):

353 instruction.insSize = INS_Size (ins);

354355 if ( dependencyControl )

356 {

357 instruction.disassemble = INS_Disassemble(ins);

358359 // What type of Instruction?

360 if ( INS_IsLoadOp(ins) )

361 {

362 instruction.insType.flip(INS_TYPE_MEM);

363 if (INS_IsVector(ins))

364 {

365 if (INS_IsScalar(ins))

366 instruction.insType.flip(INS_TYPE_V_SCALAR);

367 else

368 instruction.insType.flip(INS_TYPE_V_VECTOR);

369 }

370 else

371 {

372 instruction.insType.flip(INS_TYPE_NONVPU);

373 instruction.latency = NONVPU :: latency;

374 }

375 }

376 else

377 {

378 if (IsMemInstruction(ins))

379 instruction.insType.flip(INS_TYPE_MEM);

380 else if (INS_IsVector(ins))

381 {

382 if (INS_IsScalar(ins))

383 instruction.insType.flip(INS_TYPE_V_SCALAR);

384 else

385 instruction.insType.flip(INS_TYPE_V_VECTOR);

386387 instruction.latendy = GetLatencyByIclass(ins);

388 }

389 else

390 {

391 instruction.insType.flip(INS_TYPE_NONVPU);

392 instruction.latency = NONVPU :: latency;

393 }

394 }

395 }

396 }

397 }

398399 void SplitBlocks ()

APENDICE A. MODULO CORE 7

400 {

401 map <pair <UINT64 , UINT64 >, COUNTER > Worklist;

402 pair <UINT64 , UINT64 > el1 , el2;

403404 for (list <const BBInfo *>:: iterator bi = BBInfoList.begin (); bi != BBInfoList.←↩

end(); bi++)

405 {

406 COUNTER totalCountBBLbyTID = 0;

407408 for (UINT32 tid = 0; tid <= maxThreadID; tid++)

409 {

410 totalCountBBLbyTID += (*bi)->_counter[tid];

411 }

412413 WorkList[pair <UINT64 , UINT64 >((*bi)->StartAddress ,(*bi)->EndAddress)] += ←↩

totalCountBBLbyTID;

414 }

415416 WorkList[pair <UINT64 , UINT64 >(-1, -1)] = 0;

417418 while (WorkList.size() > 1)

419 {

420 el1 = WorkList.begin()->first;

421 el2 = (++ WorkList.begin ())->first;

422423 if (el1.second < el2.first)

424 {

425 BBInfoMap[el1] = WorkList[el1];

426 WOrkList.erase(el1);

427 }

428 else

429 {

430 if (el1.first == el2.first && el1.second < el2.second)

431 {

432 pair <UINT64 ,UINT64 > newel1 = el1;

433 pair <UINT64 ,UINT64 > newel2 = make_pair ((++ Footprint.find(el1.second))->←↩first ,el2.second);

434435 WorkList[newel1] += WorkList[el2];

436 WorkList[newel2] += WorkList[el2];

437438 WorkList.erase(el2);

439 }

440 else if (el1.first < el2.first && ( el1.second > el2.second ||

441 el1.second == el2.second || el1.second < el2.second ))

442 {

443 pair <UINT64 ,UINT64 > newel1 = make_pair(el1.first ,(--Footprint.find(el2.←↩first))->first);

444 pair <UINT64 ,UINT64 > newel2 = make_pair(el2.first , el1.second);

445446 WorkList[newel1] = WorkList[el1];

447 WorkList[newel2] = WorkList[el1];

448449 WorkList.erase(el1);

450 }

451 else

452 {

453 assert (1);

454 }

455 }

456 }

457 }

458459 #endif /* CORE_H */

460461 #include "core.h"

462 #include <sstream >

463464 FOOT_PRINT Footprint;

465 CORE *CoreArray[MAX_EXPERIMENTS ][ MAX_NUM_THREADS ];

APENDICE A. MODULO CORE 8

466 PIN_MUTEX printLock;

467468 // ========================================================================

469470 void CORE:: CreateBBLEntry (

471 UINT32 tid ,

472 UINT64 lip ,

473 INT32 tlbLevelHit ,

474 INT32 cacheLevelHit)

475 {

476 UINT32 realIndex = AdaptThreadID(tid);

477478 // First , prepare the structure

479 BBL_ENTRY_KEY key = make_pair(lip ,0);

480 BBL_STATE_t bblInfo(tlbLevelHit , cacheLevelHit);

481482 // When a bbl with cache information , key and breadowns are updated ←↩

accordingly

483 if ( tlbLevelHit != -1 || cacheLevelHit != -1 )

484 {

485 // If split or gather instruction , modify key

486 BBL_STATE :: reverse_iterator cait = sate.bbl[realIndex ]. rbegin ();

487 if (cait != state.bbl[realIndex ].rend() && cait ->first.first == lip)

488 key = make_pair(lip ,cait ->first.second +1);

489490 // Getting cycles from TLB

491492 for (INT32 i = 0; i <= tlbLevelHit; i++)

493 {

494 bblInfo.breakdownTLB[i] += latencyTLB[i];

495 bblInfo.cycles += latencyTLB[i];

496 }

497498 // Getting cycles from the CACHE

499500 bblInfo.cycles += latencyCACHE[cacheLevelHit ];

501 for (INT32 i = 0; i <= cacheLevelHit; i++)

502 {

503 if (!i)

504 bblInfo.breakdownCACHE[i] += latencyCACHE[i];

505 else

506 bblInfo.breakdownCACHE[i] += latencyCACHE[i] - latencyCACHE[i-1];

507 }

508 }

509510 state.bbl[realIndex ]. insert( make_pair(key ,bblInfo) );

511 }

512513 // ========================================================================

514515 void CORE:: DestroyBBLData( UINT32 tid )

516 {

517 UINT32 realIndex = AdaptThreadID(tid);

518 state.bbl[realIndex ].clear ();

519 }

520521 // ========================================================================

522523 void CORE:: InsertBreakdownStats(

524 UINT32 tid ,

525 STATS_INS_t :: iterator entry ,

526 UINT32 cycles ,

527 BREAKDOWN_t breakdown ,

528 UINT32 index)

529 {

530 UINT32 realIndex = AdaptThreadID(tid);

531532 switch(breakdown)

533 {

534 case STALLS_ENTRY:

APENDICE A. MODULO CORE 9

535 entry ->second.breakdownSTALLS[index] += cycles;

536 stats.stats_glb[realIndex ]. breakdownSTALLS[index] += cycles;

537 break;

538539 case CACHE_ENTRY:

540 entry ->second.breakdownCACHE[index] += cycles;

541 stats.stats_glb[realIndex ]. breakdownCACHE[index] += cycles;

542543 case TLB_ENTRY:

544 entry ->second.breakdownTLB[index] += cycles;

545 stats.stats_glb[realIndex ]. breakdownTLB[index] += cycles;

546 }

547548 entry ->second.cycles += cycles;

549 stats.stats_glb[realIndex ]. cycles += cycles;

550551 }

552 }

553554 // ========================================================================

555556 void CORE:: DistributeCycles(

557 UINT32 tid ,

558 UINT64 storeLIP ,

559 COUNTER cycles ,

560 BBL_ENTRY_KEY culprit ,

561 bool regStall ,

562 bool memStall ,

563 BBL_ENTRY_KEY currentIP ,

564 INT32 regDependency )

565 {

566 UINT32 realIndex = AdaptThread(tid);

567 UINT32 storeCycles = 0;

568569 // Lets get sure the storage does exist. If not , create.

570 STATS_INS_t :: iterator storage = stats.stats_ins[realIndex]-find(storeLIP);

571 if (storage == stats.stats_ins[tid].end())

572 {

573 STATS_INS_s statsInfo;

574 storage = stats.stats_ins[tid]. insert(make_pair(storeLIP ,statsInfo)).first;

575 }

576577 if ( !regStall and !memStall )

578 InsertBreakdownStats(tid , storage , cycles , STALLS_ENTRY , INS_STALL_ISSUE);

579580 if ( regStall )

581 {

582 // Get the footprint of the culprit

583 FOOT_PRINT :: iterator culpritInfo = Footprint.find(culprit.first);

584585 // Only if the dependency has nothing to do with pipeline being stalled

586 storeCycles = cycles >= culpritInfo ->second.latency ? culpritInfo ->second.←↩latency : cycles;

587588 if (culpritInfo ->second.insType.test(INS_TYPE_NONVPU))

589 {

590 InsertBreakdownStats(tid , storage , storeCycles , STALLS_ENTRY , ←↩INS_STALL_NONVPU);

591 }

592 else if (culpritInfo ->second.insType.test(INS_TYPE_V_SCALAR))

593 {

594 InsertBreakdownStats(tid , storage , storeCycles , STALLS_ENTRY , ←↩INS_STALL_V_SCALAR);

595 }

596 else if (culpritInfo ->second.insType.test(INS_TYPE_V_VECTOR))

597 {

598 InsertBreakdownStats(tid , storage , storeCycles , STALLS_ENTRY , ←↩INS_STALL_V_VECTOR);

599 }

600

APENDICE A. MODULO CORE 10

601 // Remaining cycles

602 cycles -= storeCycles;

603 }

604605 if ( memStall )

606 {

607 // Get the state info of the culprit

608 // To be taken into account: if there is a culprit , the stall is going to be

609 // with an instruction of the same thread (registers are not shared)

610 BBL_STATE :: iterator culpritState = state.bbl[realIndex ].find(culprit);

611612 // FOr special cases

613 UINT32 source = 0;

614615 if ( culpritState == state.bbl[realIndex ].end() || // Different basic block

616 culprit.first > currentIP.first || // Same basic block , different iteration

617 ( culprit.first == currentIP.first // BBL of 1 instruction , maybe splitted

618 && culprit.second >= currentIP.second ))

619 {

620 if (regDependency != -1)

621 source = 1;

622 else

623 source = 2;

624 }

625626 // Start with CACHE

627 for (INT32 level = KNC_CACHE_LVLS; level >= 0 && cycles > 0; level --)

628 {

629 UINT32 sourceCycles;

630 switch(source)

631 {

632 case 1:

633 // from regFile

634 sourceCycles = state.regFile[realIndex ].find(regDependency)->second.←↩breakdownCACHE[level];

635 break;

636 case 2:

637 // from last memory stat

638 sourceCycles = state.memory[realIndex ]. breakdownCACHE[level];

639 break;

640 default:

641 sourceCycles = culpritState ->second.breakdownCACHE[level];

642 }

643644 storeCycles = cycles >= sourceCycles ? sourceCycles : cycles;

645 InsertBreakdownStats(tid , storage , storeCycles , CACHE_ENTRY , level);

646647 // Remaining

648 cycles -= storeCycles;

649 }

650651 // Follow with TLB

652 for (INT32 level = KNC_TLB_LVLS -1; level >= 0; level --)

653 {

654 UINT32 sourceCycles;

655 switch(source)

656 {

657 case 1:

658 // from regFile

659 sourceCycles = state.regFile[realIndex ].find(regDependency)->second.←↩breakdownTLB[level ];

660 break;

661 case 2:

662 // from last memory stat

663 sourceCycles = state.memory[realIndex ]. breakdownTLB[level ];

664 break;

665 default:

666 sourceCycles = culpritState ->second.breakdownTLB[level];

667 }

668

APENDICE A. MODULO CORE 11

669 storeCycles = cycles >= sourceCycles ? sourceCycles : cycles;

670 InsertBreakdownStats(tid , storage , storeCycles , TLB_ENTRY , level);

671672 // Remaining

673 cycles -= storeCycles;

674 }

675 }

676 }

677678 // ========================================================================

679680 void CORE:: InsertInPipeline (UINT32 tid , BBL_STATE :: iterator ins)

681 {

682 UINT32 realIndex = AdaptThreadID(tid);

683684 BBL_ENTRY_KEY key = ins ->first;

685686 // Get the instruction footprint

687 INS_FOOT_PRINT_t insInfo = Footprint.find(key.first)->second;

688689 // Key to handle culprit instruction if any

690 BBL_ENTRY_KEY culprit = make_pair (0,0);

691692 // Last register read dependency

693 INT32 regDependency = 0;

694695 // Ideally , when does the instruction enter the pipeline?

696 COUNTER issue = state.issue + 1;

697 COUNTER saveIssue = state.issue;

698699 // Last instruction in pipeline

700 STATE:: LAST_INS lastInstruction = state.lastInstruction;

701702 // ====================================

703 // When are the source registers read?

704 // ====================================

705 if (key.second == 0)

706 {

707 for (UINT32 j = 1; j <= insInfo.srcReg [0]; j++)

708 {

709 REG_FILE :: iterator reg = state.regFile[realIndex ].find(insInfo.srcReg[j]);

710711 // If the register is found in RegTable , dependency spoted.

712 if (reg != state.regFile[realIndex ].end())

713 {

714 if (issue < reg ->second.cycle)

715 {

716 culprit = reg ->second.PC;

717 regDependency = reg ->first;

718 issue = max( issue , reg ->second.cycle );

719 }

720 }

721 }

722 }

723724 // ================================================

725 // Was the pipeline frozen by last ins of same TID?

726 // ================================================

727 if ( state.lastInstruction.tid == tid && issue < state.wBackMemory )

728 {

729 culprit = make_pair (0,0);

730 issue = max( Issue , state.wBackMemory );

731 }

732733 // ================================================

734 // State update before unlocking Mutex

735 // ================================================

736 state.issue = issue;

737738 if ( insInfo.insType.test(INS_TYPE_MEM) && ins ->second.cacheLevelHit > 0 )

APENDICE A. MODULO CORE 12

739 state.wBackMemory = issue + ins ->second.cycles;

740 else

741 state.wBackMemory = 0;

742743 state.lastInstruction = STATE:: LAST_INS(tid , key);

744745 PIN_MutexUnlock( &pipelineLock );

746747 // ================================================

748 // Distribution of cycles

749 // ================================================

750 if (saveIssue)

751 {

752 COUNTER cycles = issue - saveIssue;

753 UINT64 storeLIP = key.first;

754755 if (culprit.first)

756 {

757 DistributeCycles(tid , storeLIP , cycles , culprit , true , true , key , ←↩regDependency);

758 }

759 else

760 {

761 if (issue != saveIssue +1)

762 {

763 DistributeCycles(tid , storeLIP , cycles , lastInstruction.key , false , true , ←↩key);

764 }

765 else

766 {

767 DistributeCycles(tid , storeLIP , 1, make_pair (0,0), false , false);

768 }

769 }

770 }

771772 // ================================================

773 // When are the destiny register written?

774 // ================================================

775776 for (UINT32 j = 1; j < insInfo.dstReg [0]; j++)

777 {

778 REG_FILE :: iterator reg = state.regFile[realIndex ].find(insInfo.dstReg[j]);

779 if (reg == state.regFile[realIndex ].end())

780 {

781 REG_FILE_STATE_t regInfo;

782 reg = state.regFile[realIndex ]. insert(make_pair(insInfo.dstReg[j],regInfo)).←↩first;

783 }

784 reg ->second.cycle = issue + ins ->second.cycles + insInfo.latency;

785 reg ->second.PC = key;

786787 // Copy breakdown cycles if neccesary

788 if ( insInfo.insType.test(INS_TYPE_MEM) )

789 {

790 // Tlb

791 for (INT32 level = 0; level < KNC_TLB_LVLS; level ++)

792 reg ->second.breakdownTLB[level] = ins ->second.breakdownTLB[level];

793 // Cache

794 for (INT32 level = 0; level < KNC_CACHE_LVLS; level ++)

795 reg ->second.breakdownCACHE[level] = ins ->second.breakdownCACHE[level];

796 }

797798 }

799800 // ================================================

801 // Save last memory access if any

802 // ================================================

803 if (insInfo.insType.test(INS_TYPE_MEM) )

804 {

805 // Tlb

APENDICE A. MODULO CORE 13

806 for (INT32 level = 0; level < KNC_TLB_LVLS; level ++)

807 state.memory[realIndex ]. breakdownTLB[level] = ins ->second.breakdownTLB[level←↩];

808 // Cache

809 for (INT32 level = 0; level < KNC_CACHE_LVLS; level ++)

810 state.memory[realIndex ]. breakdownCACHE[level] = ins ->second.breakdownCACHE[←↩level];

811 }

812 }

813814 // ========================================================================

815 // bbl expected to be != NULL

816817 void CORE:: Pipeline(UINT32 tid , BBInfo *bbl)

818 {

819 UINT32 realIndex = AdaptThreadID(tid);

820821 // ==================================================

822 // Dependencies control

823 // ==================================================

824 // BBL info is updated with no memory instructions (tlbLevelHit -1 / ←↩cacheLevelHit -1)

825 map <UINT64 , INS_FOOT_PRINT_t >:: iterator lastIns = ++ Footprint.find(bbl ->←↩EndAddress);

826827 for(FOOT_PRINT :: iterator it = Footprint.find(bbl ->StartAddress);

828 it != lastIns; it++)

829 CreateBBLEntry(realIndex , it->first , -1, -1);

830831 // Lets travel through all the instructions of the block

832 for (BBL_STATE :: iterator ins = state.bbl[realIndex ].begin ();

833 ins != state.bbl[realIndex ].end(); ins++)

834 {

835 InsertInPipeline(tid ,ins);

836 }

837 }

838839 // ========================================================================

840841 COUNTER CORE:: GetGlobalCycles( UINT32 tid )

842 {

843 UINT32 realIndex = AdaptThreadID(tid);

844 return stats.stats_glb[realIndex ]. cycles;

845 }

846847 // ========================================================================

848849 string CORE:: PrintGlobalStats( UINT32 tid )

850 {

851 UINT32 realIndex = AdaptThreadID(tid);

852 stringstream output;

853854 for (UINT32 stall = 0; stall < INS_STALL_NUM; stall ++)

855 {

856 output << stats.stats_glb[realIndex ]. breakdownSTALLS[stall] << ",";

857 }

858 for (UINT32 level = 0; level < KNC_TLB_LVLS; level ++)

859 {

860 output << stats.stats_glb[realIndex ]. breakdownTLB[level] << ",";

861 }

862 for (UINT32 level = 0; level < KNC_CACHE_LVLS; level ++)

863 {

864 if (level == KNC_CACHE_lvls - 1)

865 output << stats.stats_glb[realIndex ]. breakdownCACHE[level ];

866 else

867 output << stats.stats_glb[realIndex ]. breakdownCACHE[level] << ",";

868 }

869 return output.str();

870 }

871

APENDICE A. MODULO CORE 14

872 // ========================================================================

873874 void CORE:: SetBytesLoaded( UINT32 tid , UINT64 lip , UINT32 size )

875 {

876 UINT32 realIndex = AdaptThreadID(tid);

877878 // As stats is a vector of jmaps , the map for the lip specified

879 // may not exists the first time this lip is encountered for

880 // the thread tid

881882 STATS_INS_t :: iterator it = stats.stats_ins[realIndex ].find(lip);

883884 if (it == stats.stats_ins[realIndex ].end())

885 {

886 STATS_INS_s data;

887 it = stats.stats_ins[realIndex ]. insert(make_pair(lip , data)).first;

888 }

889890 it->second.bytesLoaded += size;

891 }

892893 // ========================================================================

894895 void CORE:: SetMemorySetup(

896 UINT32 numLevelsTLB ,

897 UINT32 numLevelsCACHE ,

898 UINT32 numLevelsCACHE ,

899 UINT32 *latencyTLB ,

900 UINT32 *latencyCACHE)

901 {

902 this ->latencyTLB = latencyTLB;

903 this ->latencyCACHE = latencyCACHE;

904 }

905906 // ========================================================================

907908 UINT32 CORE:: GetBytesLoaded( UINT32 tid , UINT64 lip )

909 {

910 UINT32 realIndex = AdaptThreadID(tid);

911912 STATS_INS_t :: iterator it = stats.stats_ins[realIndex ].find(lip);

913 if (it != stats.stats_ins[realIndex ].end())

914 return stats.stats_ins[realIndex ].find(lip)->second.bytesLoaded;

915 else

916 return 0xffffffff;

917 }

918