+ All Categories
Home > Documents > Estudio de utilización efectiva de procesadores vectoriales · Universidad de Las Palmas de Gran...

Estudio de utilización efectiva de procesadores vectoriales · Universidad de Las Palmas de Gran...

Date post: 04-Oct-2018
Category:
Upload: dinhthu
View: 216 times
Download: 0 times
Share this document with a friend
15
Universidad de Las Palmas de Gran Canaria. Estudio de utilización efectiva de procesadores vectoriales. Módulo adherido al simulador. Laura Autón García. Tutores: Francisca Quintana Domínguez, Roger Espasa Sans. Las Palmas de Gran Canaria, 17 de abril de 2014
Transcript

Universidad de Las Palmas de Gran Canaria

Estudio de utilización efectiva de procesadores vectoriales

Módulo adherido al simulador

Laura Autón García

Tutores: Francisca Quintana Domínguez, Roger Espasa Sans

Las Palmas de Gran Canaria, 17 de abril de 2014

Apéndice A

Módulo CORE

1 #ifndef CORE_H

2 #define CORE_H

34 #include "common.h"

5 #include <map >

6 #include <bitset >

78 extern PIN_MUTEX printLock;

// Debug printing helper.
// With DEBUG_DEP defined, LOCK_PRINT(X) streams X to cout while holding
// printLock, so prints issued through this macro are serialized. Without
// DEBUG_DEP it expands to nothing.
// The expansion is wrapped in braces so that lock/print/unlock stay together
// when the macro is used as the body of an unbraced `if` or loop.
#ifdef DEBUG_DEP
#define LOCK_PRINT(X)                  \
    {                                  \
        PIN_MutexLock( &printLock );   \
        cout << X;                     \
        PIN_MutexUnlock( &printLock ); \
    }
#else
#define LOCK_PRINT(X)
#endif

// Number of TLB levels tracked in the per-level breakdown arrays.
// Can be overridden from the build command line.
#ifndef KNC_TLB_LVLS
#define KNC_TLB_LVLS 3
#endif

// Number of cache levels tracked in the per-level breakdown arrays.
// Can be overridden from the build command line.
#ifndef KNC_CACHE_LVLS
#define KNC_CACHE_LVLS 3
#endif

// ========================================================================
// Global footprint with common info for all threads/application
// ========================================================================

// Instruction classes. Each enumerator is used as a bit position in
// INS_FOOT_PRINT_t::insType, so one instruction may carry several flags
// (for example INS_TYPE_MEM together with INS_TYPE_V_VECTOR).
typedef enum
{
    INS_TYPE_NONVPU,
    INS_TYPE_V_VECTOR,
    INS_TYPE_V_SCALAR,
    INS_TYPE_MEM,
    INS_TYPE_NUM        // Number of classes; width of the insType bitset
} INS_TYPE_t;

3940 typedef struct

41 {

42 bitset <INS_TYPE_NUM > insType;

43 UINT32 latency;

44 UINT32 insSize;

45 UINT32 srcReg [6];

46 UINT32 dstReg [6];

47 string disassemble;

48 UINT64 routine;

49 }INS_FOOT_PRINT_t;

50

1

APENDICE A. MODULO CORE 2

51 typedef map <UINT64 , INS_FOOT_PRINT_t > FOOT_PRINT;

52 extern FOOT_PRINT Footprint;

5354 // ========================================================================

55 // Basic Block state: Tracks basic block detailed instruction breakdown

56 // ========================================================================

5758 typedef struct BBL_STATE_t

59 {

60 // Last level accesed to get the required data

61 INT32 tlbLevelHit;

62 INT32 cacheLevelHit;

6364 // Breakdown of cycles accumulated

65 UINT32 breakdownTLB[KNC_TLB_LVLS ];

66 UINT32 breakdownCACHE[KNC_CACHE_LVLS ];

6768 // Sumatory of breakdowns

69 UINT32 cycles;

7071 BBL_STATE_t(INT32 tlbHit , INT32 cacheHit):

72 tlbLevelHit(tlbHit),

73 cacheLevelHit(cacheHit),

74 breakdownTLB (),

75 breakdownCACHE (),

76 cycles (0){}

7778 }BBL_STATE_t;

7980 typedef std::pair <UINT64 , UINT32 > BBL_ENTRY_KEY;

81 typedef map <BBL_ENTRY_KEY , BBL_STATE_t > BBL_STATE;

8283 // ========================================================================

84 // Register File: Tracks register accesses

85 // ========================================================================

8687 typedef struct

88 {

89 // Cycle in which data in register will be available

90 COUNTER cycle;

9192 // Las instruction that wrote the register

93 BBL_ENTRY_KEY PC;

9495 // Breakdown of cycles if load

96 UINT32 breakdownTLB[KNC_TLB_LVLS ];

97 UINT32 breakdownCACHE[KNC_CACHE_LVLS ];

9899 }REG_FILE_STATE_t;

100101 typedef map <UINT32 , REG_FILE_STATE_t > REG_FILE;

102103 // ========================================================================

104 // State: lastest state of simulation

105 // ========================================================================

106107 typedef struct STATE

108 {

109 REG_FILE *regFile;

110 BBL_STATE *bbl;

111112 // Memory access breakdown of last instruction of each thread

113 struct MEMORY_STATE{

114 UINT32 breakdownTLB[KNC_TLB_LVLS ];

115 UINT32 breakdownCACHE[KNC_CACHE_LVLS ];

116 MEMORY_STATE (): breakdownTLB (), breakdownCACHE (){};

117 }* memory;

118119 // SHARED

120 COUNTER issue;

APENDICE A. MODULO CORE 3

121 COUNTER wBackMemory;

122123 // Information of last instruction that used pipeline

124 struct LAST_INS{

125 UINT32 tid;

126 BBL_ENTRY_KEY key;

127 LAST_INS(UINT32 tid=0, BBL_ENTRY_KEY key=make_pair (0,0)): tid(tid), key(key)←↩{};

128 }lastInstruction;

129130 STATE(): issue (0), wBackMemory (0);

131132 }STATE;

// ========================================================================
// Stats: accumulated stats of simulation. Both by ins and global
// ========================================================================

// Stall categories; used as indices into the breakdownSTALLS arrays.
typedef enum
{
    INS_STALL_ISSUE,
    INS_STALL_NONVPU,
    INS_STALL_V_SCALAR,
    INS_STALL_V_VECTOR,
    INS_STALL_NUM       // Number of categories; size of breakdownSTALLS
} INS_STALL_t;

// Selects which breakdown array a cycle count is accumulated into.
typedef enum
{
    STALLS_ENTRY,
    TLB_ENTRY,
    CACHE_ENTRY
} BREAKDOWN_t;

153154 typedef struct STATS_INS_s

155 {

156 // Bytes loaded from load instructions

157 UINT32 bytesLoaded;

158159 // Sumatory of breakdown

160 UINT32 cycles;

161162 // Breakdown of stalls accumulated

163 COUNTER breakdownSTALLS[INS_STALL_NUM ];

164165 // Breakdown of cycles accumulated

166 COUNTER breakdownTLB[KNC_TLB_LVLS ];

167 COUNTER breakdownCACHE[KNC_CACHE_LVLS ];

168169 STATS_INS_s (): bytesLoaded (0), cycles (0),

170 breakdownSTALLS (),

171 breakdownTLB (),

172 breakdownCACHE (){}

173174 }STATS_INS_s;

175176 typedef map <UINT64 , STATS_INS_s > STATS_INS_t;

177178 typedef struct STATS_GLB_t

179 {

180 // Sumatory breakdown

181 COUNTER cycles;

182183 // Breakdown of stalls accumulated

184 COUNTER breakdownSTALLS[INS_STALL_NUM ];

185186 // Breakdown of cycles accumulated

187 COUNTER breakdownTLB[KNC_TLB_LVLS ];

188 COUNTER breakdownCACHE[KNC_CACHE_LVLS ];

189

APENDICE A. MODULO CORE 4

190 STATS_GLB_t (): cycles (0),

191 breakdownSTALLS (),

192 breakdownTLB (),

193 breakdownCACHE (){}

194195 }STATS_GLB_t;

196197 typedef struct

198 {

199 STATS_INS_t *stats_ins;

200 STATS_GLB_t *stats_glb;

201 } STATS;

202203 // ========================================================================

204 // CORE: Class with the context structures for every core

205 // ========================================================================

206207 class CORE{

208 PIN_MUTEX pipelineLock;

209210 STATE state;

211 STATS stats;

212213 // How many threads

214 UINT32 nThreads;

215 UINT32 coreID;

216217 // Pointers to latency information

218 UINT32 *latencyTLB;

219 UINT32 *latencyCACHE;

220221 void InsertInPipeline(

222 UINT32 tid ,

223 BBL_STATE :: iterator ins );

224225 UINT32 GetCacheLatency( BBL_ENTRY_KEY key );

226227 void DistributeCycles(

228 UINT32 tid ,

229 UINT64 storeLIP ,

230 COUNTER cycles ,

231 BBL_ENTRY_KEY culprit ,

232 bool regStall ,

233 bool memStall ,

234 BBL_ENTRY_KEY currentIP = make_pair (0,0),

235 INT32 regDependency = -1);

236237 void InsertBreakdownStats(

238 UINT32 tid ,

239 STATS_INS_t ::iterator ,

240 UINT32 cycles ,

241 BREAKDOWN_t breakdown ,

242 UINT32 index);

243244 inline UINT32 AdaptThreadID( UINT32 tid )

245 {

246 return tid % nThreads;

247 }

248249 public:

250251 CORE(UINT32 coreID , UINT32 nThreads){

252 // State fields

253 state.regFile = new REG_FILE[ nThreads ];

254 state.bbl = new BBL_STATE[ nThreads ];

255 state.memory = new STATE :: MEMORY_STATE[ nThreads ];

256257 // Stats fields

258 state.stats_ins = new STATS_INS_t[ nThreads ];

259 stats.stats_glb = new STATS_GLB_t[ nThreads ];

APENDICE A. MODULO CORE 5

260261 this ->nThreads = nThreads;

262 }

263264 ~CORE(){

265 // Delete Stats stuff and State stuff

266 delete [] state.regFile;

267 delete [] state.memory;

268269 for (UINT32 t = 0; t < nThreads; t++)

270 DestroyBBLData(t);

271272 delete [] state.bbl;

273 delete [] stats.stats_ins;

274 delete [] stats.stats_glb;

275 }

276277 // Funtions that operate on whole CORE

278 void SetMemorySetup(

279 UINT32 numLevelsTLB ,

280 UINT32 numLevelsCACHE ,

281 UINT32 *latencyTLB ,

282 UINT32 *latencyCACHE);

283284 // Functions that operate on STATE

285 void CreateBBLEntry(

286 UINT32 tid ,

287 UINT64 lip ,

288 INT32 tlbLevelHit ,

289 INT32 cacheLevelHit);

290291 void DestroyBBLData( UINT32 tid );

292293 void DestroyStats( UINT32 tid );

294295 void Pipeline( UINT32 tid , BBInfo *bbl );

296297 COUNTER GetGlobalCycles( UINT32 tid );

298299 string PrintGlobalStats( UINT32 tid );

300301 // Functions that operate on STATS

302 void SetBytesLoaded(

303 UINT32 tid ,

304 UINT64 lip ,

305 UINT32 size);

306307 UINT32 GetBytesLoaded( UINT32 tid , UINT64 lip );

308309 };

310311 extern CORE *CoreArray[MAX_EXPERIMENTS ][ MAX_NUM_THREADS ];

312313 inline UINT32 GetCoreID( UINT32 tid , UINT32 ShiftAmount )

314 {

315 UINT32 coreID = (tid >> ShiftAmount);

316 return coreID;

317 }

318319 // ========================================================================

320321 void SaveBBL( UINT32 ThreadID , BBInfo* BB)

322 {

323 ThreadStats[ThreadID ].BB = ThreadStats[ThreadID ]. prevBB;

324 if (bbinfo != NULL) ThreadStats[ThreadID ]. prevBB = bbinfo;

325 }

326327 void Pipeline ( UINT32 ThreadID )

328 {

329 if (dependencyControl && ThreadStats[ThreadID ].BB != NULL

APENDICE A. MODULO CORE 6

330 {

331 for (UINT32 exp = 0; exp < MAX_EXPERIMENTS; exp++ )

332 {

333 UINT32 coreID = 0;

334335 if (MAX_NUM_THREADS > 1 )

336 {

337 coreID = GetCoreID( ThreadID , ShiftAmount[exp ][0] );

338 }

339340 CORE *corePtr = CoreArray[exp][ coreID ];

341 corePtr ->Pipeline( ThreadID , THreadStats[threadID ].BB );

342 corePtr ->DestroyBBLData( ThreadID );

343 }

344 }

345 }

346347 void Instruction(INS ins)

348 {

349 if (Footprint.find(INS_Address(ins)) == Footprint.end())

350 {

351 INS_FOOT_PRINT_t instruction = {}

352 instruction.routine = RTN_Address(INS_Rtn(ins)):

353 instruction.insSize = INS_Size (ins);

354355 if ( dependencyControl )

356 {

357 instruction.disassemble = INS_Disassemble(ins);

358359 // What type of Instruction?

360 if ( INS_IsLoadOp(ins) )

361 {

362 instruction.insType.flip(INS_TYPE_MEM);

363 if (INS_IsVector(ins))

364 {

365 if (INS_IsScalar(ins))

366 instruction.insType.flip(INS_TYPE_V_SCALAR);

367 else

368 instruction.insType.flip(INS_TYPE_V_VECTOR);

369 }

370 else

371 {

372 instruction.insType.flip(INS_TYPE_NONVPU);

373 instruction.latency = NONVPU :: latency;

374 }

375 }

376 else

377 {

378 if (IsMemInstruction(ins))

379 instruction.insType.flip(INS_TYPE_MEM);

380 else if (INS_IsVector(ins))

381 {

382 if (INS_IsScalar(ins))

383 instruction.insType.flip(INS_TYPE_V_SCALAR);

384 else

385 instruction.insType.flip(INS_TYPE_V_VECTOR);

386387 instruction.latendy = GetLatencyByIclass(ins);

388 }

389 else

390 {

391 instruction.insType.flip(INS_TYPE_NONVPU);

392 instruction.latency = NONVPU :: latency;

393 }

394 }

395 }

396 }

397 }

398399 void SplitBlocks ()

APENDICE A. MODULO CORE 7

400 {

401 map <pair <UINT64 , UINT64 >, COUNTER > Worklist;

402 pair <UINT64 , UINT64 > el1 , el2;

403404 for (list <const BBInfo *>:: iterator bi = BBInfoList.begin (); bi != BBInfoList.←↩

end(); bi++)

405 {

406 COUNTER totalCountBBLbyTID = 0;

407408 for (UINT32 tid = 0; tid <= maxThreadID; tid++)

409 {

410 totalCountBBLbyTID += (*bi)->_counter[tid];

411 }

412413 WorkList[pair <UINT64 , UINT64 >((*bi)->StartAddress ,(*bi)->EndAddress)] += ←↩

totalCountBBLbyTID;

414 }

415416 WorkList[pair <UINT64 , UINT64 >(-1, -1)] = 0;

417418 while (WorkList.size() > 1)

419 {

420 el1 = WorkList.begin()->first;

421 el2 = (++ WorkList.begin ())->first;

422423 if (el1.second < el2.first)

424 {

425 BBInfoMap[el1] = WorkList[el1];

426 WOrkList.erase(el1);

427 }

428 else

429 {

430 if (el1.first == el2.first && el1.second < el2.second)

431 {

432 pair <UINT64 ,UINT64 > newel1 = el1;

433 pair <UINT64 ,UINT64 > newel2 = make_pair ((++ Footprint.find(el1.second))->←↩first ,el2.second);

434435 WorkList[newel1] += WorkList[el2];

436 WorkList[newel2] += WorkList[el2];

437438 WorkList.erase(el2);

439 }

440 else if (el1.first < el2.first && ( el1.second > el2.second ||

441 el1.second == el2.second || el1.second < el2.second ))

442 {

443 pair <UINT64 ,UINT64 > newel1 = make_pair(el1.first ,(--Footprint.find(el2.←↩first))->first);

444 pair <UINT64 ,UINT64 > newel2 = make_pair(el2.first , el1.second);

445446 WorkList[newel1] = WorkList[el1];

447 WorkList[newel2] = WorkList[el1];

448449 WorkList.erase(el1);

450 }

451 else

452 {

453 assert (1);

454 }

455 }

456 }

457 }

458459 #endif /* CORE_H */

460461 #include "core.h"

462 #include <sstream >

463464 FOOT_PRINT Footprint;

465 CORE *CoreArray[MAX_EXPERIMENTS ][ MAX_NUM_THREADS ];

APENDICE A. MODULO CORE 8

466 PIN_MUTEX printLock;

467468 // ========================================================================

469470 void CORE:: CreateBBLEntry (

471 UINT32 tid ,

472 UINT64 lip ,

473 INT32 tlbLevelHit ,

474 INT32 cacheLevelHit)

475 {

476 UINT32 realIndex = AdaptThreadID(tid);

477478 // First , prepare the structure

479 BBL_ENTRY_KEY key = make_pair(lip ,0);

480 BBL_STATE_t bblInfo(tlbLevelHit , cacheLevelHit);

481482 // When a bbl with cache information , key and breadowns are updated ←↩

accordingly

483 if ( tlbLevelHit != -1 || cacheLevelHit != -1 )

484 {

485 // If split or gather instruction , modify key

486 BBL_STATE :: reverse_iterator cait = sate.bbl[realIndex ]. rbegin ();

487 if (cait != state.bbl[realIndex ].rend() && cait ->first.first == lip)

488 key = make_pair(lip ,cait ->first.second +1);

489490 // Getting cycles from TLB

491492 for (INT32 i = 0; i <= tlbLevelHit; i++)

493 {

494 bblInfo.breakdownTLB[i] += latencyTLB[i];

495 bblInfo.cycles += latencyTLB[i];

496 }

497498 // Getting cycles from the CACHE

499500 bblInfo.cycles += latencyCACHE[cacheLevelHit ];

501 for (INT32 i = 0; i <= cacheLevelHit; i++)

502 {

503 if (!i)

504 bblInfo.breakdownCACHE[i] += latencyCACHE[i];

505 else

506 bblInfo.breakdownCACHE[i] += latencyCACHE[i] - latencyCACHE[i-1];

507 }

508 }

509510 state.bbl[realIndex ]. insert( make_pair(key ,bblInfo) );

511 }

512513 // ========================================================================

514515 void CORE:: DestroyBBLData( UINT32 tid )

516 {

517 UINT32 realIndex = AdaptThreadID(tid);

518 state.bbl[realIndex ].clear ();

519 }

520521 // ========================================================================

522523 void CORE:: InsertBreakdownStats(

524 UINT32 tid ,

525 STATS_INS_t :: iterator entry ,

526 UINT32 cycles ,

527 BREAKDOWN_t breakdown ,

528 UINT32 index)

529 {

530 UINT32 realIndex = AdaptThreadID(tid);

531532 switch(breakdown)

533 {

534 case STALLS_ENTRY:

APENDICE A. MODULO CORE 9

535 entry ->second.breakdownSTALLS[index] += cycles;

536 stats.stats_glb[realIndex ]. breakdownSTALLS[index] += cycles;

537 break;

538539 case CACHE_ENTRY:

540 entry ->second.breakdownCACHE[index] += cycles;

541 stats.stats_glb[realIndex ]. breakdownCACHE[index] += cycles;

542543 case TLB_ENTRY:

544 entry ->second.breakdownTLB[index] += cycles;

545 stats.stats_glb[realIndex ]. breakdownTLB[index] += cycles;

546 }

547548 entry ->second.cycles += cycles;

549 stats.stats_glb[realIndex ]. cycles += cycles;

550551 }

552 }

553554 // ========================================================================

555556 void CORE:: DistributeCycles(

557 UINT32 tid ,

558 UINT64 storeLIP ,

559 COUNTER cycles ,

560 BBL_ENTRY_KEY culprit ,

561 bool regStall ,

562 bool memStall ,

563 BBL_ENTRY_KEY currentIP ,

564 INT32 regDependency )

565 {

566 UINT32 realIndex = AdaptThread(tid);

567 UINT32 storeCycles = 0;

568569 // Lets get sure the storage does exist. If not , create.

570 STATS_INS_t :: iterator storage = stats.stats_ins[realIndex]-find(storeLIP);

571 if (storage == stats.stats_ins[tid].end())

572 {

573 STATS_INS_s statsInfo;

574 storage = stats.stats_ins[tid]. insert(make_pair(storeLIP ,statsInfo)).first;

575 }

576577 if ( !regStall and !memStall )

578 InsertBreakdownStats(tid , storage , cycles , STALLS_ENTRY , INS_STALL_ISSUE);

579580 if ( regStall )

581 {

582 // Get the footprint of the culprit

583 FOOT_PRINT :: iterator culpritInfo = Footprint.find(culprit.first);

584585 // Only if the dependency has nothing to do with pipeline being stalled

586 storeCycles = cycles >= culpritInfo ->second.latency ? culpritInfo ->second.←↩latency : cycles;

587588 if (culpritInfo ->second.insType.test(INS_TYPE_NONVPU))

589 {

590 InsertBreakdownStats(tid , storage , storeCycles , STALLS_ENTRY , ←↩INS_STALL_NONVPU);

591 }

592 else if (culpritInfo ->second.insType.test(INS_TYPE_V_SCALAR))

593 {

594 InsertBreakdownStats(tid , storage , storeCycles , STALLS_ENTRY , ←↩INS_STALL_V_SCALAR);

595 }

596 else if (culpritInfo ->second.insType.test(INS_TYPE_V_VECTOR))

597 {

598 InsertBreakdownStats(tid , storage , storeCycles , STALLS_ENTRY , ←↩INS_STALL_V_VECTOR);

599 }

600

APENDICE A. MODULO CORE 10

601 // Remaining cycles

602 cycles -= storeCycles;

603 }

604605 if ( memStall )

606 {

607 // Get the state info of the culprit

608 // To be taken into account: if there is a culprit , the stall is going to be

609 // with an instruction of the same thread (registers are not shared)

610 BBL_STATE :: iterator culpritState = state.bbl[realIndex ].find(culprit);

611612 // FOr special cases

613 UINT32 source = 0;

614615 if ( culpritState == state.bbl[realIndex ].end() || // Different basic block

616 culprit.first > currentIP.first || // Same basic block , different iteration

617 ( culprit.first == currentIP.first // BBL of 1 instruction , maybe splitted

618 && culprit.second >= currentIP.second ))

619 {

620 if (regDependency != -1)

621 source = 1;

622 else

623 source = 2;

624 }

625626 // Start with CACHE

627 for (INT32 level = KNC_CACHE_LVLS; level >= 0 && cycles > 0; level --)

628 {

629 UINT32 sourceCycles;

630 switch(source)

631 {

632 case 1:

633 // from regFile

634 sourceCycles = state.regFile[realIndex ].find(regDependency)->second.←↩breakdownCACHE[level];

635 break;

636 case 2:

637 // from last memory stat

638 sourceCycles = state.memory[realIndex ]. breakdownCACHE[level];

639 break;

640 default:

641 sourceCycles = culpritState ->second.breakdownCACHE[level];

642 }

643644 storeCycles = cycles >= sourceCycles ? sourceCycles : cycles;

645 InsertBreakdownStats(tid , storage , storeCycles , CACHE_ENTRY , level);

646647 // Remaining

648 cycles -= storeCycles;

649 }

650651 // Follow with TLB

652 for (INT32 level = KNC_TLB_LVLS -1; level >= 0; level --)

653 {

654 UINT32 sourceCycles;

655 switch(source)

656 {

657 case 1:

658 // from regFile

659 sourceCycles = state.regFile[realIndex ].find(regDependency)->second.←↩breakdownTLB[level ];

660 break;

661 case 2:

662 // from last memory stat

663 sourceCycles = state.memory[realIndex ]. breakdownTLB[level ];

664 break;

665 default:

666 sourceCycles = culpritState ->second.breakdownTLB[level];

667 }

668

APENDICE A. MODULO CORE 11

669 storeCycles = cycles >= sourceCycles ? sourceCycles : cycles;

670 InsertBreakdownStats(tid , storage , storeCycles , TLB_ENTRY , level);

671672 // Remaining

673 cycles -= storeCycles;

674 }

675 }

676 }

677678 // ========================================================================

679680 void CORE:: InsertInPipeline (UINT32 tid , BBL_STATE :: iterator ins)

681 {

682 UINT32 realIndex = AdaptThreadID(tid);

683684 BBL_ENTRY_KEY key = ins ->first;

685686 // Get the instruction footprint

687 INS_FOOT_PRINT_t insInfo = Footprint.find(key.first)->second;

688689 // Key to handle culprit instruction if any

690 BBL_ENTRY_KEY culprit = make_pair (0,0);

691692 // Last register read dependency

693 INT32 regDependency = 0;

694695 // Ideally , when does the instruction enter the pipeline?

696 COUNTER issue = state.issue + 1;

697 COUNTER saveIssue = state.issue;

698699 // Last instruction in pipeline

700 STATE:: LAST_INS lastInstruction = state.lastInstruction;

701702 // ====================================

703 // When are the source registers read?

704 // ====================================

705 if (key.second == 0)

706 {

707 for (UINT32 j = 1; j <= insInfo.srcReg [0]; j++)

708 {

709 REG_FILE :: iterator reg = state.regFile[realIndex ].find(insInfo.srcReg[j]);

710711 // If the register is found in RegTable , dependency spoted.

712 if (reg != state.regFile[realIndex ].end())

713 {

714 if (issue < reg ->second.cycle)

715 {

716 culprit = reg ->second.PC;

717 regDependency = reg ->first;

718 issue = max( issue , reg ->second.cycle );

719 }

720 }

721 }

722 }

723724 // ================================================

725 // Was the pipeline frozen by last ins of same TID?

726 // ================================================

727 if ( state.lastInstruction.tid == tid && issue < state.wBackMemory )

728 {

729 culprit = make_pair (0,0);

730 issue = max( Issue , state.wBackMemory );

731 }

732733 // ================================================

734 // State update before unlocking Mutex

735 // ================================================

736 state.issue = issue;

737738 if ( insInfo.insType.test(INS_TYPE_MEM) && ins ->second.cacheLevelHit > 0 )

APENDICE A. MODULO CORE 12

739 state.wBackMemory = issue + ins ->second.cycles;

740 else

741 state.wBackMemory = 0;

742743 state.lastInstruction = STATE:: LAST_INS(tid , key);

744745 PIN_MutexUnlock( &pipelineLock );

746747 // ================================================

748 // Distribution of cycles

749 // ================================================

750 if (saveIssue)

751 {

752 COUNTER cycles = issue - saveIssue;

753 UINT64 storeLIP = key.first;

754755 if (culprit.first)

756 {

757 DistributeCycles(tid , storeLIP , cycles , culprit , true , true , key , ←↩regDependency);

758 }

759 else

760 {

761 if (issue != saveIssue +1)

762 {

763 DistributeCycles(tid , storeLIP , cycles , lastInstruction.key , false , true , ←↩key);

764 }

765 else

766 {

767 DistributeCycles(tid , storeLIP , 1, make_pair (0,0), false , false);

768 }

769 }

770 }

771772 // ================================================

773 // When are the destiny register written?

774 // ================================================

775776 for (UINT32 j = 1; j < insInfo.dstReg [0]; j++)

777 {

778 REG_FILE :: iterator reg = state.regFile[realIndex ].find(insInfo.dstReg[j]);

779 if (reg == state.regFile[realIndex ].end())

780 {

781 REG_FILE_STATE_t regInfo;

782 reg = state.regFile[realIndex ]. insert(make_pair(insInfo.dstReg[j],regInfo)).←↩first;

783 }

784 reg ->second.cycle = issue + ins ->second.cycles + insInfo.latency;

785 reg ->second.PC = key;

786787 // Copy breakdown cycles if neccesary

788 if ( insInfo.insType.test(INS_TYPE_MEM) )

789 {

790 // Tlb

791 for (INT32 level = 0; level < KNC_TLB_LVLS; level ++)

792 reg ->second.breakdownTLB[level] = ins ->second.breakdownTLB[level];

793 // Cache

794 for (INT32 level = 0; level < KNC_CACHE_LVLS; level ++)

795 reg ->second.breakdownCACHE[level] = ins ->second.breakdownCACHE[level];

796 }

797798 }

799800 // ================================================

801 // Save last memory access if any

802 // ================================================

803 if (insInfo.insType.test(INS_TYPE_MEM) )

804 {

805 // Tlb

APENDICE A. MODULO CORE 13

806 for (INT32 level = 0; level < KNC_TLB_LVLS; level ++)

807 state.memory[realIndex ]. breakdownTLB[level] = ins ->second.breakdownTLB[level←↩];

808 // Cache

809 for (INT32 level = 0; level < KNC_CACHE_LVLS; level ++)

810 state.memory[realIndex ]. breakdownCACHE[level] = ins ->second.breakdownCACHE[←↩level];

811 }

812 }

813814 // ========================================================================

815 // bbl expected to be != NULL

816817 void CORE:: Pipeline(UINT32 tid , BBInfo *bbl)

818 {

819 UINT32 realIndex = AdaptThreadID(tid);

820821 // ==================================================

822 // Dependencies control

823 // ==================================================

824 // BBL info is updated with no memory instructions (tlbLevelHit -1 / ←↩cacheLevelHit -1)

825 map <UINT64 , INS_FOOT_PRINT_t >:: iterator lastIns = ++ Footprint.find(bbl ->←↩EndAddress);

826827 for(FOOT_PRINT :: iterator it = Footprint.find(bbl ->StartAddress);

828 it != lastIns; it++)

829 CreateBBLEntry(realIndex , it->first , -1, -1);

830831 // Lets travel through all the instructions of the block

832 for (BBL_STATE :: iterator ins = state.bbl[realIndex ].begin ();

833 ins != state.bbl[realIndex ].end(); ins++)

834 {

835 InsertInPipeline(tid ,ins);

836 }

837 }

838839 // ========================================================================

840841 COUNTER CORE:: GetGlobalCycles( UINT32 tid )

842 {

843 UINT32 realIndex = AdaptThreadID(tid);

844 return stats.stats_glb[realIndex ]. cycles;

845 }

846847 // ========================================================================

848849 string CORE:: PrintGlobalStats( UINT32 tid )

850 {

851 UINT32 realIndex = AdaptThreadID(tid);

852 stringstream output;

853854 for (UINT32 stall = 0; stall < INS_STALL_NUM; stall ++)

855 {

856 output << stats.stats_glb[realIndex ]. breakdownSTALLS[stall] << ",";

857 }

858 for (UINT32 level = 0; level < KNC_TLB_LVLS; level ++)

859 {

860 output << stats.stats_glb[realIndex ]. breakdownTLB[level] << ",";

861 }

862 for (UINT32 level = 0; level < KNC_CACHE_LVLS; level ++)

863 {

864 if (level == KNC_CACHE_lvls - 1)

865 output << stats.stats_glb[realIndex ]. breakdownCACHE[level ];

866 else

867 output << stats.stats_glb[realIndex ]. breakdownCACHE[level] << ",";

868 }

869 return output.str();

870 }

871

APENDICE A. MODULO CORE 14

872 // ========================================================================

873874 void CORE:: SetBytesLoaded( UINT32 tid , UINT64 lip , UINT32 size )

875 {

876 UINT32 realIndex = AdaptThreadID(tid);

877878 // As stats is a vector of jmaps , the map for the lip specified

879 // may not exists the first time this lip is encountered for

880 // the thread tid

881882 STATS_INS_t :: iterator it = stats.stats_ins[realIndex ].find(lip);

883884 if (it == stats.stats_ins[realIndex ].end())

885 {

886 STATS_INS_s data;

887 it = stats.stats_ins[realIndex ]. insert(make_pair(lip , data)).first;

888 }

889890 it->second.bytesLoaded += size;

891 }

892893 // ========================================================================

894895 void CORE:: SetMemorySetup(

896 UINT32 numLevelsTLB ,

897 UINT32 numLevelsCACHE ,

898 UINT32 numLevelsCACHE ,

899 UINT32 *latencyTLB ,

900 UINT32 *latencyCACHE)

901 {

902 this ->latencyTLB = latencyTLB;

903 this ->latencyCACHE = latencyCACHE;

904 }

905906 // ========================================================================

907908 UINT32 CORE:: GetBytesLoaded( UINT32 tid , UINT64 lip )

909 {

910 UINT32 realIndex = AdaptThreadID(tid);

911912 STATS_INS_t :: iterator it = stats.stats_ins[realIndex ].find(lip);

913 if (it != stats.stats_ins[realIndex ].end())

914 return stats.stats_ins[realIndex ].find(lip)->second.bytesLoaded;

915 else

916 return 0xffffffff;

917 }

918


Recommended