1
Pipelined Implementation
2
Outline
• Handle Control Hazard
• Special cases
• Suggested Reading 4.5
3
Control Dependence
• Example:loop:
subl %edx, %ebx
jne targ
irmovl $10, %edx
jmp loop
targ:
halt
• The jne instruction creates a control dependency
  – Which instruction will be executed?
4
Branch Misprediction Example
# demo-j.ys
0x000:    xorl %eax,%eax
0x002:    jne t              # Not taken
0x007:    irmovl $1, %eax    # Fall through
0x00d:    nop
0x00e:    nop
0x00f:    nop
0x010:    halt
0x011: t: irmovl $3, %edx    # Target (Should not execute)
0x017:    irmovl $4, %ecx    # Should not execute
0x01d:    irmovl $5, %edx    # Should not execute
–Should only execute first 7 instructions
5
Select PC
int F_predPC = [
f_icode in {IJXX, ICALL} : f_valC;
1: f_valP;
];
1: 0x000: xorl %eax,%eax
2: 0x002: jne T              # Not taken
3: 0x00e: irmovl $2, %edx    # T
4: 0x014: irmovl $3, %ebx    # T+1
5: 0x007: irmovl $1, %eax    # Fall Through
6
Branch Misprediction Trace
Incorrectly execute two instructions at branch target
1 2 3 4 5 6 7 8 9
F D E MWF D E M
W
F D E M WF D E M W
F D E M WF D E M W
Cycle 5
Execute: e_valE = 2, E_dstE = %edx
Memory: M_Cnd = 0, M_valA = 0x007
Decode: D_valC = 3, D_dstE = %ebx
Fetch: f_valC = 1, rB = %eax
# demo-j.ys
0x000:    xorl %eax,%eax
0x002:    jne T              # Not taken
0x007:    irmovl $1, %eax
0x00d:    halt
0x00e: T: irmovl $2, %edx
0x014:    irmovl $3, %ebx
0x01a:    halt
7
Select PC
int f_PC = [
#mispredicted branch. Fetch at incremented PC
M_icode == IJXX && !M_Cnd : M_valA;
#completion of RET instruction
W_icode == IRET : W_valM;
#default: Use predicted value of PC
1: F_predPC
];
8
Return Example
# demo-ret.ys
0x000:        irmovl Stack,%esp  # Initialize stack pointer
0x006:        nop                # Avoid hazard on %esp
0x007:        nop
0x008:        nop
0x009:        call p             # Procedure call
0x00e:        irmovl $5,%esi     # Return point
0x014:        halt
0x020:        .pos 0x20
0x020: p:     nop                # procedure
0x021:        nop
0x022:        nop
0x023:        ret
0x024:        irmovl $1,%eax     # Should not be executed
0x02a:        irmovl $2,%ecx     # Should not be executed
0x030:        irmovl $3,%edx     # Should not be executed
0x036:        irmovl $4,%ebx     # Should not be executed
0x100:        .pos 0x100
0x100: Stack:                    # Stack: Stack pointer
• Require lots of nops to avoid data hazards
1: 0x023: ret
2: 0x024: irmovl $1,%eax    # Oops
3: 0x02a: irmovl $2,%ecx    # Oops
4: 0x030: irmovl $3,%edx    # Oops
5: 0x00e: irmovl $5,%esi    # Ret point
9
Incorrectly execute 3 instructions following ret
1 2 3 4 5 6 7 8 9
F D E MWF D E M
W
F D E M WF D E M W
F D E M WF D E M WCycle 5
Execute: e_valE = 2, E_dstE = %ecx
Memory: M_valE = 1, M_dstE = %eax
Decode: D_valC = 3, D_dstE = %edx
Fetch: f_valC = 5, rB = %esi
Write Back: W_valM = 0x00e
Incorrect Return Example
10
Handling Misprediction
# demo-j2.ys
0x000:    xorl %eax,%eax
0x002:    jne T              # Not taken
0x007:    irmovl $1, %eax
0x00d:    halt
0x00e: T: nop
0x00f:    nop
0x010:    irmovl $2, %edx
0x016:    irmovl $3, %ebx
0x01c:    halt
# demo-j.ys
0x000:    xorl %eax,%eax
0x002:    jne T              # Not taken
0x007:    irmovl $1, %eax
0x00d:    halt
0x00e: T: irmovl $2, %edx
0x014:    irmovl $3, %ebx
0x01a:    halt
11
Handling Misprediction#demo-j.ys
1: 0x000: xorl %eax, %eax
2: 0x002: jne T #Not Taken
3: 0x00e: irmovl $2, %edx # T
4: bubble
5: 0x014: irmovl $3, %ebx # T+1
6: bubble
7: 0x007: irmovl $1, %eax # Fall through
8: 0x00d: halt
• Predict branch as taken
  – Fetch 2 instructions at target
• Cancel when mispredicted
  – Detect branch not-taken in execute stage
  – On following cycle, replace instructions in execute and decode by bubbles
  – No side effects have occurred yet
1 2 3 4 5 6 7 8 9
F D E M WF D E M WF D E M WF D E M W
E M W
10
F DE M W
DF
F D E M WF D E M WF D E M WF D E M W
12
Detecting Mispredicted Branch
Condition Trigger
Mispredicted Branch
E_icode = IJXX & !e_Cnd
M
CCCC ALUALU
ALUA
ALUB
ALUfun.
Execute
Cndicode valE valA dstE dstM
E icode ifun valC valA valB dstE dstM srcA srcB
e_Cnd
13
Control for Misprediction
#demo-j.ys
1: 0x000: xorl %eax, %eax
2: 0x002: jne T #Not Taken
3: 0x00e: irmovl $2, %edx # T
4: bubble
5: 0x014: irmovl $3, %ebx # T+1
6: bubble
7: 0x007: irmovl $1, %eax # Fall through
8: 0x00d: halt
1 2 3 4 5 6 7 8 9
F D E M WF D E M WF D E M WF D E M W
E M W
10
F DE M W
DF
F D E M WF D E M WF D E M WF D E M W
Condition F D E M W
Mispredicted Branch
normal
bubble
bubble
normal
normal
14
E
M
W
F
D rB
srcAsrcB
icode valE valM dstE dstM
Bchicode valE valA dstEdstM
icode ifun valC valA valBdstEdstM srcAsrcB
valC valPicode ifun rA
predPC
E_ icode
Pipecontrollogic
E_bubble
D_bubble
15
Return Example
– Previously executed three additional instructions
# demo-retB.ys
0x000:        irmovl Stack,%esp  # Initialize stack pointer
0x006:        call p             # Procedure call
0x00b:        irmovl $5,%esi     # Return point
0x011:        halt
0x020:        .pos 0x20
0x020: p:     irmovl $-1,%edi    # procedure
0x026:        ret
0x027:        irmovl $1,%eax     # Should not be executed
0x02d:        irmovl $2,%ecx     # Should not be executed
0x033:        irmovl $3,%edx     # Should not be executed
0x039:        irmovl $4,%ebx     # Should not be executed
0x100:        .pos 0x100
0x100: Stack:                    # Stack: Stack pointer
# demo-retB
1: 0x026: ret
2: bubble
3: bubble
4: bubble
5: 0x00b: irmovl $5, %esi    # Return
16
Correct Return Example
• As ret passes through pipeline, stall at F stage
  – While ret is in the D, E, and M stages
  – Fetch the same instruction after ret 3 times
• Inject bubble into D stage
• Release stall when ret reaches W stage
F D E MWF D E M
W
F D E M WF D E M W
F D E M WF D E M W
•••
1 2 3 4 5 6 7 8 9
Cycle 5
Fetch: f_valC = 5, rB = %esi
Write Back: W_valM = 0x00b
17
M
D
Registerfile
Registerfile
CCCC ALUALU
rB
dstE dstM
ALUA
ALUB
srcA srcB
ALUfun.
Decode
Execute
A B M
E
Cndicode valE valA dstE dstM
E icode ifun valC valA valB dstE dstM srcA srcB
valC valPicode ifun rA
d_srcBd_srcA
e_Cnd
Sel+FwdA
FwdB
Detecting Return
18
Control for Return
Condition F D E M W
Processing ret stall bubble
normal
normal
normal
Condition Trigger
Processing ret IRET in { D_icode, E_icode, M_icode }
# demo-retB
1: 0x026: ret
2: bubble
3: bubble
4: bubble
5: 0x00b: irmovl $5, %esi    # Return
F D E MWF D E M
W
F D E M WF D E M W
F D E M WF D E M W
1 2 3 4 5 6 7 8 9
19
E
M
W
F
D rB
icode valE valM dstE dstM
Cndicode valE valA dstEdstM
icodeifun valC valA valB dstEdstM srcAsrcB
valC valPicodeifun rA
predPC
D_icode
E_icode
M_icode
Pipecontrollogic
D_bubble
F_stall
20
Control Cases
• DetectionCondition Trigger
Processing ret IRET in { D_icode, E_icode, M_icode }
Load/Use Hazard E_icode in { IMRMOVL, IPOPL } && E_dstM in { d_srcA, d_srcB }
Mispredicted Branch
E_icode = IJXX & !e_Cnd
Condition F D E M W
Processing ret stall bubble
normal
normal
normal
Load/Use Hazard stall stall bubble
normal
normal
Mispredicted Branch
normal
bubble
bubble
normal
normal
• Action
21
E
M
W
F
D
CCCC
rB
srcAsrcB
icode valE valM dstE dstM
Cndicode valE valA dstEdstM
icodeifun valC valA valB dstEdstM srcAsrcB
valC valPicodeifun rA
predPC
d_srcBd_srcA
e_Cnd
D_icode
E_icode
M_icode
E_dstM
Pipecontrollogic
D_bubble
D_stall
E_bubble
F_stall
22
Implementing Pipeline Control
• Combinational logic generates pipeline control signals
• Action occurs at start of following cycle
23
Initial Version of Pipeline Control
bool F_stall =
    # Conditions for a load/use hazard
    E_icode in { IMRMOVL, IPOPL } && E_dstM in { d_srcA, d_srcB } ||
    # Stalling at fetch while ret passes through pipeline
    IRET in { D_icode, E_icode, M_icode };

bool D_stall =
    # Conditions for a load/use hazard
    E_icode in { IMRMOVL, IPOPL } && E_dstM in { d_srcA, d_srcB };
24
Initial Version of Pipeline Control
bool D_bubble =
    # Mispredicted branch
    (E_icode == IJXX && !e_Cnd) ||
    # Stalling at fetch while ret passes through pipeline
    IRET in { D_icode, E_icode, M_icode };

bool E_bubble =
    # Mispredicted branch
    (E_icode == IJXX && !e_Cnd) ||
    # Load/use hazard
    E_icode in { IMRMOVL, IPOPL } && E_dstM in { d_srcA, d_srcB };
25
Control Combinations
– Special cases that can arise on same clock cycle
• Combination A
  – Not-taken branch
  – ret instruction at branch target
• Combination B
  – Instruction that reads from memory to %esp
  – Followed by ret instruction
LoadEUseD
M
Load/use
JXXED
M
Mispredict
JXXED
M
Mispredict
EretD
M
ret1
retEbubbleD
M
ret2
bubbleEbubbleD
retM
ret3
EretD
M
ret1
EretD
M
ret1
retEbubbleD
M
ret2
retEbubbleD
M
ret2
bubbleEbubbleD
retM
ret3
bubbleEbubbleD
retM
ret3
Combination B
Combination A
26
JXXED
M
Mispredict
JXXED
M
Mispredict
EretD
M
ret1
EretD
M
ret1
EretD
M
ret1
Combination A
Condition F D E M W
Processing ret stall bubble normal normal normal
Mispredicted Branch
normal bubble bubble normal normal
Combination stall bubble bubble normal normal
F
memoryInstructionmemory
PCPCincrement
SelectPC
Fetch M_valA
W_valM
f_PC
PC
predPC
Control Combination A
27
Control Combination A
• Should handle as mispredicted branch
• Stalls F pipeline register
• But PC selection logic will be using M_valA anyhow
Condition F D E M W
Processing ret stall bubble normal normal normal
Mispredicted Branch
normal bubble bubble normal normal
Combination stall bubble bubble normal normal
28
Control Combination B
• Would attempt to bubble and stall pipeline register D
• Signaled by processor as pipeline error
LoadEUseD
M
Load/use
EretD
ME
retD
Mret
EretD
M
Combination B
Condition F D E M W
Processing ret stall bubble normal normal normal
Load/Use Hazard stall stall bubble normal normal
Combination stall bubble + stall
bubble normal normal
29
Handling Control Combination B
• Load/use hazard should get priority
• ret instruction should be held in decode stage for additional cycle
Condition F D E M W
Processing ret stall bubble normal normal normal
Load/Use Hazard stall stall bubble normal normal
Combination stall stall bubble normal normal
30
Corrected Pipeline Control Logic
Condition F D E M W
Processing ret stall bubble normal normal normal
Load/Use Hazard stall stall bubble normal normal
Combination stall stall bubble normal normal
bool D_bubble =
    # Mispredicted branch
    (E_icode == IJXX && !e_Cnd) ||
    # Stalling at fetch while ret passes through pipeline
    IRET in { D_icode, E_icode, M_icode }
    # but not condition for a load/use hazard
    && !(E_icode in { IMRMOVL, IPOPL } && E_dstM in { d_srcA, d_srcB });
31
Pipeline Summary
• Data Hazards
  – Most handled by forwarding
    • No performance penalty
  – Load/use hazard requires one cycle stall
• Control Hazards
  – Cancel instructions when a mispredicted branch is detected
    • Two clock cycles wasted
  – Stall fetch stage while ret passes through pipeline
    • Three clock cycles wasted
32
Pipeline Summary
• Control Combinations
  – Must analyze carefully
  – First version had subtle bug
    • Only arises with unusual instruction combination